"""Source code for pica.ml.vectorizer."""
import array
from collections import defaultdict
import numpy as np
from scipy import sparse as sp
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils.fixes import sp_version
class CustomVectorizer(CountVectorizer):
    """CountVectorizer variant that tolerates a many-to-one vocabulary.

    ``CountVectorizer._validate_vocabulary`` raises an error when multiple
    keys of the vocabulary dict map to the same feature index. However, that
    is exactly what we intend with the compress_vocabulary function.
    Other functions had to be adapted as well:
    ``get_feature_names`` (allow decompression) and ``_count_vocab``
    (reduce the matrix size to the number of distinct feature indices).
    """

    def _validate_vocabulary(self):
        """Accept a user-supplied vocabulary without the base-class check
        that rejects multiple feature names encoding the same feature index.
        """
        if self.vocabulary:
            self.vocabulary_ = dict(self.vocabulary)
            self.fixed_vocabulary_ = True
        else:
            self.fixed_vocabulary_ = False

    def get_feature_names(self):
        """Array mapping from feature integer indices to feature name.

        Returns
        -------
        list of (feature_name, feature_index) tuples sorted by index.
            Note: differs from the normal ``CountVectorizer`` output (a
            plain list of names); the pairs are kept so that a compressed
            vocabulary can be decompressed by the caller.
        """
        if not hasattr(self, 'vocabulary_'):
            self._validate_vocabulary()
        self._check_vocabulary()
        # return value is different from normal CountVectorizer output:
        # maintain the (name, index) mapping instead of returning a list
        return sorted(self.vocabulary_.items(), key=lambda x: x[1])

    def _count_vocab(self, raw_documents, fixed_vocab):
        """Create sparse feature matrix, and vocabulary where fixed_vocab=False.

        Modified from the sklearn original to reduce the actual size of the
        matrix returned if compression of the vocabulary is used: the matrix
        has ``max(feature_index) + 1`` columns, which can be smaller than
        ``len(vocabulary)``.

        Parameters
        ----------
        raw_documents : iterable of str
        fixed_vocab : bool
            If True, count against ``self.vocabulary_``; otherwise a fresh
            vocabulary is built on the fly.

        Returns
        -------
        vocabulary : dict
            Mapping feature name -> feature index.
        X : scipy.sparse.csr_matrix
            Shape (n_documents, max feature index + 1).

        Raises
        ------
        ValueError
            If no vocabulary entry was produced (e.g. only stop words), or
            if 64-bit sparse indexing is required but unsupported by the
            installed scipy.
        """
        if fixed_vocab:
            vocabulary = self.vocabulary_
        else:
            # Add a new value when a new vocabulary item is seen: each
            # first access assigns the next free index.
            vocabulary = defaultdict()
            vocabulary.default_factory = vocabulary.__len__

        analyze = self.build_analyzer()
        j_indices = []
        indptr = [0]
        values = array.array(str("i"))
        for doc in raw_documents:
            feature_counter = {}
            for feature in analyze(doc):
                try:
                    feature_idx = vocabulary[feature]
                    if feature_idx not in feature_counter:
                        feature_counter[feature_idx] = 1
                    else:
                        feature_counter[feature_idx] += 1
                except KeyError:
                    # Ignore out-of-vocabulary items for fixed_vocab=True
                    continue
            j_indices.extend(feature_counter.keys())
            values.extend(feature_counter.values())
            indptr.append(len(j_indices))

        if not fixed_vocab:
            # disable defaultdict behaviour
            vocabulary = dict(vocabulary)
            if not vocabulary:
                raise ValueError("empty vocabulary; perhaps the documents only"
                                 " contain stop words")

        # BUG FIX: the original compared against 2147483648 (= 2**31, not
        # 2**31 - 1 as its comment claimed), so an nnz count of exactly
        # 2**31 would have been stored in an overflowing int32 index.
        if indptr[-1] > np.iinfo(np.int32).max:  # = 2**31 - 1
            if sp_version >= (0, 14):
                indices_dtype = np.int64
            else:
                # BUG FIX: sp_version is a tuple of ints; str.join needs
                # strings, so map(str, ...) is required to format it.
                raise ValueError(f'sparse CSR array has {indptr[-1]} non-zero '
                                 f'elements and requires 64 bit indexing, '
                                 f' which is unsupported with scipy '
                                 f'{".".join(map(str, sp_version))}. '
                                 f'Please upgrade to scipy >=0.14')
        else:
            indices_dtype = np.int32

        j_indices = np.asarray(j_indices, dtype=indices_dtype)
        indptr = np.asarray(indptr, dtype=indices_dtype)
        values = np.frombuffer(values, dtype=np.intc)

        # Modification vs. sklearn: size the matrix by the highest feature
        # index actually used (indices may be shared under compression).
        vocab_len = max(vocabulary.values()) + 1
        X = sp.csr_matrix((values, j_indices, indptr),
                          shape=(len(indptr) - 1, vocab_len),
                          dtype=self.dtype)
        X.sort_indices()
        return vocabulary, X