# Source code for pica.ml.feature_select
from time import time
from typing import List
from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from pica.structure.records import TrainingRecord
from pica.util.logging import get_logger
from pica.util.helpers import get_x_y_tn
import numpy as np
DEFAULT_STEP_SIZE = 0.0025
DEFAULT_SCORING_FUNCTION = 'balanced_accuracy'
def compress_vocabulary(records: List[TrainingRecord], pipeline: Pipeline):
    """
    Group features that store redundant information, to avoid overfitting and speed up the process (in some
    cases). Might be replaced or complemented by a feature selection method in future versions.

    Two features are considered redundant when they occur in exactly the same set of samples
    (identical non-zero pattern in the transformed matrix); such features are mapped to a single
    vocabulary index. Compressing vocabulary is optional; for the test dataset it took 30 seconds,
    while the time saved later on is not significant.

    :param records: a list of TrainingRecord objects.
    :param pipeline: the targeted pipeline where the vocabulary should be modified.
    :return: nothing, sets the vocabulary for the CountVectorizer step in-place.
    """
    X, y, tn = get_x_y_tn(records)  # we actually only need X
    vec = pipeline.named_steps["vec"]
    if not vec.vocabulary:
        # no vocabulary yet: fit the vectorizer to obtain feature names
        vec.fit(X)
        names = [name for name, i in vec.get_feature_names()]
    else:
        # recover names ordered by their current vocabulary index
        names = sorted(vec.vocabulary, key=vec.vocabulary.get)
    X_trans = vec.transform(X)

    seen = {}            # maps non-zero row pattern -> compressed index
    new_vocabulary = {}  # maps feature name -> compressed index
    new_index = 0
    for i in range(len(names)):
        column = X_trans.getcol(i).nonzero()[0]
        key = tuple(column)
        found_id = seen.get(key)
        # BUG FIX: previously `if not found_id:` — a found index of 0 was treated as
        # "not seen", so features sharing the pattern of the first feature were never
        # compressed and the `seen` entry was clobbered. Compare against None instead.
        if found_id is None:
            seen[key] = new_index
            new_vocabulary[names[i]] = new_index
            new_index += 1
        else:
            new_vocabulary[names[i]] = found_id

    # set vocabulary to vectorizer; fixed_vocabulary_ prevents re-fitting from overwriting it
    pipeline.named_steps["vec"].vocabulary = new_vocabulary
    pipeline.named_steps["vec"].vocabulary_ = new_vocabulary
    pipeline.named_steps["vec"].fixed_vocabulary_ = True
def recursive_feature_elimination(records: List[TrainingRecord], pipeline: Pipeline, step: float = DEFAULT_STEP_SIZE,
                                  n_features: int = None, random_state: np.random.RandomState = None):
    """
    Apply recursive feature elimination (RFECV) to limit the vocabulary used by the
    CustomVectorizer; optional step.

    The selected features are written back into the vectorizer's vocabulary (re-indexed
    densely from 0), so subsequent transforms only produce the surviving columns.

    :param records: list of TrainingRecords, entire training set.
    :param pipeline: the pipeline whose vocabulary should be modified; must contain
        "vec" (vectorizer) and "clf" (estimator) named steps.
    :param step: rate of features to eliminate at each step; the lower the number, the more steps.
    :param n_features: minimum number of features to select (if None or 0: half of the provided features).
    :param random_state: random state for deterministic results.
    :return: number of features used after elimination.
    """
    # TODO: enable logging (optional)
    t1 = time()
    X, y, tn = get_x_y_tn(records)
    vec = pipeline.named_steps["vec"]
    estimator = pipeline.named_steps["clf"]

    # get previous vocabulary (might be already compressed by compress_vocabulary)
    if not vec.vocabulary:
        vec.fit(X)
        # NOTE(review): assumes the custom vectorizer yields (name, index) pairs here — confirm
        previous_vocabulary = {name: i for name, i in vec.get_feature_names()}
    else:
        previous_vocabulary = vec.vocabulary
    if not n_features:
        n_features = len(previous_vocabulary) // 2

    X_trans = vec.transform(X)
    logger = get_logger(__name__, verb=True)
    split = StratifiedKFold(shuffle=True, n_splits=5, random_state=random_state)
    selector = RFECV(estimator, step=step, min_features_to_select=n_features, cv=split, n_jobs=5,
                     scoring=DEFAULT_SCORING_FUNCTION)
    selector = selector.fit(X=X_trans, y=y)

    original_size = len(previous_vocabulary)
    # indices of the selected (surviving) columns, densely re-numbered from 0
    selected_columns = selector.get_support().nonzero()[0]
    new_id = {old: new for new, old in enumerate(selected_columns)}
    vocabulary = {feature: new_id[i] for feature, i in previous_vocabulary.items() if i in new_id}
    size_after = selector.n_features_

    t2 = time()
    logger.info(f"{size_after} features were selected of {original_size} using Recursive Feature Elimination"
                f" in {np.round(t2 - t1, 2)} seconds.")

    # set vocabulary to vectorizer; fixed_vocabulary_ prevents re-fitting from overwriting it
    pipeline.named_steps["vec"].vocabulary = vocabulary
    pipeline.named_steps["vec"].vocabulary_ = vocabulary
    pipeline.named_steps["vec"].fixed_vocabulary_ = True

    return size_after
def multiple_step_rfecv(records: List[TrainingRecord], pipeline: Pipeline, n_features: int, step=(0.01, 0.01, 0.01, ),
                        random_state: np.random.RandomState = None):
    """
    Apply multiple step-sizes of RFECV in series; currently not used. Strategy might be
    problematic, no clear benefit. #TODO rethink or remove

    Stops early once the target number of features is reached.

    :param records: data used.
    :param pipeline: the base estimator used.
    :param n_features: goal number of features.
    :param step: iterable of step rates that should be applied in order.
    :param random_state: random state for deterministic results.
    :return: number of features remaining after the last elimination round,
        or None if `step` is empty.
    """
    # BUG FIX: previously an empty `step` iterable raised UnboundLocalError on return.
    size_after = None
    for s in step:
        size_after = recursive_feature_elimination(records, pipeline=pipeline, step=s, n_features=n_features,
                                                   random_state=random_state)
        if size_after == n_features:
            break  # target reached; no need for further (finer) elimination rounds
    return size_after