Source code for pica.ml.feature_select

from time import time
from typing import List

from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold

from pica.structure.records import TrainingRecord
from pica.util.logging import get_logger
from pica.util.helpers import get_x_y_tn

import numpy as np

DEFAULT_STEP_SIZE = 0.0025
DEFAULT_SCORING_FUNCTION = 'balanced_accuracy'


[docs]def compress_vocabulary(records: List[TrainingRecord], pipeline: Pipeline):
    """
    Method to group features, that store redundant information, to avoid overfitting and speed up process (in some
    cases). Might be replaced or complemented by a feature selection method in future versions.

    Compressing vocabulary is optional, for the test dataset it took 30 seconds, while the time saved later on is not
    significant.

    :param records: a list of TrainingRecord objects.
    :param pipeline: the targeted pipeline where the vocabulary should be modified
    :return: nothing, sets the vocabulary for CountVectorizer step
    """

    X, y, tn = get_x_y_tn(records)  # we actually only need X
    vec = pipeline.named_steps["vec"]
    if not vec.vocabulary:
        vec.fit(X)
        names = [name for name, i in vec.get_feature_names()]
    else:
        names = sorted(vec.vocabulary, key=vec.vocabulary.get)

    X_trans = vec.transform(X)

    seen = {}
    new_vocabulary = {}
    new_index = 0
    for i in range(len(names)):
        column = X_trans.getcol(i).nonzero()[0]
        key = tuple(column)
        found_id = seen.get(key)
        if not found_id:
            seen[key] = new_index
            new_vocabulary[names[i]] = new_index
            new_index += 1
        else:
            new_vocabulary[names[i]] = found_id

    # set vocabulary to vectorizer
    pipeline.named_steps["vec"].vocabulary = new_vocabulary
    pipeline.named_steps["vec"].vocabulary_ = new_vocabulary
    pipeline.named_steps["vec"].fixed_vocabulary_ = True


[docs]def recursive_feature_elimination(records: List[TrainingRecord], pipeline: Pipeline, step: float = DEFAULT_STEP_SIZE,
                                  n_features: int = None, random_state: np.random.RandomState = None):
    """
    Function to apply RFE to limit the vocabulary used by the CustomVectorizer, optional step.

    :param records: list of TrainingRecords, entire training set.
    :param pipeline: the pipeline which vocabulary should be modified
    :param step: rate of features to eliminate at each step. the lower the number, the more steps
    :param n_features: number of features to select (if None: half of the provided features)
    :param random_state: random state for deterministic results
    :return: number of features used
    """

    # TODO: enable logging (optional)

    t1 = time()

    X, y, tn = get_x_y_tn(records)
    vec = pipeline.named_steps["vec"]
    estimator = pipeline.named_steps["clf"]

    # get previous vocabulary (might be already compressed)
    if not vec.vocabulary:
        vec.fit(X)
        previous_vocabulary = {name: i for name, i in vec.get_feature_names()}
    else:
        previous_vocabulary = vec.vocabulary

    if not n_features:
        n_features = len(previous_vocabulary) // 2

    X_trans = vec.transform(X)

    logger = get_logger(__name__, verb=True)
    split = StratifiedKFold(shuffle=True, n_splits=5, random_state=random_state)
    selector = RFECV(estimator, step=step, min_features_to_select=n_features, cv=split, n_jobs=5,
                     scoring=DEFAULT_SCORING_FUNCTION)
    selector = selector.fit(X=X_trans, y=y)

    original_size = len(previous_vocabulary)
    support = selector.get_support()
    support = support.nonzero()[0]
    new_id = {support[x]: x for x in range(len(support))}
    vocabulary = {feature: new_id[i] for feature, i in previous_vocabulary.items() if not new_id.get(i) is None}
    size_after = selector.n_features_

    t2 = time()

    logger.info(f"{size_after} features were selected of {original_size} using Recursive Feature Eliminiation"
                f" in {np.round(t2 - t1, 2)} seconds.")

    # set vocabulary to vectorizer
    pipeline.named_steps["vec"].vocabulary = vocabulary
    pipeline.named_steps["vec"].vocabulary_ = vocabulary
    pipeline.named_steps["vec"].fixed_vocabulary_ = True

    return size_after


[docs]def multiple_step_rfecv(records: List[TrainingRecord], pipeline: Pipeline, n_features: int, step=(0.01, 0.01, 0.01, ),
                        random_state: np.random.RandomState = None):
    """
    Function to apply multiple steps-sizes of RFECV in series, currently not used. Strategy might be problematic,
    no clear benefit. #TODO rethink or remove

    :param records: Data used
    :param pipeline: The base estimator used
    :param n_features: Goal number of features
    :param step: List of steps that should be applied
    :param random_state: random state for deterministic results
    :return:
    """
    # step = [0.0025]
    for s in step:
        size_after = recursive_feature_elimination(records, pipeline=pipeline, step=s, n_features=n_features,
                                                   random_state=random_state)
        if size_after == n_features:
            break

    return size_after