# Source code for IngeoML.feature_selection

# Copyright 2023 Mario Graff Guerrero

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Callable, Union
from numbers import Integral
from sklearn.feature_selection import SelectFromModel
from sklearn.utils._param_validation import Interval
from sklearn.base import is_classifier, clone
from sklearn.model_selection import check_cv
from sklearn.metrics import check_scoring
import numpy as np


class SelectFromModelCV(SelectFromModel):
    """Meta-transformer that selects the number of features by cross-validation.

    Wraps :class:`sklearn.feature_selection.SelectFromModel` and searches, via
    cross-validation, for the number of features (``max_features``) that
    maximizes the given score.  The search grid and the per-dimension scores
    are stored in :py:attr:`cv_results_`.

    >>> from IngeoML import SelectFromModelCV
    >>> from sklearn.svm import LinearSVC
    >>> from sklearn.datasets import load_wine
    >>> from sklearn.metrics import f1_score
    >>> import pandas as pd
    >>> import seaborn as sns
    >>> X, y = load_wine(return_X_y=True)
    >>> scoring = lambda y, hy: f1_score(y, hy, average='macro')
    >>> select = SelectFromModelCV(estimator=LinearSVC(dual='auto'),
    ...                            scoring=scoring, prefit=False).fit(X, y)

    The performance of the selection mechanism can be seen in the following figure

    >>> perf = select.cv_results_
    >>> _ = [{'d': k, 'macro-f1': v} for k, v in perf.items()]
    >>> df = pd.DataFrame(_)
    >>> sns.set_style('whitegrid')
    >>> sns.lineplot(df, x='d', y='macro-f1')

    .. figure:: SelectFromModelCV.png
    """

    _parameter_constraints: dict = {
        **SelectFromModel._parameter_constraints,
        "min_features_to_select": [Interval(Integral, 0, None, closed="neither")],
        "cv": ["cv_object"],
        "scoring": [None, str, callable],
        "n_jobs": [None, Integral],
    }
    # ``threshold`` is fixed to -inf internally (selection is driven purely by
    # ``max_features``), so it is not a user-facing parameter.
    _parameter_constraints.pop("threshold")

    def __init__(self, estimator: Any, *,
                 prefit: bool = False,
                 norm_order: Union[float, int] = 1,
                 max_features: Union[Callable[..., Any], int, None] = None,
                 importance_getter: Union[str, Callable[..., Any]] = 'auto',
                 min_features_to_select: int = 2,
                 cv=None,
                 scoring=None,
                 max_iter: int = 10) -> None:
        """
        :param estimator: The base estimator used to compute feature importances.
        :param prefit: Whether ``estimator`` is already fitted.
        :param norm_order: Norm order to aggregate multi-dimensional coefficients.
        :param max_features: Upper bound on the number of features considered;
            ``None`` means it is derived from the data in :py:meth:`fit`.
        :param importance_getter: How to extract importances from the estimator.
        :param min_features_to_select: Lower bound of the search grid.
        :param cv: Cross-validation splitting strategy (see ``check_cv``).
        :param scoring: Score to maximize; called as ``scoring(y, hy)``.
        :param max_iter: Number of grid points sampled between
            :py:attr:`min_features_to_select` and the number of features.
        """
        # threshold=-inf disables threshold-based filtering so that
        # ``max_features`` alone controls the selection.
        super().__init__(estimator, threshold=-np.inf,
                         prefit=prefit, norm_order=norm_order,
                         max_features=max_features,
                         importance_getter=importance_getter)
        self.min_features_to_select = min_features_to_select
        self.scoring = scoring
        self.cv = cv
        self.max_iter = max_iter

    @property
    def max_iter(self):
        """Number of points to sample between :py:attr:`min_features_to_select`
        and :py:attr:`max_features`"""
        return self._max_iter

    @max_iter.setter
    def max_iter(self, value):
        self._max_iter = value

    def fit(self, X, y, groups=None):
        """Choose the number of features by maximizing the cross-validated score.

        :param X: Feature matrix of shape ``(n_samples, n_features)``.
        :param y: Target vector of shape ``(n_samples,)``.
        :param groups: Optional group labels forwarded to the CV splitter.
        :return: ``self`` fitted with the best ``max_features`` found.
        """
        cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))
        scorer = check_scoring(self.estimator, scoring=self.scoring)
        # Bug fix: the lower bound of the grid was hard-coded to 2, silently
        # ignoring ``min_features_to_select``.  Default (2) keeps the original
        # behavior.
        lower = self.min_features_to_select
        if self.max_features is not None:
            max_features = self.max_features
        else:
            max_features = X.shape[1] - lower
        max_split = min(self.max_iter, X.shape[1] - lower, max_features)
        dims = np.linspace(lower, X.shape[1] - 1, max_split).astype(int)
        folds = [(tr, vs) for tr, vs in cv.split(X, y, groups=groups)]
        if not self.prefit:
            estimator = clone(self.estimator).fit(X, y)
        else:
            estimator = self.estimator
        scores = []
        for dim in dims:
            select = SelectFromModel(estimator=estimator,
                                     threshold=self.threshold,
                                     prefit=True,
                                     norm_order=self.norm_order,
                                     max_features=dim,
                                     importance_getter=self.importance_getter).fit(X, y)
            # Hoisted out of the fold loop: the selected columns do not depend
            # on the fold, so transform once per candidate dimension.
            Xt = select.transform(X)
            hy = np.empty_like(y)
            for tr, vs in folds:
                m = clone(self.estimator).fit(Xt[tr], y[tr])
                hy[vs] = m.predict(Xt[vs])
            # NOTE(review): the scorer is invoked as ``scorer(y, hy)``, which
            # matches the callable shown in the class docstring but NOT
            # sklearn's ``scorer(estimator, X, y)`` convention — string
            # ``scoring`` values would fail here; confirm intended usage.
            scores.append(scorer(y, hy))
        self.max_features = int(dims[np.argmax(scores)])
        # Map each candidate dimension to its cross-validated score.
        self.cv_results_ = dict(zip(dims, scores))
        super().fit(X, y)
        return self

    @property
    def cv(self):
        """Crossvalidation parameters"""
        return self._cv

    @cv.setter
    def cv(self, value):
        self._cv = value

    @property
    def scoring(self):
        """Score function"""
        return self._scoring

    @scoring.setter
    def scoring(self, value):
        self._scoring = value

    @property
    def min_features_to_select(self):
        """Minimum number of features to select"""
        return self._min_features_to_select

    @min_features_to_select.setter
    def min_features_to_select(self, value):
        self._min_features_to_select = value