diff --git a/mlxtend/feature_selection/columns.py b/mlxtend/feature_selection/columns.py new file mode 100644 index 000000000..281955d67 --- /dev/null +++ b/mlxtend/feature_selection/columns.py @@ -0,0 +1,271 @@ + +from typing import NamedTuple, Any +from copy import copy + +import numpy as np +from sklearn.base import clone +from sklearn.preprocessing import (OneHotEncoder, + OrdinalEncoder) +from sklearn.utils.validation import check_is_fitted +from sklearn.exceptions import NotFittedError + + +class Column(NamedTuple): + + """ + A column extractor with a possible + encoder (following `sklearn` fit/transform template). + """ + + idx: Any + name: str + is_categorical: bool = False + is_ordinal: bool = False + columns: tuple = () + encoder: Any = None + + def get_columns(self, X, fit=False): + + """ + Extract associated column from X, + encoding it with `self.encoder` if not None. + + Parameters + ---------- + + X : array-like + X on which model matrix will be evaluated. + If a `pd.DataFrame` or `pd.Series`, variables that are of + categorical dtype will be treated as categorical. + + fit : bool + If True, fit `self.encoder` on corresponding + column. + + Returns + ------- + cols : `np.ndarray` + Evaluated columns -- if an encoder is used, + several columns may be produced. + + names : (str,) + Column names + """ + + cols = _get_column(self.idx, X, twodim=self.encoder is not None) + if fit: + self.fit_encoder(X) + + if self.encoder is not None: + cols = self.encoder.transform(cols) + cols = np.asarray(cols) + + names = self.columns + if hasattr(self.encoder, 'columns_'): + names = ['{0}[{1}]'.format(self.name, c) + for c in self.encoder.columns_] + if not names: + names = ['{0}[{1}]'.format(self.name, i) + for i in range(cols.shape[1])] + return cols, names + + def fit_encoder(self, X): + + """ + Fit `self.encoder`. + + Parameters + ---------- + X : array-like + X on which encoder will be fit. + + Returns + ------- + None + """ + cols = _get_column(self.idx, X, twodim=self.encoder is not None) + if self.encoder is not None: + try: + check_is_fitted(self.encoder) + except NotFittedError: + self.encoder.fit(cols) + return np.asarray(cols) + + +# private functions + + +def _get_column(idx, X, twodim=False, loc=True): + """ + Extract column `idx` from `X`, + optionally making it two-dimensional + as many sklearn encoders assume + two-dimensional input + """ + if isinstance(X, np.ndarray): + col = X[:, idx] + elif hasattr(X, 'loc'): + if loc: + col = X.loc[:, idx] + else: # use iloc instead + col = X.iloc[:, idx] + else: + raise ValueError('expecting an ndarray or a ' + + '"loc/iloc" methods, got %s' % str(X)) + if twodim and np.asarray(col).ndim == 1: + return np.asarray(col).reshape((-1, 1)) + return np.asarray(col) + + +def _get_column_info(X, + columns, + is_categorical, + is_ordinal, + default_encoders={ + 'ordinal': OrdinalEncoder(), + 'categorical': OneHotEncoder(drop='first', + sparse=False) + } + ): + + + """ + Compute a dictionary + of `Column` instances for each column + of `X`. Keys are `columns`. + + Categorical and ordinal columns use the + default encoding provided. 
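# Illustrative sketch (not part of the patch): how a `Column` with an encoder
# extracts and expands a categorical feature. It relies on the sklearn
# OneHotEncoder API used above (`sparse=False`, which newer sklearn versions
# spell `sparse_output=False`); the data and names here are made up.
import numpy as np
from sklearn.preprocessing import OneHotEncoder

X_demo = np.array([[0, 1.5],
                   [1, -0.3],
                   [2, 0.7],
                   [1, 2.2]])
col0 = Column(idx=0,
              name='X0',
              is_categorical=True,
              encoder=OneHotEncoder(drop='first', sparse=False))
# fit=True fits the not-yet-fitted encoder on column 0, then transforms it
cols, names = col0.get_columns(X_demo, fit=True)
# cols holds one indicator column per non-dropped level of X_demo[:, 0];
# names fall back to 'X0[0]', 'X0[1]', ... because OneHotEncoder has no `columns_`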
+ + """ + + column_info = {} + for i, col in enumerate(columns): + if type(col) == int: + name = f'X{col}' + else: + name = str(col) + Xcol = _get_column(col, X, twodim=True) + if is_categorical[i]: + if is_ordinal[i]: + encoder = clone(default_encoders['ordinal']) + encoder.fit(Xcol) + columns = ['Ord({0})'.format(col)] + else: + encoder = clone(default_encoders['categorical']) + cols = encoder.fit_transform(Xcol) + if hasattr(encoder, 'columns_'): + columns_ = encoder.columns_ + else: + columns_ = range(cols.shape[1]) + columns = ['Cat({0})[{1}]'.format(col, c) + for c in range(cols.shape[1])] + + column_info[col] = Column(col, + name, + is_categorical[i], + is_ordinal[i], + tuple(columns), + encoder) + else: + column_info[col] = Column(col, + name, + columns=(name,)) + return column_info + +# extracted from method of BaseHistGradientBoosting from +# https://github.com/scikit-learn/scikit-learn/blob/2beed55847ee70d363bdbfe14ee4401438fba057/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +# max_bins is ignored + + +def _check_categories(categorical_features, X): + """Check and validate categorical features in X + + Return + ------ + is_categorical : ndarray of shape (n_features,) or None, dtype=bool + Indicates whether a feature is categorical. If no feature is + categorical, this is None. + known_categories : list of size n_features or None + The list contains, for each feature: + - an array of shape (n_categories,) with the unique cat values + - None if the feature is not categorical + None if no feature is categorical. + """ + if categorical_features is None: + return None, None + + categorical_features = np.asarray(categorical_features) + + if categorical_features.size == 0: + return None, None + + if categorical_features.dtype.kind not in ('i', 'b'): + raise ValueError("categorical_features must be an array-like of " + "bools or array-like of ints.") + + n_features = X.shape[1] + + # check for categorical features as indices + if categorical_features.dtype.kind == 'i': + if (np.max(categorical_features) >= n_features + or np.min(categorical_features) < 0): + raise ValueError("categorical_features set as integer " + "indices must be in [0, n_features - 1]") + is_categorical = np.zeros(n_features, dtype=bool) + is_categorical[categorical_features] = True + else: + if categorical_features.shape[0] != n_features: + raise ValueError("categorical_features set as a boolean mask " + "must have shape (n_features,), got: " + f"{categorical_features.shape}") + is_categorical = categorical_features + + if not np.any(is_categorical): + return None, None + + # compute the known categories in the training data. We need to do + # that here instead of in the BinMapper because in case of early + # stopping, the mapper only gets a fraction of the training data. 
+ known_categories = [] + + for f_idx in range(n_features): + if is_categorical[f_idx]: + categories = np.array([v for v in + set(_get_column(f_idx, + X, + loc=False))]) + missing = [] + for c in categories: + try: + missing.append(np.isnan(c)) + except TypeError: # not a float + missing.append(False) + missing = np.array(missing) + if missing.any(): + categories = sorted(categories[~missing]) + + else: + categories = None + known_categories.append(categories) + + return is_categorical, known_categories + + +def _categorical_from_df(df): + """ + Find + """ + is_categorical = [] + is_ordinal = [] + for c in df.columns: + try: + is_categorical.append(df[c].dtype == 'category') + is_ordinal.append(df[c].cat.ordered) + except TypeError: + is_categorical.append(False) + is_ordinal.append(False) + is_categorical = np.array(is_categorical) + is_ordinal = np.array(is_ordinal) + + return is_categorical, is_ordinal diff --git a/mlxtend/feature_selection/generic_selector.py b/mlxtend/feature_selection/generic_selector.py new file mode 100644 index 000000000..3c82b1217 --- /dev/null +++ b/mlxtend/feature_selection/generic_selector.py @@ -0,0 +1,522 @@ +# Sebastian Raschka 2014-2020 +# mlxtend Machine Learning Library Extensions +# +# Algorithm for sequential feature selection. +# Author: Sebastian Raschka +# +# License: BSD 3 clause + +# Modified by Jonathan Taylor 2021 +# +# Derives from sequential_feature_selector +# but allows custom model search + +import types +import sys +from copy import deepcopy + +import numpy as np +import scipy as sp + +from sklearn.metrics import get_scorer +from sklearn.base import (clone, MetaEstimatorMixin) +from sklearn.model_selection import cross_val_score +from joblib import Parallel, delayed + +from ..externals.name_estimators import _name_estimators +from ..utils.base_compostion import _BaseXComposition + +class FeatureSelector(_BaseXComposition, MetaEstimatorMixin): + + """Feature Selection for Classification and Regression. + + Parameters + ---------- + estimator: scikit-learn classifier or regressor + strategy: Strategy + Description of search strategy: a named tuple + with fields `initial_state`, + `candidate_states`, `build_submodel`, + `check_finished` and `postprocess`. + + verbose: int (default: 0), level of verbosity to use in logging. + If 0, no output, + if 1 number of features in current set, if 2 detailed logging + including timestamp and cv scores at step. + scoring: str, callable, or None (default: None) + If None (default), uses 'accuracy' for sklearn classifiers + and 'r2' for sklearn regressors. + If str, uses a sklearn scoring metric string identifier, for example + {accuracy, f1, precision, recall, roc_auc} for classifiers, + {'mean_absolute_error', 'mean_squared_error'/'neg_mean_squared_error', + 'median_absolute_error', 'r2'} for regressors. + If a callable object or function is provided, it has to be conform with + sklearn's signature ``scorer(estimator, X, y)``; see + http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html + for more information. + cv: int (default: 5) + Integer or iterable yielding train, test splits. If cv is an integer + and `estimator` is a classifier (or y consists of integer class + labels) stratified k-fold. Otherwise regular k-fold cross-validation + is performed. No cross-validation if cv is None, False, or 0. + n_jobs: int (default: 1) + The number of CPUs to use for evaluating different feature subsets + in parallel. -1 means 'all CPUs'. 
+ pre_dispatch: int, or string (default: '2*n_jobs') + Controls the number of jobs that get dispatched + during parallel execution if `n_jobs > 1` or `n_jobs=-1`. + Reducing this number can be useful to avoid an explosion of + memory consumption when more jobs get dispatched than CPUs can process. + This parameter can be: + None, in which case all the jobs are immediately created and spawned. + Use this for lightweight and fast-running jobs, + to avoid delays due to on-demand spawning of the jobs + An int, giving the exact number of total jobs that are spawned + A string, giving an expression as a function + of n_jobs, as in `2*n_jobs` + clone_estimator: bool (default: True) + Clones estimator if True; works with the original estimator instance + if False. Set to False if the estimator doesn't + implement scikit-learn's set_params and get_params methods. + In addition, it is required to set cv=0, and n_jobs=1. + + Attributes + ---------- + results_: dict + A dictionary of selected feature subsets during the + selection, where the dictionary keys are + the states of these feature selector. The dictionary + values are dictionaries themselves with the following + keys: 'scores' (list individual cross-validation scores) + 'avg_score' (average cross-validation score) + + Notes + ----- + + See `Strategy` for explanation of the fields. + + Examples + ----------- + For usage examples, please see + TBD + + """ + def __init__(self, + estimator, + strategy, + verbose=0, + scoring=None, + cv=5, + n_jobs=1, + pre_dispatch='2*n_jobs', + clone_estimator=True, + fixed_features=None): + + self.estimator = estimator + self.strategy = strategy + self.pre_dispatch = pre_dispatch + # Want to raise meaningful error message if a + # cross-validation generator is inputted + if isinstance(cv, types.GeneratorType): + err_msg = ('Input cv is a generator object, which is not ' + 'supported. Instead please input an iterable yielding ' + 'train, test splits. This can usually be done by ' + 'passing a cross-validation generator to the ' + 'built-in list function. I.e. cv=list()') + raise TypeError(err_msg) + self.cv = cv + self.n_jobs = n_jobs + self.verbose = verbose + self.clone_estimator = clone_estimator + + if self.clone_estimator: + self.est_ = clone(self.estimator) + else: + self.est_ = self.estimator + self.scoring = scoring + + if scoring is None: + if self.est_._estimator_type == 'classifier': + scoring = 'accuracy' + elif self.est_._estimator_type == 'regressor': + scoring = 'r2' + else: + raise AttributeError('Estimator must ' + 'be a Classifier or Regressor.') + if isinstance(scoring, str): + self.scorer = get_scorer(scoring) + else: + self.scorer = scoring + + self.fitted = False + + # don't mess with this unless testing + self._TESTING_INTERRUPT_MODE = False + + @property + def named_estimators(self): + """ + Returns + ------- + List of named estimator tuples, like [('svc', SVC(...))] + """ + return _name_estimators([self.estimator]) + + def get_params(self, deep=True): + # + # Return estimator parameter names for GridSearch support. + # + return self._get_params('named_estimators', deep=deep) + + def set_params(self, **params): + """Set the parameters of this estimator. + Valid parameter keys can be listed with ``get_params()``. + + Returns + ------- + self + """ + self._set_params('estimator', 'named_estimators', **params) + return self + + def fit(self, X, y, groups=None, **fit_params): + """Perform feature selection and learn model from training data. 
+ + Parameters + ---------- + X: {array-like, sparse matrix}, shape = [n_samples, n_features] + Training vectors, where n_samples is the number of samples and + n_features is the number of features. + New in v 0.13.0: pandas DataFrames are now also accepted as + argument for X. + y: array-like, shape = [n_samples] + Target values. + New in v 0.13.0: pandas DataFrames are now also accepted as + argument for y. + groups: array-like, with shape (n_samples,), optional + Group labels for the samples used while splitting the dataset into + train/test set. Passed to the fit method of the cross-validator. + fit_params: various, optional + Additional parameters that are being passed to the estimator. + For example, `sample_weights=weights`. + + Returns + ------- + self: object + + """ + + # reset from a potential previous fit run + self.interrupted_ = False + self.finished_ = False + + results_ = [] + + # unpack the strategy + + (initial_state, + candidate_states, + build_submodel, + check_finished, + postprocess) = self.strategy + + # fit initial model + + _state, _scores = _calc_score(self.estimator, + self.scorer, + build_submodel, + X, + y, + initial_state, + groups=groups, + cv=self.cv, + pre_dispatch=self.pre_dispatch, + **fit_params) + + # keep a running track of the best state + + iteration = 0 + self.path_ = [deepcopy((_state, iteration, _scores))] + cur = best = (_state, iteration, _scores) + + self.update_results_check(results_, + self.path_, + cur, + [(_state, iteration, _scores)], + check_finished) + iteration += 1 + try: + while not self.finished_: + + batch_results = self._batch(iteration, + cur[0], + candidate_states(cur[0]), + build_submodel, + X, + y, + groups=groups, + **fit_params) + iteration += 1 + cur, best_, self.finished_ = self.update_results_check(results_, + self.path_, + best, + batch_results, + check_finished) + if best_: + best = best_ + self.path_.append(deepcopy(cur)) + + if self._TESTING_INTERRUPT_MODE: + raise KeyboardInterrupt + except KeyboardInterrupt: + self.interrupted_ = True + sys.stderr.write('\nSTOPPING EARLY DUE TO KEYBOARD INTERRUPT...') + + self.selected_state_, self.results_ = postprocess(results_) + + self.fitted = True + return self + + def transform(self, X): + """Reduce X to its most important features. + + Parameters + ---------- + X: {array-like, sparse matrix}, shape = [n_samples, n_features] + Training vectors, where n_samples is the number of samples and + n_features is the number of features. + New in v 0.13.0: pandas DataFrames are now also accepted as + argument for X. + + Returns + ------- + Reduced feature subset of X, shape={n_samples, k_features} + + """ + self._check_fitted() + build_submodel = self.strategy.build_submodel + return build_submodel(X, self.selected_state_) + + def fit_transform(self, + X, + y, + groups=None, + **fit_params): + """Fit to training data then reduce X to its most important features. + + Parameters + ---------- + X: {array-like, sparse matrix}, shape = [n_samples, n_features] + Training vectors, where n_samples is the number of samples and + n_features is the number of features. + New in v 0.13.0: pandas DataFrames are now also accepted as + argument for X. + y: array-like, shape = [n_samples] + Target values. + New in v 0.13.0: a pandas Series are now also accepted as + argument for y. + groups: array-like, with shape (n_samples,), optional + Group labels for the samples used while splitting the dataset into + train/test set. Passed to the fit method of the cross-validator. 
+ fit_params: various, optional + Additional parameters that are being passed to the estimator. + For example, `sample_weights=weights`. + + Returns + ------- + Reduced feature subset of X, shape={n_samples, k_features} + + """ + self.fit(X, y, groups=groups, **fit_params) + return self.transform(X) + + def get_metric_dict(self, confidence_interval=0.95): + """Return metric dictionary + + Parameters + ---------- + confidence_interval: float (default: 0.95) + A positive float between 0.0 and 1.0 to compute the confidence + interval bounds of the CV score averages. + + Returns + ---------- + Dictionary with items where each dictionary value is a list + with the number of iterations (number of feature subsets) as + its length. The dictionary keys corresponding to these lists + are as follows: + 'state': tuple of the indices of the feature subset + 'scores': list with individual CV scores + 'avg_score': of CV average scores + 'std_dev': standard deviation of the CV score average + 'std_err': standard error of the CV score average + 'ci_bound': confidence interval bound of the CV score average + + """ + self._check_fitted() + fdict = deepcopy(self.results_) + + def _calc_confidence(ary, confidence=0.95): + std_err = sp.stats.sem(ary) + bound = std_err * sp.stats.t._ppf((1 + confidence) / 2.0, len(ary)) + return bound, std_err + + for k in fdict: + std_dev = np.std(self.results_[k]['scores']) + bound, std_err = self._calc_confidence( + self.results_[k]['scores'], + confidence=confidence_interval) + fdict[k]['ci_bound'] = bound + fdict[k]['std_dev'] = std_dev + fdict[k]['std_err'] = std_err + return fdict + + # private methods + + def _batch(self, + iteration, + cur_state, + candidates, + build_submodel, + X, + y, + groups=None, + **fit_params): + + results = [] + + if candidates is not None: + + parallel = Parallel(n_jobs=self.n_jobs, + verbose=self.verbose, + pre_dispatch=self.pre_dispatch) + + work = parallel(delayed(_calc_score) + (self.estimator, + self.scorer, + build_submodel, + X, + y, + state, + groups=groups, + cv=self.cv, + pre_dispatch=self.pre_dispatch, + **fit_params) + for state in candidates) + + for state, scores in work: + results.append((state, iteration, scores)) + + return results + + def _check_fitted(self): + if not self.fitted: + raise AttributeError('{} has not been fitted yet.'.format(self.__class__)) + + def update_results_check(self, + results, + path, + best, + batch_results, + check_finished): + """ + Update `results_` with current batch + and return a boolean about whether + we should continue or not. + + Parameters + ---------- + + results: dict + Dictionary of all results. + Keys are state with values + dictionaries having keys + `scores`, `avg_scores`. + + best : (state, score) + Current best state and score. + + batch_results: dict + Dictionary of results from a batch fit. + Keys are tate with values + dictionaries having keys + `scores`, `avg_scores`. + + check_finished: callable + Callable taking three arguments + `(results, best_state, batch_results)` which determines if + the state generator should step. Often will just check + if there is a better score than that at current best state + but can use entire set of results if desired. + + Returns + ------- + + best_state: object + State that had the best `avg_score` + + fitted: bool + If batch_results is empty, fitting + has terminated so return True. + Otherwise False. 
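# Worked sketch (not part of the patch) of the per-subset confidence bound
# that `get_metric_dict` reports; the scores below are made-up CV results.
# The public `stats.t.ppf` is used here; it is equivalent to the private
# `_ppf` alias used above for valid quantiles.
import numpy as np
from scipy import stats

scores = np.array([0.81, 0.79, 0.84, 0.80, 0.83])
confidence = 0.95
std_err = stats.sem(scores)
ci_bound = std_err * stats.t.ppf((1 + confidence) / 2.0, len(scores))
# get_metric_dict stores these under the 'ci_bound', 'std_err' and
# 'std_dev' keys for each visited state.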
+ + """ + + finished = batch_results == [] + + if not finished: + + (cur, + finished) = check_finished(results, + path, + best, + batch_results) + + results.extend(batch_results) + + if cur[2] is not None and np.nanmean(cur[2]) > np.nanmean(best[2]): + best = cur + + return (cur, + best, + finished) + else: + return ((None, None, None), + (None, None, None), + True) + + +# private functions + +def _calc_score(estimator, + scorer, + build_submodel, + X, + y, + state, + groups=None, + cv=None, + pre_dispatch='2*n_jobs', + **fit_params): + + X_state = build_submodel(X, state) + + if cv: + scores = cross_val_score(estimator, + X_state, + y, + groups=groups, + cv=cv, + scoring=scorer, + n_jobs=1, + pre_dispatch=pre_dispatch, + fit_params=fit_params) + else: + estimator.fit(X_state, + y, + **fit_params) + scores = np.array([scorer(estimator, + X_state, + y)]) + return state, scores + diff --git a/mlxtend/feature_selection/strategy.py b/mlxtend/feature_selection/strategy.py new file mode 100644 index 000000000..719247e60 --- /dev/null +++ b/mlxtend/feature_selection/strategy.py @@ -0,0 +1,771 @@ +# Jonathan Taylor 2021 +# mlxtend Machine Learning Library Extensions +# +# Objects describing search strategy +# Author: Jonathan Taylor +# + +from typing import NamedTuple, Any, Callable +from itertools import chain, combinations +from functools import partial + +import numpy as np +from sklearn.utils import check_random_state + +from .columns import (_get_column_info, + Column, + _categorical_from_df, + _check_categories) + +class Strategy(NamedTuple): + + """ + initial_state: object + Initial state of feature selector. + state_generator: callable + Callable taking single argument `state` and returning + candidates for next batch of scores to be calculated. + build_submodel: callable + Callable taking two arguments `(X, state)` that returns + model matrix represented by `state`. + check_finished: callable + Callable taking three arguments + `(results, best_state, batch_results)` which determines if + the state generator should step. Often will just check + if there is a better score than that at current best state + but can use entire set of results if desired. + """ + + initial_state: Any + candidate_states: Callable + build_submodel: Callable + check_finished: Callable + postprocess: Callable + + +class MinMaxCandidates(object): + + def __init__(self, + X, + min_features=1, + max_features=1, + fixed_features=None, + custom_feature_names=None, + categorical_features=None): + """ + Parameters + ---------- + X: {array-like, sparse matrix}, shape = [n_samples, n_features] + Training vectors, where n_samples is the number of samples and + n_features is the number of features. + New in v 0.13.0: pandas DataFrames are now also accepted as + argument for X. + min_features: int (default: 1) + Minumum number of features to select + max_features: int (default: 1) + Maximum number of features to select + fixed_features: column identifiers, default=None + Subset of features to keep. Stored as `self.columns[fixed_features]` + where `self.columns` will correspond to columns if X is a `pd.DataFrame` + or an array of integers if X is an `np.ndarray` + custom_feature_names: None or tuple (default: tuple) + Custom feature names for `self.k_feature_names` and + `self.subsets_[i]['feature_names']`. + (new in v 0.13.0) + categorical_features: array-like of {bool, int} of shape (n_features) + or shape (n_categorical_features,), default=None. + Indicates the categorical features. 
+ + - None: no feature will be considered categorical. + - boolean array-like: boolean mask indicating categorical features. + - integer array-like: integer indices indicating categorical + features. + + For each categorical feature, there must be at most `max_bins` unique + categories, and each categorical value must be in [0, max_bins -1]. + + """ + + if hasattr(X, 'loc'): + X_ = X.values + is_categorical, is_ordinal = _categorical_from_df(X) + self.columns = X.columns + else: + X_ = X + is_categorical = _check_categories(categorical_features, + X_)[0] + if is_categorical is None: + is_categorical = np.zeros(X_.shape[1], np.bool) + is_ordinal = np.zeros_like(is_categorical) + self.columns = np.arange(X.shape[1]) + + nfeatures = X_.shape[1] + + if (not isinstance(max_features, int) or + (max_features > nfeatures or max_features < 0)): + raise AttributeError('max_features must be' + ' <= than %d and >= 0' % + (nfeatures + 1)) + + if (not isinstance(min_features, int) or + (min_features > nfeatures or min_features < 0)): + raise AttributeError('min_features must be' + ' <= %d and >= 0' + % (nfeatures + 1)) + + if max_features < min_features: + raise AttributeError('min_features must be <= max_features') + + self.min_features, self.max_features = min_features, max_features + + # make a mapping from the column info to columns in + # implied design matrix + + self.column_info_ = _get_column_info(X, + self.columns, + is_categorical, + is_ordinal) + self.column_map_ = {} + idx = 0 + for col in self.columns: + l = self.column_info_[col].columns + self.column_map_[col] = range(idx, idx + + len(l)) + idx += len(l) + if (custom_feature_names is not None + and len(custom_feature_names) != nfeatures): + raise ValueError('If custom_feature_names is not None, ' + 'the number of elements in custom_feature_names ' + 'must equal %d the number of columns in X.' % idx) + if custom_feature_names is not None: + # recompute the Column info using custom_feature_names + for i, col in enumerate(self.columns): + cur_col = self.column_info_[col] + new_name = custom_feature_names[i] + old_name = cur_col.name + self.column_info_[col] = Column(col, + new_name, + col.is_categorical, + col.is_ordinal, + tuple([n.replace(old_name, + new_name) for n in col.columns]), + col.encoder) + + if fixed_features is not None: + self.fixed_features = set([self.column_info_[f].idx for f in fixed_features]) + else: + self.fixed_features = set([]) + + def candidate_states(self, state): + """ + Produce candidates for fitting. + + Parameters + ---------- + + state: ignored + + Returns + ------- + candidates: iterator + A generator of (indices, label) where indices + are columns of X and label is a name for the + given model. The iterator cycles through + all combinations of columns of nfeature total + of size ranging between min_features and max_features. + If appropriate, restricts combinations to include + a set of fixed features. + Models are labeled with a tuple of the feature names. + The names of the columns default to strings of integers + from range(nfeatures). + + """ + + def chain_(i): + return (c for c in combinations(self.columns, r=i) + if self.fixed_features.issubset(c)) + + candidates = chain.from_iterable(chain_(i) for i in + range(self.min_features, + self.max_features+1)) + return candidates + + def check_finished(self, + results, + path, + best, + batch_results): + """ + Check if we should continue or not. + For exhaustive search we stop because + all models are fit in a single batch. 
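# Illustrative sketch (not part of the patch): what `candidate_states`
# enumerates for an exhaustive search. The column labels are made up.
from itertools import chain, combinations

columns, fixed = range(4), {2}          # four features, feature 2 is fixed
min_features, max_features = 1, 2

candidates = chain.from_iterable(
    (c for c in combinations(columns, r=i) if fixed.issubset(c))
    for i in range(min_features, max_features + 1))
print(list(candidates))                 # [(2,), (0, 2), (1, 2), (2, 3)]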
+ """ + new_best = (None, None, None) + batch_best_score = -np.inf + + for (state, iteration, scores) in batch_results: + avg_score = np.nanmean(scores) + if avg_score > batch_best_score: + new_best = (state, iteration, scores) + batch_best_score = np.nanmean(scores) + + return new_best, True + + +class Stepwise(MinMaxCandidates): + + def __init__(self, + X, + direction, + min_features=1, + max_features=1, + fixed_features=None, + custom_feature_names=None, + categorical_features=None): + """ + Parameters + ---------- + X: {array-like, sparse matrix}, shape = [n_samples, n_features] + Training vectors, where n_samples is the number of samples and + n_features is the number of features. + New in v 0.13.0: pandas DataFrames are now also accepted as + argument for X. + direction: str + One of ['forward', 'backward', 'both'] + min_features: int (default: 1) + Minumum number of features to select + max_features: int (default: 1) + Maximum number of features to select + fixed_features: column identifiers, default=None + Subset of features to keep. Stored as `self.columns[fixed_features]` + where `self.columns` will correspond to columns if X is a `pd.DataFrame` + or an array of integers if X is an `np.ndarray` + custom_feature_names: None or tuple (default: tuple) + Custom feature names for `self.k_feature_names` and + `self.subsets_[i]['feature_names']`. + (new in v 0.13.0) + categorical_features: array-like of {bool, int} of shape (n_features) + or shape (n_categorical_features,), default=None. + Indicates the categorical features. + + - None: no feature will be considered categorical. + - boolean array-like: boolean mask indicating categorical features. + - integer array-like: integer indices indicating categorical + features. + + For each categorical feature, there must be at most `max_bins` unique + categories, and each categorical value must be in [0, max_bins -1]. + + """ + + self.direction = direction + + MinMaxCandidates.__init__(self, + X, + min_features, + max_features, + fixed_features, + custom_feature_names, + categorical_features) + + def candidate_states(self, state): + """ + Produce candidates for fitting. + For stepwise search this depends on the direction. + + If 'forward', all columns not in the current state + are added (maintaining an upper limit on the number of columns + at `self.max_features`). + + If 'backward', all columns not in the current state + are dropped (maintaining a lower limit on the number of columns + at `self.min_features`). + + All candidates include `self.fixed_features` if any. + + Parameters + ---------- + + state: ignored + + Returns + ------- + candidates: iterator + A generator of (indices, label) where indices + are columns of X and label is a name for the + given model. The iterator cycles through + all combinations of columns of nfeature total + of size ranging between min_features and max_features. + If appropriate, restricts combinations to include + a set of fixed features. + Models are labeled with a tuple of the feature names. + The names of the columns default to strings of integers + from range(nfeatures). 
+ + """ + + state = set(state) + if len(state) < self.max_features: # union + forward = (tuple(sorted(state | set([c]))) for c in self.columns if (c not in state and + self.fixed_features.issubset(state | set([c])))) + else: + forward = [] + + if len(state) > self.min_features: # symmetric difference + backward = (tuple(sorted(state ^ set([c]))) for c in self.columns if (c in state and + self.fixed_features.issubset(state ^ set([c])))) + else: + backward = [] + + if self.direction == 'forward': + return forward + elif self.direction == 'backward': + return backward + else: + return chain.from_iterable([forward, backward]) + + @staticmethod + def first_peak(X, + direction='forward', + min_features=1, + max_features=1, + fixed_features=None, + initial_features=[], + custom_feature_names=None, + categorical_features=None, + parsimonious=True): + """ + Strategy that stops when no improvement + in score is possible. + + Parameters + ---------- + X: {array-like, sparse matrix}, shape = [n_samples, n_features] + Training vectors, where n_samples is the number of samples and + n_features is the number of features. + New in v 0.13.0: pandas DataFrames are now also accepted as + argument for X. + direction: str + One of ['forward', 'backward', 'both'] + min_features: int (default: 1) + Minumum number of features to select + max_features: int (default: 1) + Maximum number of features to select + fixed_features: column identifiers, default=None + Subset of features to keep. Stored as `self.columns[fixed_features]` + where `self.columns` will correspond to columns if X is a `pd.DataFrame` + or an array of integers if X is an `np.ndarray` + initial_features: column identifiers, default=[] + Subset of features to be used to initialize. + custom_feature_names: None or tuple (default: tuple) + Custom feature names for `self.k_feature_names` and + `self.subsets_[i]['feature_names']`. + (new in v 0.13.0) + categorical_features: array-like of {bool, int} of shape (n_features) + or shape (n_categorical_features,), default=None. + Indicates the categorical features. + + - None: no feature will be considered categorical. + - boolean array-like: boolean mask indicating categorical features. + - integer array-like: integer indices indicating categorical + features. + + For each categorical feature, there must be at most `max_bins` unique + categories, and each categorical value must be in [0, max_bins -1]. + + parsimonious: bool + If True, use the 1sd rule: among the shortest models + within one standard deviation of the best score + pick the one with the best average score. 
+ + Returns + ------- + + strategy : NamedTuple + + """ + + step = Stepwise(X, + direction, + min_features, + max_features, + fixed_features, + custom_feature_names, + categorical_features) + + # if any categorical features or an intercept + # is included then we must + # create a new design matrix + + build_submodel = partial(_build_submodel, step.column_info_) + + # pick an initial state + + initial_state = tuple(initial_features) + + if not step.fixed_features.issubset(initial_features): + raise ValueError('initial_features should contain %s' % str(step.fixed_features)) + + if not parsimonious: + _postprocess = _postprocess_best + else: + _postprocess = _postprocess_best_1sd + + return Strategy(initial_state, + step.candidate_states, + build_submodel, + first_peak, + _postprocess) + + @staticmethod + def fixed_size(X, + model_size, + direction='forward', + min_features=1, + max_features=1, + fixed_features=None, + initial_features=[], + custom_feature_names=None, + categorical_features=None, + parsimonious=True): + """ + Strategy that stops first time + a given model size is reached. + + Parameters + ---------- + X: {array-like, sparse matrix}, shape = [n_samples, n_features] + Training vectors, where n_samples is the number of samples and + n_features is the number of features. + New in v 0.13.0: pandas DataFrames are now also accepted as + argument for X. + direction: str + One of ['forward', 'backward', 'both'] + min_features: int (default: 1) + Minumum number of features to select + max_features: int (default: 1) + Maximum number of features to select + fixed_features: column identifiers, default=None + Subset of features to keep. Stored as `self.columns[fixed_features]` + where `self.columns` will correspond to columns if X is a `pd.DataFrame` + or an array of integers if X is an `np.ndarray` + initial_features: column identifiers, default=[] + Subset of features to be used to initialize. + custom_feature_names: None or tuple (default: tuple) + Custom feature names for `self.k_feature_names` and + `self.subsets_[i]['feature_names']`. + (new in v 0.13.0) + categorical_features: array-like of {bool, int} of shape (n_features) + or shape (n_categorical_features,), default=None. + Indicates the categorical features. + + - None: no feature will be considered categorical. + - boolean array-like: boolean mask indicating categorical features. + - integer array-like: integer indices indicating categorical + features. + + For each categorical feature, there must be at most `max_bins` unique + categories, and each categorical value must be in [0, max_bins -1]. + + parsimonious: bool + If True, use the 1sd rule: among the shortest models + within one standard deviation of the best score + pick the one with the best average score. 
+ + Returns + ------- + + strategy : NamedTuple + + """ + + step = Stepwise(X, + direction, + min_features, + max_features, + fixed_features, + custom_feature_names, + categorical_features) + + # if any categorical features or an intercept + # is included then we must + # create a new design matrix + + build_submodel = partial(_build_submodel, step.column_info_) + + # pick an initial state + + initial_state = tuple(initial_features) + + if not step.fixed_features.issubset(initial_features): + raise ValueError('initial_features should contain %s' % str(step.fixed_features)) + + if not parsimonious: + _postprocess = _postprocess_best + else: + _postprocess = _postprocess_best_1sd + + return Strategy(initial_state, + step.candidate_states, + build_submodel, + partial(fixed_size, model_size), + partial(_postprocess_fixed_size, model_size)) + + + +def exhaustive(X, + min_features=1, + max_features=1, + fixed_features=None, + custom_feature_names=None, + categorical_features=None, + parsimonious=True): + """ + Parameters + ---------- + X: {array-like, sparse matrix}, shape = [n_samples, n_features] + Training vectors, where n_samples is the number of samples and + n_features is the number of features. + New in v 0.13.0: pandas DataFrames are now also accepted as + argument for X. + min_features: int (default: 1) + Minumum number of features to select + max_features: int (default: 1) + Maximum number of features to select + fixed_features: column identifiers, default=None + Subset of features to keep. Stored as `self.columns[fixed_features]` + where `self.columns` will correspond to columns if X is a `pd.DataFrame` + or an array of integers if X is an `np.ndarray` + custom_feature_names: None or tuple (default: tuple) + Custom feature names for `self.k_feature_names` and + `self.subsets_[i]['feature_names']`. + (new in v 0.13.0) + categorical_features: array-like of {bool, int} of shape (n_features) + or shape (n_categorical_features,), default=None. + Indicates the categorical features. + + - None: no feature will be considered categorical. + - boolean array-like: boolean mask indicating categorical features. + - integer array-like: integer indices indicating categorical + features. + + For each categorical feature, there must be at most `max_bins` unique + categories, and each categorical value must be in [0, max_bins -1]. + + parsimonious: bool + If True, use the 1sd rule: among the shortest models + within one standard deviation of the best score + pick the one with the best average score. + + Returns + ------- + + initial_state: tuple + (column_names, feature_idx) + + state_generator: callable + Object that proposes candidates + based on current state. Takes a single + argument `state` + + build_submodel: callable + Candidate generator that enumerate + all valid subsets of columns. + + check_finished: callable + Check whether to stop. Takes two arguments: + `best_result` a dict with keys ['scores', 'avg_score']; + and `state`. 
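# Hypothetical end-to-end sketch (not part of the patch; it mirrors the unit
# tests added below): exhaustive search over subsets of up to 3 features.
import numpy as np
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection.generic_selector import FeatureSelector
from mlxtend.feature_selection.strategy import exhaustive

X = np.random.standard_normal((100, 5))
y = np.random.standard_normal(100)

strategy = exhaustive(X, max_features=3)
selector = FeatureSelector(LinearRegression(), strategy, cv=3)
selector.fit(X, y)
print(selector.selected_state_)   # column subset chosen by the postprocess rule
X_sel = selector.transform(X)     # design matrix restricted to that subset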
+ + """ + + strategy = MinMaxCandidates(X, + min_features, + max_features, + fixed_features, + custom_feature_names, + categorical_features) + + # if any categorical features or an intercept + # is included then we must + # create a new design matrix + + build_submodel = partial(_build_submodel, strategy.column_info_) + + if strategy.fixed_features: + initial_features = sorted(strategy.fixed_features) + else: + initial_features = range(strategy.min_features) + initial_state = tuple(initial_features) + + if not parsimonious: + _postprocess = _postprocess_best + else: + _postprocess = _postprocess_best_1sd + + return Strategy(initial_state, + strategy.candidate_states, + build_submodel, + strategy.check_finished, + _postprocess) + +def first_peak(results, + path, + best, + batch_results): + """ + Check if we should continue or not. + + For first_peak search we stop if we cannot improve + over our current best score. + + """ + new_best = (None, None, None) + batch_best_score = -np.inf + + for state, iteration, scores in batch_results: + avg_score = np.nanmean(scores) + if avg_score > batch_best_score: + new_best = (state, iteration, scores) + batch_best_score = avg_score + + any_better = batch_best_score > np.nanmean(best[2]) + return new_best, not any_better + +def fixed_size(model_size, + results, + path, + best, + batch_results): + """ + Check if we should continue or not. + + For first_peak search we stop if we cannot improve + over our current best score. + + """ + new_best = (None, None, None) + batch_best_score = -np.inf + + for state, iteration, scores in batch_results: + avg_score = np.nanmean(scores) + if avg_score > batch_best_score: + new_best = (state, iteration, scores) + batch_best_score = avg_score + + any_better = batch_best_score > np.nanmean(best[2]) + return new_best, len(new_best[0]) == model_size + + +# private functions + + +def _build_submodel(column_info, X, cols): + if cols: + return np.column_stack([column_info[col].get_columns(X, fit=True)[0] for col in cols]) + else: + return np.zeros((X.shape[0], 1)) + +def _postprocess_fixed_size(model_size, results): + """ + Find the best state from `results` + based on `avg_score`. + + Return best state and results + """ + + best_state = None + best_score = -np.inf + + new_results = {} + for (state, iteration, scores) in results: + new_state = tuple(state) # [v.name for v in state]) + avg_score = np.nanmean(scores) + if avg_score > best_score and len(new_state) == model_size: + best_state = new_state + best_score = avg_score + new_results[new_state] = avg_score + return best_state, new_results + +def _postprocess_best(results): + """ + Find the best state from `results` + based on `avg_score`. + + Return best state and results + """ + + best_state = None + best_score = -np.inf + + new_results = {} + for (state, iteration, scores) in results: + new_state = tuple(state) # [v.name for v in state]) + avg_score = np.nanmean(scores) + if avg_score > best_score: + best_state = new_state + best_score = avg_score + new_results[new_state] = avg_score + return best_state, new_results + + +def _postprocess_best_1sd(results): + """ + Find the best state from `results` + based on np.nanmean(scores) + + Find models satisfying the 1sd rule + and choose the state with best score + among the smallest such states. 
+ + Return best state and results + + Models are compared by length of state + """ + + best_state = None + best_score = -np.inf + + for state, iteration, scores in results: + avg_score = np.nanmean(scores) + if avg_score > best_score: + best_state = state + best_score = avg_score + + states_1sd = [] + + for (state, iteration, scores) in results: + if len(state) >= len(best_state): + continue + _limit = (np.nanmean(scores) + + np.nanstd(scores) / np.sqrt(scores.shape[0])) + if _limit >= best_score: + states_1sd.append((state, iteration, scores)) + + shortest_1sd = np.inf + + for (state, iteration, scores) in states_1sd: + if len(state) < shortest_1sd: + shortest_1sd = len(state) + + best_state_1sd = None + best_score_1sd = -np.inf + + for (state, iteration, scores) in states_1sd: + avg_score = np.nanmean(scores) + if ((len(state) == shortest_1sd) + and (avg_score <= + best_score_1sd)): + best_state_1sd = state + best_score_1sd = avg_score + + new_results = {} + for (state, iteration, scores) in results: + new_state = tuple(state) #[v.name for v in state]) + new_results[new_state] = np.nanmean(scores) + if best_state_1sd: + best_state_1sd = tuple([v.name for v in best_state_1sd]) + return best_state_1sd, new_results + else: + best_state = tuple(best_state) # [v.name for v in best_state]) + return best_state, new_results diff --git a/mlxtend/feature_selection/tests/test_generic_selector.py b/mlxtend/feature_selection/tests/test_generic_selector.py new file mode 100644 index 000000000..1164dd539 --- /dev/null +++ b/mlxtend/feature_selection/tests/test_generic_selector.py @@ -0,0 +1,418 @@ +from itertools import product + +import pytest +import warnings + +import numpy as np +from sklearn.linear_model import LinearRegression +from mlxtend.feature_selection.generic_selector import FeatureSelector +from mlxtend.feature_selection.strategy import exhaustive, Stepwise + + +have_pandas = True +try: + import pandas as pd +except ImportError: + have_pandas = False + +def test_exhaustive_selector(): + + n, p = 100, 5 + X = np.random.standard_normal((n, p)) + Y = np.random.standard_normal(n) + + strategy = exhaustive(X, + max_features=4, + fixed_features=[2,3]) + + exhaustive_selector = FeatureSelector(LinearRegression(), + strategy) + + exhaustive_selector.fit(X, Y) + + strategy = exhaustive(X, + max_features=4) + + exhaustive_selector = FeatureSelector(LinearRegression(), + strategy) + + exhaustive_selector.fit(X, Y) + exhaustive_selector.transform(X) + exhaustive_selector.fit_transform(X, Y) + + # test CV + + strategy = exhaustive(X, + max_features=5) + + exhaustive_selector = FeatureSelector(LinearRegression(), + strategy, + cv=3) + # test CV, verbose + + strategy = exhaustive(X, + max_features=3) + + for cv, verbose in product([None, 3], + [0,1,2]): + + exhaustive_selector = FeatureSelector(LinearRegression(), + strategy, + cv=cv, + verbose=verbose) + + exhaustive_selector.fit(X, Y) + print(cv, verbose, exhaustive_selector.selected_state_) + +def test_exhaustive_categorical(): + + n, p = 100, 5 + X = np.random.standard_normal((n, p)) + X[:,0] = np.random.choice(range(5), (n,), replace=True) + Y = np.random.standard_normal(n) + + categorical_features = [True] + [False]*4 + strategy = exhaustive(X, + max_features=4, + fixed_features=[2,3], + categorical_features=categorical_features) + + exhaustive_selector = FeatureSelector(LinearRegression(), + strategy) + + exhaustive_selector.fit(X, Y) + + +def test_step_categorical(): + + n, p = 100, 10 + X = np.random.standard_normal((n, p)) + X[:,0] = 
np.random.choice(range(5), (n,), replace=True) + Y = np.random.standard_normal(n) + + categorical_features = [True] + [False]*9 + strategy = Stepwise.first_peak(X, + max_features=4, + fixed_features=[2,3], + initial_features=[2,3], + categorical_features=categorical_features) + + step_selector = FeatureSelector(LinearRegression(), + strategy) + step_selector.fit(X, Y) + +def test_step_scoring(): + + n, p = 100, 10 + + def AIC(estimator, X, Y): + Yhat = estimator.predict(X) + return 0.5 * ((Y - Yhat)**2).sum() + 2 * X.shape[1] + + X = np.random.standard_normal((n, p)) + X[:,0] = np.random.choice(range(5), (n,), replace=True) + Y = np.random.standard_normal(n) + + categorical_features = [True] + [False]*9 + strategy = Stepwise.first_peak(X, + max_features=4, + fixed_features=[2,3], + initial_features=[2,3], + categorical_features=categorical_features) + + step_selector = FeatureSelector(LinearRegression(), + strategy, + scoring='neg_mean_squared_error') + step_selector.fit(X, Y) + + step_selector = FeatureSelector(LinearRegression(), + strategy, + scoring='neg_mean_squared_error', + cv=None) + step_selector.fit(X, Y) + + step_selector = FeatureSelector(LinearRegression(), + strategy, + scoring=AIC, + cv=None) + step_selector.fit(X, Y) + +def test_step_bigger(): + + n, p = 100, 20 + X = np.random.standard_normal((n, p)) + X[:,0] = np.random.choice(range(5), (n,), replace=True) + Y = np.random.standard_normal(n) + + categorical_features = [True] + [False]*(p-1) + + for direction in ['forward', 'backward', 'both']: + strategy = Stepwise.first_peak(X, + direction=direction, + max_features=p, + fixed_features=[2,3], + initial_features=[2,3], + categorical_features=categorical_features) + + step_selector = FeatureSelector(LinearRegression(), + strategy) + + step_selector.fit(X, Y) + +@pytest.mark.skipif(not have_pandas, reason='pandas unavailable') +def test_pandas1(): + + n, p = 100, 5 + X = np.random.standard_normal((n, p)) + Y = np.random.standard_normal(n) + D = pd.DataFrame(X, columns=['A', 'B', 'C', 'D', 'E']) + D['A'] = pd.Categorical(np.random.choice(range(5), (n,), replace=True)) + + for direction in ['forward', 'backward', 'both']: + + strategy = Stepwise.first_peak(D, + direction=direction, + max_features=p, + fixed_features=['A','C'], + initial_features=['A','C']) + + step_selector = FeatureSelector(LinearRegression(), + strategy, + cv=3) + + step_selector.fit(D, Y) + print(step_selector.path_) + +def neg_AIC(linmod, X, Y): + """ + Negative AIC for linear regression model + """ + n, p = X.shape + Yhat = linmod.predict(X) + sigma_MLE = np.linalg.norm(Y - Yhat) / np.sqrt(n) + # adding 2 -- one for sigma^2, one for intercept fit by LinearRegression + return -(n * np.log(2 * np.pi * sigma_MLE**2) + n + 2 * (p + 2)) + +def test_boston_forward(): + """Ran the following R cell + ```{R} + library(MASS) + data(Boston) + M=step(glm(medv ~ 1, data=Boston), list(upper=glm(medv ~ ., data=Boston)), direction='forward', trace=FALSE) + print(AIC(M)) + names(coef(M)[2:12]) + ``` + + Output: + + [1] 3023.726 + [1] "lstat" "rm" "ptratio" "dis" "nox" "chas" "black" + [8] "zn" "crim" "rad" "tax" + """ + + try: + import statsmodels.api as sm + except ImportError as e: + warnings.warn('import of statsmodels failed') + return + + selected_R = ["lstat", "rm", "ptratio", "dis", + "nox", "chas", "black", + "zn", "crim", "rad", "tax"] + + Boston = sm.datasets.get_rdataset('Boston', package='MASS').data + D = Boston.drop('medv', axis=1) + X = D.values + Y = Boston['medv'] + + strategy = Stepwise.first_peak(X, + 
max_features=X.shape[1], + min_features=0, + direction='forward', + parsimonious=False) + + step_selector = FeatureSelector(LinearRegression(), + strategy, + scoring=neg_AIC, + cv=None) + step_selector.fit(X, Y) + + selected_vars = [D.columns[j] for j in step_selector.selected_state_] + + Xsel = X[:,list(step_selector.selected_state_)] + selected_model = LinearRegression().fit(Xsel, Y) + + assert(sorted(selected_vars) == sorted(selected_R)) + assert(np.fabs(neg_AIC(selected_model, Xsel, Y) + 3023.726) < 0.01) + +def test_boston_both(): + """Ran the following R cell + ```{R} + %%R + library(MASS) + data(Boston) + M=step(glm(medv ~ ., data=Boston), list(upper=glm(medv ~ 1, data=Boston)), direction='both', trace=FALSE) + print(AIC(M)) + names(coef(M)[2:12]) + ``` + + Output: + + [1] 3023.726 + [1] "crim" "zn" "chas" "nox" "rm" "dis" "rad" + [8] "tax" "ptratio" "black" "lstat" + + """ + + try: + import statsmodels.api as sm + except ImportError as e: + warnings.warn('import of statsmodels failed') + return + + selected_R = ["crim", "zn", "chas", "nox", "rm", "dis", "rad", + "tax", "ptratio", "black", "lstat"] + + Boston = sm.datasets.get_rdataset('Boston', package='MASS').data + D = Boston.drop('medv', axis=1) + X = D.values + Y = Boston['medv'] + + strategy = Stepwise.first_peak(X, + max_features=X.shape[1], + min_features=0, + direction='both', + parsimonious=False) + + step_selector = FeatureSelector(LinearRegression(), + strategy, + scoring=neg_AIC, + cv=None) + step_selector.fit(X, Y) + + selected_vars = [D.columns[j] for j in step_selector.selected_state_] + + Xsel = X[:,list(step_selector.selected_state_)] + selected_model = LinearRegression().fit(Xsel, Y) + + assert(sorted(selected_vars) == sorted(selected_R)) + assert(np.fabs(neg_AIC(selected_model, Xsel, Y) + 3023.726) < 0.01) + +def test_boston_back(): + """Ran the following R cell + ```{R} + %%R + library(MASS) + data(Boston) + M=step(glm(medv ~ ., data=Boston), list(upper=glm(medv ~ 1, data=Boston)), direction='back', trace=FALSE) + print(AIC(M)) + names(coef(M)[2:12]) + ``` + + Output: + + [1] 3023.726 + [1] "crim" "zn" "chas" "nox" "rm" "dis" "rad" + [8] "tax" "ptratio" "black" "lstat" + + """ + + try: + import statsmodels.api as sm + except ImportError as e: + warnings.warn('import of statsmodels failed') + return + + selected_R = ["crim", "zn", "chas", "nox", "rm", "dis", "rad", + "tax", "ptratio", "black", "lstat"] + + Boston = sm.datasets.get_rdataset('Boston', package='MASS').data + D = Boston.drop('medv', axis=1) + X = D.values + Y = Boston['medv'] + + strategy = Stepwise.first_peak(X, + max_features=X.shape[1], + direction='backward', + initial_features=list(range(X.shape[1])), + parsimonious=False) + + step_selector = FeatureSelector(LinearRegression(), + strategy, + scoring=neg_AIC, + cv=None) + step_selector.fit(X, Y) + + selected_vars = [D.columns[j] for j in step_selector.selected_state_] + + Xsel = X[:,list(step_selector.selected_state_)] + selected_model = LinearRegression().fit(Xsel, Y) + + assert(sorted(selected_vars) == sorted(selected_R)) + assert(np.fabs(neg_AIC(selected_model, Xsel, Y) + 3023.726) < 0.01) + +def test_boston_forward_3step(): + """Ran the following R cell + ```{R} + library(MASS) + data(Boston) + M=step(glm(medv ~ 1, data=Boston), list(upper=glm(medv ~ ., data=Boston)), direction='forward', trace=FALSE, steps=3) + print(AIC(M)) + names(coef(M)[2:4]) + ``` + + Output: + + [1] 3116.097 + [1] "lstat" "rm" "ptratio" + + """ + + try: + import statsmodels.api as sm + except ImportError as e: + 
warnings.warn('import of statsmodels failed') + return + + selected_R = ["lstat", "rm", "ptratio"] + + Boston = sm.datasets.get_rdataset('Boston', package='MASS').data + D = Boston.drop('medv', axis=1) + X = D.values + Y = Boston['medv'] + + strategy = Stepwise.first_peak(X, + max_features=3, + min_features=3, + initial_features=[], + direction='forward', + parsimonious=False) + + step_selector = FeatureSelector(LinearRegression(), + strategy, + scoring=neg_AIC, + cv=None) + step_selector.fit(X, Y) + selected_vars = [D.columns[j] for j in step_selector.selected_state_] + + strategy2 = Stepwise.fixed_size(X, + 3, + max_features=X.shape[1], + min_features=0, + initial_features=[], + direction='forward') + step_selector2 = FeatureSelector(LinearRegression(), + strategy2, + scoring=neg_AIC, + cv=None) + step_selector2.fit(X, Y) + selected_vars2 = [D.columns[j] for j in step_selector2.selected_state_] + print(selected_vars, selected_vars2) + + Xsel = X[:,list(step_selector.selected_state_)] + selected_model = LinearRegression().fit(Xsel, Y) + + assert(sorted(selected_vars) == sorted(selected_R)) + assert(sorted(selected_vars2) == sorted(selected_R)) + assert(np.fabs(neg_AIC(selected_model, Xsel, Y) + 3116.097) < 0.01) +
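# Hypothetical usage sketch (not part of the patch) combining the pieces the
# tests above exercise: forward stepwise search that stops at exactly three
# features, scored by the neg_AIC callable defined above with cv=None.
import numpy as np
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection.generic_selector import FeatureSelector
from mlxtend.feature_selection.strategy import Stepwise

X = np.random.standard_normal((100, 8))
Y = np.random.standard_normal(100)

strategy = Stepwise.fixed_size(X, 3,
                               direction='forward',
                               min_features=0,
                               max_features=X.shape[1],
                               initial_features=[])
selector = FeatureSelector(LinearRegression(), strategy,
                           scoring=neg_AIC, cv=None)
selector.fit(X, Y)
print(selector.selected_state_)   # a 3-column subset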