"""
===================================================
Feature selection performance on redundant features
===================================================

In this example, we compare the performance of feature selectors on datasets
that contain redundant features.
Four types of features should be distinguished:

* Useless features: features that do not contribute to the target
* Dependent informative features: features that contribute to the target and
  from which the redundant features are constructed
* Redundant features: features constructed by a linear transformation of the
  dependent informative features
* Independent informative features: features that contribute to the target but
  do not contribute to the redundant features

.. note::
    If we did not distinguish dependent from independent informative features,
    and instead used all informative features to form both the target and the
    redundant features, the task would be much easier.
"""

# Authors: Sikai Zhang
# SPDX-License-Identifier: MIT

# %%
# Define dataset generator
# ------------------------

import numpy as np


def make_redundant(
    n_samples,
    n_features,
    dep_info_ids,
    indep_info_ids,
    redundant_ids,
    random_seed,
):
    """Make a dataset with linearly redundant features.

    Parameters
    ----------
    n_samples : int
        The number of samples.

    n_features : int
        The number of features.

    dep_info_ids : list[int]
        The indices of the dependent informative features.

    indep_info_ids : list[int]
        The indices of the independent informative features.

    redundant_ids : list[int]
        The indices of the redundant features.

    random_seed : int
        Random seed.

    Returns
    -------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.

    y : array-like of shape (n_samples,)
        Target vector.
    """
    rng = np.random.default_rng(random_seed)
    info_ids = dep_info_ids + indep_info_ids
    n_dep_info = len(dep_info_ids)
    n_info = len(info_ids)
    n_redundant = len(redundant_ids)
    informative_coef = rng.random(n_info)
    redundant_coef = rng.random((n_dep_info, n_redundant))

    X = rng.random((n_samples, n_features))
    y = np.dot(X[:, info_ids], informative_coef)

    X[:, redundant_ids] = X[:, dep_info_ids] @ redundant_coef
    return X, y

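# %%
# As a quick sanity check (an illustrative sketch added here, not part of the
# original benchmark), we can verify that the redundant columns are exact linear
# combinations of the dependent informative columns: the rank of the stacked
# block equals the number of dependent informative features.

X_chk, _ = make_redundant(
    n_samples=100,
    n_features=10,
    dep_info_ids=[2, 4],
    indep_info_ids=[0],
    redundant_ids=[5],
    random_seed=0,
)
# Column 5 is a linear combination of columns 2 and 4, so the rank is 2.
print(np.linalg.matrix_rank(X_chk[:, [2, 4, 5]]))
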
# %%
# Define score function
# ---------------------
# This function computes the number of informative features missed by a selector.
#
# * For independent informative features, a selector should select all of them.
# * For dependent informative features, a selector only needs to select any
#   ``n_dep_info``-combination of the set ``dep_info_ids`` + ``redundant_ids``. That
#   means if the indices of the dependent informative features are :math:`[0, 1]` and
#   the index of the redundant feature is :math:`[5]`, then the correctly selected
#   indices can be any of :math:`[0, 1]`, :math:`[0, 5]`, and :math:`[1, 5]`, as
#   illustrated after the function definition below.

def get_n_missed(
    dep_info_ids,
    indep_info_ids,
    redundant_ids,
    selected_ids,
):
    """Get the number of informative features missed by the selector."""
    n_redundant = len(redundant_ids)
    n_missed_indep = len(np.setdiff1d(indep_info_ids, selected_ids))
    n_missed_dep = len(
        np.setdiff1d(dep_info_ids + redundant_ids, selected_ids)
    ) - n_redundant
    return n_missed_indep + max(n_missed_dep, 0)

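# %%
# A minimal worked check of the rule above, using the indices from the example
# (these calls are illustrative additions, not part of the original benchmark):

# [1, 5] is a valid 2-combination of {0, 1, 5}, so nothing is missed.
print(get_n_missed([0, 1], [], [5], [1, 5]))  # 0
# Selecting [0] alone covers only one of the required two features.
print(get_n_missed([0, 1], [], [5], [0]))  # 1
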
# %%
# Prepare selectors
# -----------------
# We compare :class:`fastcan.FastCan` with eight selectors from :mod:`sklearn`, which
# are based on the Select From Model (SFM) algorithm, the Recursive Feature
# Elimination (RFE) algorithm, the Sequential Feature Selection (SFS) algorithm, and
# the Select K Best (SKB) algorithm.
# The selectors are listed below:
#
# * fastcan: the :class:`fastcan.FastCan` selector
# * skb_reg: the SKB algorithm ranking features with the ANOVA (analysis of variance)
#   F-statistic and p-values
# * skb_mir: the SKB algorithm ranking features with mutual information for regression
# * sfm_lsvr: the SFM algorithm with a linear support vector regressor
# * sfm_rfr: the SFM algorithm with a random forest regressor
# * rfe_lsvr: the RFE algorithm with a linear support vector regressor
# * rfe_rfr: the RFE algorithm with a random forest regressor
# * sfs_lsvr: the forward SFS algorithm with a linear support vector regressor
# * sfs_rfr: the forward SFS algorithm with a random forest regressor


from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import (
    RFE,
    SelectFromModel,
    SelectKBest,
    SequentialFeatureSelector,
    f_regression,
    mutual_info_regression,
)
from sklearn.svm import LinearSVR

from fastcan import FastCan

lsvr = LinearSVR(C=1, dual="auto", max_iter=100000, random_state=0)
rfr = RandomForestRegressor(n_estimators=10, random_state=0)


N_SAMPLES = 1000
N_FEATURES = 10
DEP_INFO_IDS = [2, 4, 7, 9]
INDEP_INFO_IDS = [0, 1, 6]
REDUNDANT_IDS = [5, 8]
N_SELECTED = len(DEP_INFO_IDS + INDEP_INFO_IDS)
N_REPEATED = 10

selector_dict = {
    "fastcan": FastCan(N_SELECTED, verbose=0),
    "skb_reg": SelectKBest(f_regression, k=N_SELECTED),
    "skb_mir": SelectKBest(mutual_info_regression, k=N_SELECTED),
    "sfm_lsvr": SelectFromModel(lsvr, max_features=N_SELECTED, threshold=-np.inf),
    "sfm_rfr": SelectFromModel(rfr, max_features=N_SELECTED, threshold=-np.inf),
    "rfe_lsvr": RFE(lsvr, n_features_to_select=N_SELECTED, step=1),
    "rfe_rfr": RFE(rfr, n_features_to_select=N_SELECTED, step=1),
    "sfs_lsvr": SequentialFeatureSelector(lsvr, n_features_to_select=N_SELECTED, cv=2),
    "sfs_rfr": SequentialFeatureSelector(rfr, n_features_to_select=N_SELECTED, cv=2),
}

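# %%
# As a quick smoke test (an illustrative addition, not part of the original
# benchmark), we can fit a single selector on one generated dataset and inspect
# the selected indices before running the full comparison.

X_demo, y_demo = make_redundant(
    n_samples=N_SAMPLES,
    n_features=N_FEATURES,
    dep_info_ids=DEP_INFO_IDS,
    indep_info_ids=INDEP_INFO_IDS,
    redundant_ids=REDUNDANT_IDS,
    random_seed=0,
)
print(selector_dict["fastcan"].fit(X_demo, y_demo).get_support(indices=True))
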
# %%
# Run test
# --------

N_SELECTORS = len(selector_dict)
n_missed = np.zeros((N_REPEATED, N_SELECTORS), dtype=int)

for i in range(N_REPEATED):
    X, y = make_redundant(
        n_samples=N_SAMPLES,
        n_features=N_FEATURES,
        dep_info_ids=DEP_INFO_IDS,
        indep_info_ids=INDEP_INFO_IDS,
        redundant_ids=REDUNDANT_IDS,
        random_seed=i,
    )
    for j, selector in enumerate(selector_dict.values()):
        result_ids = selector.fit(X, y).get_support(indices=True)
        n_missed[i, j] = get_n_missed(
            dep_info_ids=DEP_INFO_IDS,
            indep_info_ids=INDEP_INFO_IDS,
            redundant_ids=REDUNDANT_IDS,
            selected_ids=result_ids,
        )

# %%
# Plot results
# ------------
# :class:`fastcan.FastCan` correctly selects all informative features with zero missed
# features.

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 5))
rects = ax.bar(selector_dict.keys(), n_missed.sum(0), width=0.5)
ax.bar_label(rects, n_missed.sum(0), padding=3)
plt.xlabel("Selector")
plt.ylabel("No. of missed features")
plt.title("Performance of selectors on datasets with linearly redundant features")
plt.show()