
Commit 36a9c99

DOC add plot_intuitive
1 parent c1abc20 commit 36a9c99

File tree

10 files changed: +368 -198 lines changed


README.rst

Lines changed: 5 additions & 3 deletions
@@ -33,11 +33,13 @@ FastCan is a feature selection method, which has the following advantages:

 #. Extremely **fast**. See :ref:`sphx_glr_auto_examples_plot_speed.py`.

-#. Support unsupervised feature selection.
+#. Support unsupervised feature selection. See :ref:`Unsupervised feature selection <unsupervised>`.

-#. Support multioutput feature selection.
+#. Support multioutput feature selection. See :ref:`Multioutput feature selection <multioutput>`.

-#. Skip redundant features.
+#. Skip redundant features. See :ref:`Feature redundancy <redundancy>`.
+
+#. Evaluate relative usefulness of features. See :ref:`sphx_glr_auto_examples_plot_intuitive.py`.


 Installation

doc/conf.py

Lines changed: 0 additions & 1 deletion
@@ -42,7 +42,6 @@
     "sphinx.ext.intersphinx",
     "sphinx_gallery.gen_gallery",
     "sphinx_design",
-    "matplotlib.sphinxext.plot_directive",
 ]

 # List of patterns, relative to source directory, that match files and

doc/index.rst

Lines changed: 1 addition & 0 deletions
@@ -19,6 +19,7 @@ API Reference

     FastCan
     ssc
+    ols

 Useful Links
 ------------

doc/intuitive.rst

Lines changed: 0 additions & 16 deletions
This file was deleted.

doc/user_guide.rst

Lines changed: 0 additions & 1 deletion
@@ -8,7 +8,6 @@ User Guide
    :numbered:
    :maxdepth: 1

-   intuitive.rst
    unsupervised.rst
    multioutput.rst
    redundancy.rst

examples/plot_affinity.py

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 """
 =================
-Affine Invariance
+Affine invariance
 =================

 .. currentmodule:: fastcan

examples/plot_fisher.py

Lines changed: 1 addition & 2 deletions
@@ -23,7 +23,6 @@
 from sklearn import datasets
 from sklearn.preprocessing import OneHotEncoder

-
 X, y = datasets.load_iris(return_X_y=True)
 # drop="first" is necessary, otherwise, the transformed target is not full column rank
 y_enc = OneHotEncoder(
@@ -40,8 +39,8 @@

 import numpy as np
 from scipy import linalg
-from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
 from sklearn.covariance import empirical_covariance
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

 clf = LinearDiscriminantAnalysis(solver="eigen").fit(X, y)
 Sw = clf.covariance_  # within scatter

examples/plot_intuitive.py

Lines changed: 190 additions & 0 deletions
@@ -0,0 +1,190 @@
"""
=====================
Intuitive explanation
=====================

.. currentmodule:: fastcan

Let's intuitively understand the two methods, h-correlation and eta-cosine,
in :class:`FastCan`.
"""

# Authors: Sikai Zhang
# SPDX-License-Identifier: MIT

# %%
# Select the first feature
# ------------------------
# For feature selection, it is normally easy to define a criterion to evaluate a
# feature's usefulness, but it is hard to compute the amount of redundancy between
# a new feature and many selected features. Here we use the ``diabetes`` dataset,
# which has 10 features, as an example. If we use the R-squared between a feature
# (transformed to the predicted target by a linear regression model) and the
# target to describe its usefulness, the results are shown in the following
# figure. It can be seen that Feature 2 is the most useful and Feature 8 is the
# second. However, does that mean that the total usefulness of Feature 2 +
# Feature 8 is the sum of their R-squared scores? Probably not, because there may
# be redundancy between Feature 2 and Feature 8. Actually, what we want is a kind
# of usefulness score which has the **superposition** property, so that the
# usefulness of each feature can be added together without redundancy.

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Patch
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression

from fastcan import FastCan

plt.rcParams["axes.spines.right"] = False
plt.rcParams["axes.spines.top"] = False


def get_r2(feats, target, feats_selected=None):
    """Get R-squared between [feats_selected, feat_i] and target."""
    n_samples, n_features = feats.shape
    if feats_selected is None:
        feats_selected = np.zeros((n_samples, 0))

    lr = LinearRegression()
    r2 = np.zeros(n_features)
    for i in range(n_features):
        feats_i = np.column_stack((feats_selected, feats[:, i]))
        r2[i] = lr.fit(feats_i, target).score(feats_i, target)
    return r2


def plot_bars(ids, r2_left, r2_selected):
    """Plot the relative R-squared scores as a horizontal bar plot."""
    legend_selected = Patch(color="tab:green", label="X_selected")
    legend_cand = Patch(color="tab:blue", label="x_i: candidates")
    legend_best = Patch(color="tab:orange", label="Best candidate")
    n_features = len(ids)
    n_selected = len(r2_selected)

    # Left edges of the bars: the selected features stack cumulatively, while
    # every candidate starts at the total score of the selected set (the dashed line).
    left = np.zeros(n_features) + sum(r2_selected)
    left_selected = np.r_[0, np.cumsum(r2_selected)][:-1]
    left[:n_selected] = left_selected

    label = [""] * n_features
    label[np.argmax(r2_left) + n_selected] = f"{max(r2_left):.5f}"

    colors = ["tab:blue"] * (n_features - n_selected)
    colors[np.argmax(r2_left)] = "tab:orange"
    colors = ["tab:green"] * n_selected + colors

    hbars = plt.barh(ids, width=np.r_[r2_selected, r2_left], color=colors, left=left)
    plt.axvline(x=sum(r2_selected), color="tab:orange", linestyle="--")
    plt.bar_label(hbars, label)
    plt.yticks(np.arange(n_features))
    plt.xlabel("R-squared between [X_selected, x_i] and y")
    plt.ylabel("Feature index")
    plt.legend(handles=[legend_selected, legend_cand, legend_best])
    plt.show()


X, y = load_diabetes(return_X_y=True)

id_left = np.arange(X.shape[1])
id_selected = []
score_selected = []

score_0 = get_r2(X, y)

plot_bars(id_left, score_0, score_selected)


# %%
# Select the second feature
# -------------------------
# Let's compute the R-squared between Feature 2 + Feature i and the target, which
# is shown in the figure below. The bars at the right-hand side (RHS) of the
# dashed line are the additional R-squared scores on top of the score of
# Feature 2, which we call the **relative** usefulness to Feature 2. It can also
# be seen that the bar of Feature 8 in this figure is much shorter than its bar
# in the previous figure, because the redundancy between Feature 2 and Feature 8
# has been removed. Therefore, these bars at the RHS can serve as the desired
# usefulness score with the **superposition** property.

index = np.argmax(score_0)
id_selected += [id_left[index]]
score_selected += [score_0[index]]
id_left = np.delete(id_left, index)
score_1 = get_r2(X[:, id_left], y, X[:, id_selected]) - sum(score_selected)

plot_bars(np.r_[id_selected, id_left], score_1, score_selected)


# %%
# Select the third feature
# ------------------------
# Again, let's compute the R-squared between Feature 2 + Feature 8 + Feature i
# and the target; the additional R-squared contributed by each of the remaining
# features is shown in the following figure. It can be seen that after selecting
# Features 2 and 8, the remaining features can provide only a very limited
# contribution.

index = np.argmax(score_1)
id_selected += [id_left[index]]
score_selected += [score_1[index]]
id_left = np.delete(id_left, index)
score_2 = get_r2(X[:, id_left], y, X[:, id_selected]) - sum(score_selected)

plot_bars(np.r_[id_selected, id_left], score_2, score_selected)


# %%
# h-correlation and eta-cosine
# ----------------------------
# ``h-correlation`` is a fast way to compute the values of the bars at the RHS of
# the dashed lines. The fast computational speed is achieved by orthogonalization,
# which removes the redundancy between the features. We first orthogonalize the
# remaining features against the selected features and then compute their
# additional R-squared values. ``eta-cosine`` uses a similar idea, but has an
# additional preprocessing step to compress the features
# :math:`X \in \mathbb{R}^{N\times n}` and the target
# :math:`Y \in \mathbb{R}^{N\times m}` to :math:`X_c \in \mathbb{R}^{(m+n)\times n}`
# and :math:`Y_c \in \mathbb{R}^{(m+n)\times m}`.

scores = FastCan(3, verbose=0).fit(X, y).scores_

print(f"First selected feature's score: {scores[0]:.5f}")
print(f"Second selected feature's score: {scores[1]:.5f}")
print(f"Third selected feature's score: {scores[2]:.5f}")

# %%
# Relative usefulness
# -------------------
# The idea of relative usefulness can be very helpful when we want to evaluate
# features based on some prior knowledge. For example, suppose we have some
# magnetic impedance spectroscopy (MIS) features of cervix tissue in pregnant
# women and we want to evaluate the usefulness of these features for predicting
# spontaneous preterm birth (sPTB). The prior knowledge is that cervical length
# (CL) and quantitative fetal fibronectin (fFN) are effective risk factors for
# sPTB, so the redundancy between CL + fFN and the MIS features should be
# avoided. Therefore, the relative usefulness of the MIS features to CL and fFN
# should be computed. We can use the argument ``indices_include`` to compute the
# relative usefulness. Using the ``diabetes`` dataset as an example, and assuming
# the prior knowledge is that Feature 3 is very important, the relative
# usefulness of the remaining features to Feature 3 is given in the figure below,
# which is the same as the result from :class:`FastCan`.

index = 3
id_selected = [index]
score_selected = [score_0[index]]
id_left = np.arange(X.shape[1])
id_left = np.delete(id_left, index)
score_1_7 = get_r2(X[:, id_left], y, X[:, id_selected]) - sum(score_selected)

plot_bars(np.r_[id_selected, id_left], score_1_7, score_selected)

scores = FastCan(2, indices_include=[3], verbose=0).fit(X, y).scores_

print(f"First selected feature's score: {scores[0]:.5f}")
print(f"Second selected feature's score: {scores[1]:.5f}")
