
Commit 1318faa

DOC unsupervised init
1 parent e0961f8 commit 1318faa

3 files changed: +99 -10 lines changed

doc/conf.py

Lines changed: 1 addition & 0 deletions
@@ -42,6 +42,7 @@
     "sphinx.ext.intersphinx",
     "sphinx_gallery.gen_gallery",
     "sphinx_design",
+    "matplotlib.sphinxext.plot_directive",
 ]

 # List of patterns, relative to source directory, that match files and
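This one-line change registers matplotlib's plot directive, which renders inline matplotlib code as figures in the built docs; it is what makes the `.. plot::` block added to doc/unsupervised.rst below work. A minimal, hypothetical usage sketch (the plotted data is illustrative, not from the commit):

.. plot::
   :align: center

   import matplotlib.pyplot as plt
   plt.plot([0, 1, 2], [0, 1, 4])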

doc/unsupervised.rst

Lines changed: 73 additions & 1 deletion
@@ -6,4 +6,76 @@
 Unsupervised feature selection
 ==============================

-We can use :class:`FastCan` to do unsupervised feature selection.
+We can use :class:`FastCan` to do unsupervised feature selection.
+The unsupervised application of :class:`FastCan` tries to select features that
+maximize the sum of the squared canonical correlation coefficients (SSC) with
+the principal components (PCs) obtained from a PCA (principal component
+analysis) of the feature matrix :math:`X`.
+
+>>> from sklearn.decomposition import PCA
+>>> from sklearn import datasets
+>>> from fastcan import FastCan
+>>> iris = datasets.load_iris()
+>>> X = iris["data"]
+>>> y = iris["target"]
+>>> f_names = iris["feature_names"]
+>>> t_names = iris["target_names"]
+>>> pca = PCA(n_components=2)
+>>> X_pcs = pca.fit_transform(X)
+>>> selector = FastCan(n_features_to_select=2, verbose=0)
+>>> _ = selector.fit(X, X_pcs[:, :2])
+>>> selector.indices_
+array([2, 1], dtype=int32)
+
+.. note::
+    There is no guarantee that this unsupervised :class:`FastCan` will select
+    the optimal subset of features, i.e. the subset with the highest SSC with
+    the PCs, because :class:`FastCan` selects features in a greedy manner,
+    which may lead to suboptimal results. See the following plots.
+
+.. plot::
+    :context: close-figs
+    :align: center
+
+    from itertools import combinations
+
+    import matplotlib.pyplot as plt
+    import numpy as np
+    from sklearn.cross_decomposition import CCA
+
+    def ssc(X, y):
+        """Sum of the squared canonical correlation coefficients.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Feature matrix.
+
+        y : array-like of shape (n_samples, n_outputs)
+            Target matrix.
+
+        Returns
+        -------
+        ssc : float
+            Sum of the squared canonical correlation coefficients.
+        """
+        n_components = min(X.shape[1], y.shape[1])
+        cca = CCA(n_components=n_components)
+        X_c, y_c = cca.fit_transform(X, y)
+        corrcoef = np.diagonal(
+            np.corrcoef(X_c, y_c, rowvar=False),
+            offset=n_components,
+        )
+        return sum(corrcoef**2)
+
+    # X, y, f_names, t_names, and X_pcs are expected to be available
+    # from the example above.
+    comb = list(combinations([0, 1, 2, 3], 2))
+    fig, axs = plt.subplots(ncols=3, nrows=2, figsize=(8, 6), layout="constrained")
+    for i in range(2):
+        for j in range(3):
+            f1_idx = comb[i*3+j][0]
+            f2_idx = comb[i*3+j][1]
+            score = ssc(X[:, [f1_idx, f2_idx]], X_pcs)
+            scatter = axs[i, j].scatter(X[:, f1_idx], X[:, f2_idx], c=y)
+            axs[i, j].set(xlabel=f_names[f1_idx], ylabel=f_names[f2_idx])
+            axs[i, j].set_title(f"SSC: {score:.3f}")
+    for spine in axs[1, 0].spines.values():
+        spine.set_edgecolor("red")
+    _ = axs[1, 2].legend(scatter.legend_elements()[0], t_names, loc="lower right")

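Since the note above claims greedy selection can be suboptimal, one way to check that claim on the iris example is to score every two-feature subset exhaustively and compare it with the greedy pick; the SSC of a candidate subset is sum(r_i**2) over its canonical correlation coefficients r_i, which is exactly what ssc computes. Below is a minimal sketch, not part of the commit: it assumes fastcan and scikit-learn are installed, that FastCan.fit follows the scikit-learn convention of returning the estimator, and it re-defines the ssc helper from the diff.

from itertools import combinations

import numpy as np
from sklearn import datasets
from sklearn.cross_decomposition import CCA
from sklearn.decomposition import PCA

from fastcan import FastCan


def ssc(X, y):
    # Same computation as the ssc helper added in this commit.
    n_components = min(X.shape[1], y.shape[1])
    X_c, y_c = CCA(n_components=n_components).fit_transform(X, y)
    corrcoef = np.diagonal(np.corrcoef(X_c, y_c, rowvar=False), offset=n_components)
    return sum(corrcoef**2)


X = datasets.load_iris()["data"]
X_pcs = PCA(n_components=2).fit_transform(X)

# Greedy selection of two features against the two leading PCs.
greedy = FastCan(n_features_to_select=2, verbose=0).fit(X, X_pcs).indices_

# Exhaustive search over all C(4, 2) = 6 two-feature subsets.
best = max(
    combinations(range(X.shape[1]), 2),
    key=lambda idx: ssc(X[:, list(idx)], X_pcs),
)

print(sorted(greedy), ssc(X[:, sorted(greedy)], X_pcs))
print(sorted(best), ssc(X[:, list(best)], X_pcs))

If the two subsets or their SSC values differ, the greedy path stopped short of the optimum, which is what the plots in the diff illustrate.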
examples/plot_speed.py

Lines changed: 25 additions & 9 deletions
@@ -39,6 +39,30 @@
 
 from sklearn.cross_decomposition import CCA
 
+def ssc(X, y):
+    """Sum of the squared canonical correlation coefficients.
+
+    Parameters
+    ----------
+    X : array-like of shape (n_samples, n_features)
+        Feature matrix.
+
+    y : array-like of shape (n_samples, n_outputs)
+        Target matrix.
+
+    Returns
+    -------
+    ssc : float
+        Sum of the squared canonical correlation coefficients.
+    """
+    n_components = min(X.shape[1], y.shape[1])
+    cca = CCA(n_components=n_components)
+    X_c, y_c = cca.fit_transform(X, y)
+    corrcoef = np.diagonal(
+        np.corrcoef(X_c, y_c, rowvar=False),
+        offset=n_components,
+    )
+    return sum(corrcoef**2)
+
 
 def baseline(X, y, t):
     """Baseline method using CCA from sklearn.
@@ -64,24 +88,16 @@ def baseline(X, y, t):
     the scores correspond to the feature selection process.
     """
     n_samples, n_features = X.shape
-    n_targets = y.shape[1]
     mask = np.zeros(n_features, dtype=bool)
     r2 = np.zeros(n_features, dtype=float)
     indices = np.zeros(t, dtype=int)
     scores = np.zeros(t, dtype=float)
     X_selected = np.zeros((n_samples, 0), dtype=float)
     for i in range(t):
-        n_components = min(i+1, n_targets)
-        cca = CCA(n_components=n_components)
         for j in range(n_features):
             if not mask[j]:
                 X_candidate = np.column_stack((X_selected, X[:, j]))
-                X_c, y_c = cca.fit_transform(X_candidate, y)
-                corrcoef = np.diagonal(
-                    np.corrcoef(X_c, y_c, rowvar=False),
-                    offset=n_components
-                )
-                r2[j] = sum(corrcoef**2)
+                r2[j] = ssc(X_candidate, y)
         d = np.argmax(r2)
         indices[i] = d
         scores[i] = r2[d]
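As a quick sanity check on the refactored helper (a sketch, not part of the commit, re-defining ssc as in the diff above): with a single feature and a single target, the sum of squared canonical correlations should reduce to the squared Pearson correlation coefficient.

import numpy as np
from sklearn.cross_decomposition import CCA


def ssc(X, y):
    # Same computation as the ssc helper in the diff above.
    n_components = min(X.shape[1], y.shape[1])
    X_c, y_c = CCA(n_components=n_components).fit_transform(X, y)
    corrcoef = np.diagonal(np.corrcoef(X_c, y_c, rowvar=False), offset=n_components)
    return sum(corrcoef**2)


rng = np.random.default_rng(0)
x = rng.normal(size=(100, 1))
y = 2.0 * x + rng.normal(scale=0.5, size=(100, 1))

# One feature vs. one target: the canonical correlation is |Pearson r|.
r = np.corrcoef(x.ravel(), y.ravel())[0, 1]
print(np.isclose(ssc(x, y), r**2))  # expected: True, up to CCA's tolerance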
