Commit 06d2f23

DOC add plot_affinity

1 parent 7075d7a commit 06d2f23
File tree

8 files changed
+668 -703 lines changed

doc/ols_and_omp.rst

Lines changed: 12 additions & 10 deletions

@@ -14,28 +14,30 @@ Here, let's briefly compare the three methods.
 
 Assume we have a feature matrix :math:`X_s \in \mathbb{R}^{N\times t}`, which contains
 :math:`t` selected features, and a target vector :math:`y \in \mathbb{R}^{N\times 1}`.
-Then the residual :math:`r` of the least-squares can be found by
+Then the residual :math:`r \in \mathbb{R}^{N\times 1}` of the least-squares can be
+found by
 
 .. math::
     r = y - X_s \beta \;\; \text{where} \;\; \beta = (X_s^\top X_s)^{-1}X_s^\top y
 
-When evaluating a new feature :math:`x_i`
+When evaluating a new candidate feature :math:`x_i \in \mathbb{R}^{N\times 1}`
 
-* for OMP, the feature which maximizes :math:`r^\top x_i` will be selected
+* for OMP, the feature which maximizes :math:`r^\top x_i` will be selected,
 * for OLS, the feature which maximizes :math:`r^\top w_i` will be selected, where
-  :math:`w_i` is the projection of :math:`x_i` on the orthogonal subspace so that it is
-  orthogonal to :math:`X_s`, i.e., :math:`X_s^\top w_i = \mathbf{0} \in \mathbb{R}^{N}`
+  :math:`w_i \in \mathbb{R}^{N\times 1}` is the projection of :math:`x_i` on the
+  orthogonal subspace so that it is orthogonal to :math:`X_s`, i.e.,
+  :math:`X_s^\top w_i = \mathbf{0} \in \mathbb{R}^{t\times 1}`,
 * for :class:`FastCan` (h-correlation algorithm), it is almost the same as OLS, but the
   difference is that in :class:`FastCan`, :math:`X_s`, :math:`y`, and :math:`x_i`
   are centered (i.e., zero mean in each column) before the selection.
 
-The small change makes the feature ranking criterion of :class:`FastCan` is equivalent
-to the sum of squared canonical correlation coefficients, which gives it the following
-advantages over OLS and OMP:
+The small difference makes the feature ranking criterion of :class:`FastCan`
+equivalent to the sum of squared canonical correlation coefficients, which gives
+it the following advantages over OLS and OMP:
 
-* Affine invariant: if features are polluted by affine transformation, i.e., scaled
+* Affine invariance: if features are polluted by an affine transformation, i.e., scaled
   and/or shifted by some constants, the selection result given by :class:`FastCan` will be
-  unchanged.
+  unchanged. See :ref:`sphx_glr_auto_examples_plot_affinity.py`.
 * Multioutput: as :class:`FastCan` uses canonical correlation for feature ranking, it
   naturally supports feature selection on datasets with multiple outputs.
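As a side note for the reader (not part of this commit), one ranking step of the OMP and OLS criteria above can be made concrete with a minimal NumPy sketch. The data and shapes below are invented for illustration, and the candidate is unit-normalized, mirroring the normalization used in fastcan's ``ols`` helper further down in this diff.

# Minimal sketch of one OMP/OLS ranking step (illustration only, not library code).
import numpy as np

rng = np.random.default_rng(0)
N, t = 50, 3
X_s = rng.standard_normal((N, t))   # t already-selected features
y = rng.standard_normal(N)          # target
x_i = rng.standard_normal(N)        # candidate feature
x_i /= np.linalg.norm(x_i)          # unit-normalized, as in fastcan's ols()

# Residual of least squares on the selected features:
# r = y - X_s @ beta, with beta = (X_s^T X_s)^{-1} X_s^T y
beta, *_ = np.linalg.lstsq(X_s, y, rcond=None)
r = y - X_s @ beta

# OMP scores the raw candidate against the residual.
omp_score = r @ x_i

# OLS scores the component of x_i orthogonal to X_s (unit-normalized),
# so that X_s.T @ w_i is numerically zero.
w_i = x_i - X_s @ np.linalg.pinv(X_s) @ x_i
w_i /= np.linalg.norm(w_i)
ols_score = r @ w_i

print(omp_score, ols_score)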

examples/plot_affinity.py

Lines changed: 105 additions & 0 deletions (new file)

@@ -0,0 +1,105 @@
"""
=================
Affine Invariance
=================

.. currentmodule:: fastcan

In this example, we will compare the robustness of the three feature
selection methods on affine transformed features.
"""

# Authors: Sikai Zhang
# SPDX-License-Identifier: MIT

# %%
# Initialize test
# ---------------
# The three feature selection methods, i.e., OMP, OLS, and :class:`FastCan`,
# will select three features from the 10 features of the `diabetes` dataset. It can
# be seen that the three methods select the same features.

import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.linear_model import OrthogonalMatchingPursuit

from fastcan import FastCan, ols

X, y = load_diabetes(return_X_y=True)

n_selected = 3
omp_selector = OrthogonalMatchingPursuit(n_nonzero_coefs=n_selected)
fastcan_selector = FastCan(n_features_to_select=n_selected, verbose=0)
(ids_omp,) = omp_selector.fit(X, y).coef_.nonzero()
ids_ols, _ = ols(X, y, n_selected)
ids_fastcan = fastcan_selector.fit(X, y).indices_

print("Indices of features selected by:")
print("OMP: ", np.sort(ids_omp))
print("OLS: ", np.sort(ids_ols))
print("FastCan: ", np.sort(ids_fastcan))


# %%
# Affine transformation
# ---------------------
# In this test, the 10 features of the ``diabetes`` dataset will be randomly polluted
# by an affine transformation. The three feature selection methods will select
# three features from the polluted features. The more stable the result, the better.


n_features = X.shape[1]
rng = np.random.default_rng()

ids_omp_all = []
ids_ols_all = []
ids_fastcan_all = []
for i in range(10):
    X_affine = X @ np.diag(rng.random(n_features)) + rng.random(n_features)

    (ids_omp,) = omp_selector.fit(X_affine, y).coef_.nonzero()
    ids_ols, _ = ols(X_affine, y, n_selected)
    ids_fastcan = fastcan_selector.fit(X_affine, y).indices_
    ids_omp_all += ids_omp.tolist()
    ids_ols_all += ids_ols.tolist()
    ids_fastcan_all += ids_fastcan.tolist()

# %%
# Plot results
# ------------
# It can be seen that only :class:`FastCan` gives robust results when the features
# are polluted by the affine transformation.

import matplotlib.pyplot as plt

bin_lims = np.arange(n_features + 1)
counts_omp, _ = np.histogram(ids_omp_all, bins=bin_lims)
counts_ols, _ = np.histogram(ids_ols_all, bins=bin_lims)
counts_fastcan, _ = np.histogram(ids_fastcan_all, bins=bin_lims)

fig, axs = plt.subplots(1, 3, figsize=(8, 3))

axs[0].bar(bin_lims[:-1], counts_omp)
axs[0].set_xticks(bin_lims[:-1])
axs[0].set_ylim((0, 11))
axs[0].set_title("OMP")
axs[0].set_xlabel("Feature Index")
axs[0].set_ylabel("Count of Selected Times")


axs[1].bar(bin_lims[:-1], counts_ols)
axs[1].set_xticks(bin_lims[:-1])
axs[1].set_ylim((0, 11))
axs[1].set_title("OLS")
axs[1].set_xlabel("Feature Index")

axs[2].bar(bin_lims[:-1], counts_fastcan)
axs[2].set_xticks(bin_lims[:-1])
axs[2].set_ylim((0, 11))
axs[2].set_title("FastCan")
axs[2].set_xlabel("Feature Index")

plt.tight_layout()
plt.show()

examples/plot_ols_omp.py

Lines changed: 0 additions & 16 deletions
This file was deleted.

examples/plot_redundancy.py

Lines changed: 6 additions & 4 deletions

@@ -3,6 +3,8 @@
 Feature selection performance on redundant features
 ===================================================
 
+.. currentmodule:: fastcan
+
 In this example, we will compare the performance of feature selectors on
 datasets which contain redundant features.
 Here four types of features should be distinguished:
@@ -88,7 +90,7 @@ def make_redundant(
 # ---------------------
 # This function is used to compute the number of correct features missed by selectors.
 #
-# * For independent informative features, selectors should select all of them
+# * For independent informative features, selectors should select all of them.
 # * For dependent informative features, selectors only need to select any
 #   ``n_dep_info``-combination of the set ``dep_info_ids`` + ``redundant_ids``. That
 #   means if the indices of dependent informative features are :math:`[0, 1]` and the
@@ -114,13 +116,13 @@ def get_n_missed(
 # %%
 # Prepare selectors
 # -----------------
-# We compare :class:`fastcan.FastCan` with eight selectors of :mod:`sklearn`, which
+# We compare :class:`FastCan` with eight selectors of :mod:`sklearn`, which
 # include the Select From a Model (SFM) algorithm, the Recursive Feature Elimination
 # (RFE) algorithm, the Sequential Feature Selection (SFS) algorithm, and the Select K
 # Best (SKB) algorithm.
 # The list of the selectors is given below:
 #
-# * fastcan: :class:`fastcan.FastCan` selector
+# * fastcan: :class:`FastCan` selector
 # * skb_reg: the SKB algorithm ranking features with ANOVA (analysis of variance)
 #   F-statistic and p-values
 # * skb_mir: the SKB algorithm ranking features by mutual information for regression
@@ -197,7 +199,7 @@ def get_n_missed(
 # %%
 # Plot results
 # ------------
-# :class:`fastcan.FastCan` correctly selects all informative features with zero missed
+# :class:`FastCan` correctly selects all informative features with zero missed
 # features.
 
 import matplotlib.pyplot as plt
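Purely as an illustration of the combination rule described in the hunk above (a hypothetical helper, not the repository's ``get_n_missed``), a selection covers the dependent informative group if it contains at least one valid combination; the names and example indices below are made up.

# Hypothetical sketch: does a selection cover some n_dep_info-sized
# combination of dep_info_ids + redundant_ids?
from itertools import combinations

def covers_dependent(selected_ids, dep_info_ids, redundant_ids, n_dep_info):
    pool = set(dep_info_ids) | set(redundant_ids)
    return any(set(combo) <= set(selected_ids)
               for combo in combinations(pool, n_dep_info))

# e.g. with dep_info_ids=[0, 1], redundant_ids=[5], and n_dep_info=2,
# selecting {0, 5}, {1, 5}, or {0, 1} would all count as correct.
print(covers_dependent({0, 5}, [0, 1], [5], 2))  # True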

examples/plot_speed.py

Lines changed: 4 additions & 2 deletions

@@ -3,9 +3,11 @@
 Computational speed comparison
 ==============================
 
+.. currentmodule:: fastcan
+
 In this example, we will compare the computational speed of three different feature
-selection methods: h-correlation based :class:`fastcan.FastCan`, eta-cosine based
-:class:`fastcan.FastCan`, and baseline model based on
+selection methods: h-correlation based :class:`FastCan`, eta-cosine based
+:class:`FastCan`, and a baseline model based on
 ``sklearn.cross_decomposition.CCA``.
 
 """

fastcan/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -3,7 +3,7 @@
 """
 
 from ._fastcan import FastCan
-from ._utils import ssc, ols
+from ._utils import ols, ssc
 
 __all__ = [
     "FastCan",

fastcan/_utils.py

Lines changed: 3 additions & 5 deletions

@@ -81,9 +81,7 @@ def ols(X, y, t=1):
         The scores of selected features. The order of
         the scores corresponds to the feature selection process.
     """
-    X, y = check_X_y(
-        X, y, dtype=float, ensure_2d=True
-    )
+    X, y = check_X_y(X, y, dtype=float, ensure_2d=True)
     n_features = X.shape[1]
     w = X / np.linalg.norm(X, axis=0)
     v = y / np.linalg.norm(y)
@@ -100,11 +98,11 @@ def ols(X, y, t=1):
         d = np.argmax(r2)
         indices[i] = d
         scores[i] = r2[d]
-        if i == t-1:
+        if i == t - 1:
             return indices, scores
         mask[d] = True
         r2[d] = 0
         for j in range(n_features):
             if not mask[j]:
-                w[:, j] = w[:, j] - w[:, d]*(w[:, d] @ w[:, j])
+                w[:, j] = w[:, j] - w[:, d] * (w[:, d] @ w[:, j])
                 w[:, j] /= np.linalg.norm(w[:, j], axis=0)
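For orientation (again not part of the diff itself), a minimal usage sketch of the ``ols`` helper whose internals are touched above; it assumes the diabetes dataset used in the new example and the ``ols(X, y, t)`` signature returning ``(indices, scores)`` shown in this file.

# Quick usage sketch of fastcan.ols (signature as shown in the diff above).
import numpy as np
from sklearn.datasets import load_diabetes

from fastcan import ols

X, y = load_diabetes(return_X_y=True)
indices, scores = ols(X, y, t=3)   # greedily select 3 features
print("selected:", np.sort(indices))
print("scores:  ", scores)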
