
Commit 7075d7a

DOC add ols_omp init
1 parent 11c7c79 commit 7075d7a

File tree

11 files changed, +614 -329 lines


doc/ols_and_omp.rst

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
.. currentmodule:: fastcan

.. _ols_omp:

===========================
Comparison with OLS and OMP
===========================

:class:`FastCan` has a close relationship with Orthogonal Least Squares (OLS) [1]_
and Orthogonal Matching Pursuit (OMP) [2]_.
The detailed difference between OLS and OMP can be found in [3]_.
Here, let's briefly compare the three methods.

Assume we have a feature matrix :math:`X_s \in \mathbb{R}^{N\times t}`, which contains
:math:`t` selected features, and a target vector :math:`y \in \mathbb{R}^{N\times 1}`.
Then the residual :math:`r` of the least-squares fit can be found by

.. math::
    r = y - X_s \beta \;\; \text{where} \;\; \beta = (X_s^\top X_s)^{-1}X_s^\top y

When evaluating a new feature :math:`x_i` (see the sketch after this list):

* for OMP, the feature which maximizes :math:`|r^\top x_i|` will be selected
* for OLS, the feature which maximizes :math:`|r^\top w_i|` will be selected, where
  :math:`w_i` is the projection of :math:`x_i` on the orthogonal subspace, so that it
  is orthogonal to :math:`X_s`, i.e., :math:`X_s^\top w_i = \mathbf{0} \in \mathbb{R}^{t}`
* for :class:`FastCan` (h-correlation algorithm), the criterion is almost the same as
  OLS; the difference is that in :class:`FastCan`, :math:`X_s`, :math:`y`, and
  :math:`x_i` are centered (i.e., zero mean in each column) before the selection.
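
To make the contrast concrete, here is a minimal NumPy sketch of the OMP and OLS
criteria; ``X_s``, ``x_i``, and ``y`` are random placeholder arrays, and the
projection :math:`w_i` is computed with a QR decomposition for brevity::

    import numpy as np

    rng = np.random.default_rng(0)
    X_s = rng.normal(size=(100, 3))  # t = 3 already-selected features
    x_i = rng.normal(size=100)       # candidate feature
    y = rng.normal(size=100)         # target

    # Residual of the least-squares fit on the selected features
    beta, *_ = np.linalg.lstsq(X_s, y, rcond=None)
    r = y - X_s @ beta

    # OMP: score the raw (normalized) candidate against the residual
    omp_score = abs(r @ x_i) / np.linalg.norm(x_i)

    # OLS: project out the selected features first, then score
    Q, _ = np.linalg.qr(X_s)
    w_i = x_i - Q @ (Q.T @ x_i)  # now X_s.T @ w_i == 0
    ols_score = abs(r @ w_i) / np.linalg.norm(w_i)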

This small change makes the feature ranking criterion of :class:`FastCan` equivalent
to the sum of squared canonical correlation coefficients, which gives it the following
advantages over OLS and OMP:

* Affine invariance: if the features are polluted by an affine transformation, i.e.,
  scaled and/or shifted by constants, the selection result given by :class:`FastCan`
  will be unchanged (a quick check is sketched after this list).
* Multioutput support: as :class:`FastCan` uses canonical correlation for feature
  ranking, it naturally supports feature selection on datasets with multiple outputs.
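
For instance, a minimal check of the affine-invariance claim on random placeholder
data (``n_features_to_select`` and the inherited ``get_support`` are the estimator's
documented API)::

    import numpy as np
    from fastcan import FastCan

    rng = np.random.default_rng(0)
    X = rng.normal(size=(100, 10))
    y = rng.normal(size=100)
    X_affine = 2.0 * X + 3.0  # scale and shift every feature

    s1 = FastCan(n_features_to_select=3).fit(X, y).get_support()
    s2 = FastCan(n_features_to_select=3).fit(X_affine, y).get_support()
    assert np.array_equal(s1, s2)  # the selection is unchanged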

.. rubric:: References

.. [1] `"Orthogonal least squares methods and their application to non-linear
   system identification" <https://doi.org/10.1080/00207178908953472>`_ Chen, S.,
   Billings, S. A., & Luo, W. International Journal of Control, 50(5),
   1873-1896 (1989).

.. [2] `"Matching pursuits with time-frequency dictionaries"
   <https://doi.org/10.1109/78.258082>`_ Mallat, S. G., & Zhang, Z.
   IEEE Transactions on Signal Processing, 41(12), 3397-3415 (1993).

.. [3] `"On the difference between Orthogonal Matching Pursuit and Orthogonal Least
   Squares" <https://eprints.soton.ac.uk/142469/1/BDOMPvsOLS07.pdf>`_ Blumensath, T.,
   & Davies, M. E. Technical report, University of Edinburgh (2007).

doc/user_guide.rst

Lines changed: 2 additions & 1 deletion
@@ -10,4 +10,5 @@ User Guide
    intuitive.rst
    unsupervised.rst
    multioutput.rst
-   redundancy.rst
+   redundancy.rst
+   ols_and_omp.rst

examples/plot_ols_omp.py

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
"""
======================================================
FastCan VS. OLS VS. OMP on Affine Transformed Features
======================================================

In this example, we will compare the robustness of the three feature
selection methods on affine transformed features.
"""

# Authors: Sikai Zhang
# SPDX-License-Identifier: MIT

# %%
# Define OLS
# ----------

examples/plot_redundancy.py

Lines changed: 1 addition & 1 deletion
@@ -207,5 +207,5 @@ def get_n_missed(
 ax.bar_label(rects, n_missed.sum(0), padding=3)
 plt.xlabel("Selector")
 plt.ylabel("No. of missed features")
-plt.title("Performance of selectors on datasets with linear redundant features")
+plt.title("Performance of selectors on datasets with linearly redundant features")
 plt.show()

fastcan/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -3,9 +3,10 @@
 """

 from ._fastcan import FastCan
-from ._ssc import ssc
+from ._utils import ssc, ols

 __all__ = [
     "FastCan",
     "ssc",
+    "ols",
 ]

fastcan/_fastcan.py

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ class FastCan(SelectorMixin, BaseEstimator):
     support_ : ndarray of shape (n_features,), dtype=bool
         The mask of selected features.

-    scores_: ndarray of shape (n_features_to_select,), dtype=float
+    scores_ : ndarray of shape (n_features_to_select,), dtype=float
         The h-correlation/eta-cosine of selected features. The order of
         the scores is corresponding to the feature selection process.

fastcan/_ssc.py

Lines changed: 0 additions & 47 deletions
This file was deleted.

fastcan/_utils.py

Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
"""Sum of squared correlation coefficients and orthogonal least-squares."""

from numbers import Integral

import numpy as np
from sklearn.cross_decomposition import CCA
from sklearn.utils import check_X_y
from sklearn.utils._param_validation import Interval, validate_params


@validate_params(
    {
        "X": ["array-like"],
        "y": ["array-like"],
    },
    prefer_skip_nested_validation=True,
)
def ssc(X, y):
    """Sum of the squared canonical correlation coefficients.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.

    y : array-like of shape (n_samples, n_outputs)
        Target matrix.

    Returns
    -------
    ssc : float
        Sum of the squared canonical correlation coefficients.

    Examples
    --------
    >>> from fastcan import ssc
    >>> X = [[1], [-1], [0]]
    >>> y = [[0], [1], [-1]]
    >>> ssc(X, y)
    np.float64(0.25)
    """
    X, y = check_X_y(
        X, y, dtype=float, ensure_2d=True, multi_output=True, ensure_min_samples=2
    )
    n_components = min(X.shape[1], y.shape[1])
    cca = CCA(n_components=n_components)
    X_c, y_c = cca.fit_transform(X, y)
    corrcoef = np.diagonal(np.corrcoef(X_c, y_c, rowvar=False), offset=n_components)
    return sum(corrcoef**2)


@validate_params(
    {
        "X": ["array-like"],
        "y": ["array-like"],
        "t": [Interval(Integral, 1, None, closed="left")],
    },
    prefer_skip_nested_validation=True,
)
def ols(X, y, t=1):
    """Orthogonal least-squares.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.

    y : array-like of shape (n_samples,)
        Target vector.

    t : int, default=1
        The absolute number of features to select.

    Returns
    -------
    indices : ndarray of shape (n_features_to_select,), dtype=int
        The indices of the selected features. The order of the indices
        corresponds to the feature selection process.

    scores : ndarray of shape (n_features_to_select,), dtype=float
        The scores of the selected features. The order of the scores
        corresponds to the feature selection process.
    """
    X, y = check_X_y(X, y, dtype=float, ensure_2d=True)
    n_features = X.shape[1]
    # Normalize each feature column and the target to unit length.
    w = X / np.linalg.norm(X, axis=0)
    v = y / np.linalg.norm(y)
    mask = np.zeros(n_features, dtype=bool)
    r2 = np.zeros(n_features)
    indices = np.zeros(t, dtype=int)
    scores = np.zeros(t, dtype=float)

    for i in range(t):
        # Squared correlation between the target and each remaining feature.
        for j in range(n_features):
            if not mask[j]:
                r = w[:, j] @ v
                r2[j] = r**2
        d = np.argmax(r2)
        indices[i] = d
        scores[i] = r2[d]
        if i == t - 1:
            return indices, scores
        mask[d] = True
        r2[d] = 0
        # Orthogonalize the remaining features against the selected one
        # and renormalize them to unit length.
        for j in range(n_features):
            if not mask[j]:
                w[:, j] = w[:, j] - w[:, d] * (w[:, d] @ w[:, j])
                w[:, j] /= np.linalg.norm(w[:, j])
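
For context, a quick usage sketch of the new ols helper on hypothetical data (the
printed indices depend on how the toy target is constructed):

    import numpy as np
    from fastcan import ols

    rng = np.random.default_rng(0)
    X = rng.normal(size=(50, 5))
    # Build a target that depends mostly on features 1 and 3.
    y = X[:, 1] + 0.5 * X[:, 3] + 0.01 * rng.normal(size=50)

    indices, scores = ols(X, y, t=2)
    print(indices)  # e.g. [1 3], in selection order
    print(scores)   # the corresponding squared correlations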
