scikit-learn-contrib · MatthewSZhang · Feb 18, 2025 · Feb 18, 2025
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -13,7 +13,7 @@ jobs:
 
     steps:
       - uses: actions/checkout@v4
-      - uses: prefix-dev/setup-pixi@v0.8.1
+      - uses: prefix-dev/setup-pixi@v0.8.2
         with:
           environments: default
           cache: true

diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2024 SIKAI ZHANG
+Copyright (c) 2024-2025 The fastcan developers.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

diff --git a/doc/conf.py b/doc/conf.py
@@ -14,7 +14,7 @@
 # sys.path.insert(0, os.path.abspath(".."))
 # General information about the project.
 project = "fastcan"
-copyright = f"{datetime.now().year}, fastcan developers (MIT License)"
+copyright = f"2024 - {datetime.now().year}, fastcan developers (MIT License)"
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
@@ -25,7 +25,7 @@
 release = importlib.metadata.version(project)
 
 # The short X.Y version.
-version = '.'.join(release.split('.')[:2])
+version = ".".join(release.split(".")[:2])
 
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

diff --git a/doc/narx.rst b/doc/narx.rst
@@ -77,7 +77,7 @@ It should also be noted the different types of predictions in model training.
    and the model can be trained by the simple ordinary least-squares (OLS) method
 #. If assume the NARX model is a multiple-step-ahead prediction structure, the input data, like :math:`\hat{y}(k-1)` is
    unknown in advance. Therefore, the training data must first be generated by the multiple-step-ahead prediction with
-   the initial model coefficients, and then the coefficients can be updated recursively.
+   the initial model coefficients, and then the coefficients can be updated recursively
 
 ARX and OE model
 ----------------

diff --git a/doc/pruning.rst b/doc/pruning.rst
@@ -0,0 +1,48 @@
+.. currentmodule:: fastcan
+
+.. _pruning:
+
+===================================================
+Dictionary learning based unsupervised data pruning
+===================================================
+
+Unlike feature selection, which reduces the size of a dataset column-wise,
+data pruning reduces the size of a dataset row-wise.
+To use :class:`FastCan` for unsupervised data pruning, the target :math:`Y` matrix is
+obtained first with `dictionary learning <https://scikit-learn.org/stable/modules/decomposition.html#dictionary-learning>`_.
+Dictionary learning will learn a ``dictionary`` which is composed of atoms.
+The atoms should be very representative, so that each sample of the dataset can be represented (with errors)
+by sparse linear combinations of the atoms.
+We use these atoms as the target :math:`Y` and select samples based on their correlation with :math:`Y`.
+
+One challenge of using :class:`FastCan` for data pruning is that the number of samples to select is much larger than in feature selection.
+Normally, this number is higher than the number of features, which will make the pruned data matrix singular.
+In other words, :class:`FastCan` will easily think the pruned data is redundant and no additional sample
+should be selected, as any additional samples can be represented by linear combinations of the selected samples.
+Therefore, the number to select has to be kept small.
+
+To solve this problem, we use :func:`minibatch` to loosen the redundancy check of :class:`FastCan`.
+The original :class:`FastCan` checks the redundancy within :math:`X_s \in \mathbb{R}^{n\times t}`,
+which contains :math:`t` selected samples and :math:`n` features,
+and the redundancy within :math:`Y \in \mathbb{R}^{n\times m}`, which contains :math:`m` atoms :math:`y_i`.
+:func:`minibatch` ranks samples with multiple correlation coefficients between :math:`X_b \in \mathbb{R}^{n\times b}` and :math:`y_i`,
+where :math:`b` is the batch size and :math:`b \le t`, instead of canonical correlation coefficients between :math:`X_s` and :math:`Y`,
+which are used in :class:`FastCan`.
+Therefore, :func:`minibatch` loosens the redundancy check in two ways.
+
+#. it uses :math:`y_i` instead of :math:`Y`, so no redundancy check is performed within :math:`Y`
+#. it uses :math:`X_b` instead of :math:`X_s`, so :func:`minibatch` only checks the redundancy within a batch :math:`X_b`, but does not
+   check the redundancy between batches.
+
+
+.. rubric:: References
+
+* `"Dictionary-learning-based data pruning for system identification"
+  <https://doi.org/10.48550/arXiv.2502.11484>`_
+  Wang, T., Zhang, S., & Sun, L.
+  arXiv (2025).
+
+
+.. rubric:: Examples
+
+* See :ref:`sphx_glr_auto_examples_plot_pruning.py` for an example of dictionary learning based data pruning.
diff --git a/doc/user_guide.rst b/doc/user_guide.rst
@@ -13,3 +13,4 @@ User Guide
    redundancy.rst
    ols_and_omp.rst
    narx.rst
+   pruning.rst
diff --git a/examples/plot_affinity.py b/examples/plot_affinity.py
@@ -9,7 +9,7 @@
 selection methods on affine transformed features.
 """
 
-# Authors: Sikai Zhang
+# Authors: The fastcan developers
 # SPDX-License-Identifier: MIT
 
 # %%
@@ -41,7 +41,6 @@
 print("FastCan: ", np.sort(ids_fastcan))
 
 
-
 # %%
 # Affine transformation
 # ---------------------
@@ -50,7 +49,6 @@
 # three features from the polluted features. The more stable the result, the better.
 
 
-
 n_features = X.shape[1]
 rng = np.random.default_rng()
 
@@ -75,7 +73,7 @@
 
 import matplotlib.pyplot as plt
 
-bin_lims = np.arange(n_features+1)
+bin_lims = np.arange(n_features + 1)
 counts_omp, _ = np.histogram(ids_omp_all, bins=bin_lims)
 counts_ols, _ = np.histogram(ids_ols_all, bins=bin_lims)
 counts_fastcan, _ = np.histogram(ids_fastcan_all, bins=bin_lims)

diff --git a/examples/plot_fisher.py b/examples/plot_fisher.py
@@ -10,7 +10,7 @@
 relationship with Fisher's criterion in LDA (Linear Discriminant Analysis).
 """
 
-# Authors: Sikai Zhang
+# Authors: The fastcan developers
 # SPDX-License-Identifier: MIT
 
 # %%
@@ -49,11 +49,11 @@
 fishers_criterion, _ = linalg.eigh(Sb, Sw)
 
 fishers_criterion = np.sort(fishers_criterion)[::-1]
-n_nonzero = min(X.shape[1], clf.classes_.shape[0]-1)
+n_nonzero = min(X.shape[1], clf.classes_.shape[0] - 1)
 # remove the eigenvalues which are close to zero
 fishers_criterion = fishers_criterion[:n_nonzero]
 # get canonical correlation coefficients from convert Fisher's criteria
-r2 = fishers_criterion/(1+fishers_criterion)
+r2 = fishers_criterion / (1 + fishers_criterion)
 
 # %%
 # Compute SSC

diff --git a/examples/plot_intuitive.py b/examples/plot_intuitive.py
@@ -9,7 +9,7 @@
 in :class:`FastCan`.
 """
 
-# Authors: Sikai Zhang
+# Authors: The fastcan developers
 # SPDX-License-Identifier: MIT
 
 # %%
@@ -28,7 +28,6 @@
 # property, so that the usefullness of each feature can be added together without
 # redundancy.
 
-
 import matplotlib.pyplot as plt
 import numpy as np
 from matplotlib.patches import Patch
@@ -37,8 +36,9 @@
 
 from fastcan import FastCan
 
-plt.rcParams['axes.spines.right'] = False
-plt.rcParams['axes.spines.top'] = False
+plt.rcParams["axes.spines.right"] = False
+plt.rcParams["axes.spines.top"] = False
+
 
 def get_r2(feats, target, feats_selected=None):
     """Get R-squared between [feats_selected, feat_i] and target."""
@@ -54,36 +54,38 @@ def get_r2(feats, target, feats_selected=None):
         r2[i] = lr.fit(feats_i, target).score(feats_i, target)
     return r2
 
+
 def plot_bars(ids, r2_left, r2_selected):
     """Plot the relative R-squared with a bar plot."""
-    legend_selected = Patch(color='tab:green', label='X_selected')
-    legend_cand = Patch(color='tab:blue', label='x_i: candidates')
-    legend_best = Patch(color='tab:orange', label='Best candidate')
+    legend_selected = Patch(color="tab:green", label="X_selected")
+    legend_cand = Patch(color="tab:blue", label="x_i: candidates")
+    legend_best = Patch(color="tab:orange", label="Best candidate")
     n_features = len(ids)
     n_selected = len(r2_selected)
 
-    left = np.zeros(n_features)+sum(r2_selected)
+    left = np.zeros(n_features) + sum(r2_selected)
     left_selected = np.cumsum(r2_selected)
     left_selected = np.r_[0, left_selected]
     left_selected = left_selected[:-1]
     left[:n_selected] = left_selected
 
-    label = [""]*n_features
-    label[np.argmax(r2_left)+n_selected] = f"{max(r2_left):.5f}"
+    label = [""] * n_features
+    label[np.argmax(r2_left) + n_selected] = f"{max(r2_left):.5f}"
 
-    colors = ["tab:blue"]*(n_features - n_selected)
+    colors = ["tab:blue"] * (n_features - n_selected)
     colors[np.argmax(r2_left)] = "tab:orange"
-    colors = ["tab:green"]*n_selected + colors
+    colors = ["tab:green"] * n_selected + colors
 
     hbars = plt.barh(ids, width=np.r_[score_selected, r2_left], color=colors, left=left)
-    plt.axvline(x = sum(r2_selected), color = 'tab:orange', linestyle="--")
+    plt.axvline(x=sum(r2_selected), color="tab:orange", linestyle="--")
     plt.bar_label(hbars, label)
     plt.yticks(np.arange(n_features))
     plt.xlabel("R-squared between [X_selected, x_i] and y")
     plt.ylabel("Feature index")
     plt.legend(handles=[legend_selected, legend_cand, legend_best])
     plt.show()
 
+
 X, y = load_diabetes(return_X_y=True)
 
 
@@ -92,7 +94,6 @@ def plot_bars(ids, r2_left, r2_selected):
 score_selected = []
 
 
-
 score_0 = get_r2(X, y)
 
 plot_bars(id_left, score_0, score_selected)
@@ -114,13 +115,12 @@ def plot_bars(ids, r2_left, r2_selected):
 id_selected += [id_left[index]]
 score_selected += [score_0[index]]
 id_left = np.delete(id_left, index)
-score_1 = get_r2(X[:, id_left], y, X[:, id_selected])-sum(score_selected)
+score_1 = get_r2(X[:, id_left], y, X[:, id_selected]) - sum(score_selected)
 
 
 plot_bars(np.r_[id_selected, id_left], score_1, score_selected)
 
 
-
 # %%
 # Select the third feature
 # ------------------------
@@ -133,12 +133,11 @@ def plot_bars(ids, r2_left, r2_selected):
 id_selected += [id_left[index]]
 score_selected += [score_1[index]]
 id_left = np.delete(id_left, index)
-score_2 = get_r2(X[:, id_left], y, X[:, id_selected])-sum(score_selected)
+score_2 = get_r2(X[:, id_left], y, X[:, id_selected]) - sum(score_selected)
 
 plot_bars(np.r_[id_selected, id_left], score_2, score_selected)
 
 
-
 # %%
 # h-correlation and eta-cosine
 # ----------------------------
@@ -180,7 +179,7 @@ def plot_bars(ids, r2_left, r2_selected):
 score_selected = [score_0[index]]
 id_left = np.arange(X.shape[1])
 id_left = np.delete(id_left, index)
-score_1_7 = get_r2(X[:, id_left], y, X[:, id_selected])-sum(score_selected)
+score_1_7 = get_r2(X[:, id_left], y, X[:, id_selected]) - sum(score_selected)
 
 plot_bars(np.r_[id_selected, id_left], score_1_7, score_selected)
 

diff --git a/examples/plot_narx.py b/examples/plot_narx.py
@@ -9,7 +9,7 @@
 NARX model for time series prediction.
 """
 
-# Authors: Sikai Zhang
+# Authors: The fastcan developers
 # SPDX-License-Identifier: MIT
 
 # %%
@@ -26,19 +26,24 @@
 # :math:`u_0` and :math:`u_1` are input signals,
 # and :math:`y` is the output signal.
 
-
 import numpy as np
 
 rng = np.random.default_rng(12345)
 n_samples = 1000
 max_delay = 3
 e = rng.normal(0, 0.1, n_samples)
-u0 = rng.uniform(0, 1, n_samples+max_delay)
-u1 = rng.normal(0, 0.1, n_samples+max_delay)
-y = np.zeros(n_samples+max_delay)
-for i in range(max_delay, n_samples+max_delay):
-    y[i] = 0.5*y[i-1]+0.3*u0[i]**2+2*u0[i-1]*u0[i-3]+1.5*u0[i-2]*u1[i-3]+1
-y = y[max_delay:]+e
+u0 = rng.uniform(0, 1, n_samples + max_delay)
+u1 = rng.normal(0, 0.1, n_samples + max_delay)
+y = np.zeros(n_samples + max_delay)
+for i in range(max_delay, n_samples + max_delay):
+    y[i] = (
+        0.5 * y[i - 1]
+        + 0.3 * u0[i] ** 2
+        + 2 * u0[i - 1] * u0[i - 3]
+        + 1.5 * u0[i - 2] * u1[i - 3]
+        + 1
+    )
+y = y[max_delay:] + e
 X = np.c_[u0[max_delay:], u1[max_delay:]]
 
 # %%
@@ -75,9 +80,9 @@
 from fastcan.narx import make_time_shift_features, make_time_shift_ids
 
 time_shift_ids = make_time_shift_ids(
-    n_features=3, # Number of inputs (2) and output (1) signals
-    max_delay=3, # Maximum time delays
-    include_zero_delay = [True, True, False], # Whether to include zero delay
+    n_features=3,  # Number of inputs (2) and output (1) signals
+    max_delay=3,  # Maximum time delays
+    include_zero_delay=[True, True, False],  # Whether to include zero delay
     # for each signal. The output signal should not have zero delay.
 )
 
@@ -90,8 +95,8 @@
 from fastcan.narx import make_poly_features, make_poly_ids
 
 poly_ids = make_poly_ids(
-    n_features=time_shift_vars.shape[1], # Number of time-shifted variables
-    degree=2, # Maximum polynomial degree
+    n_features=time_shift_vars.shape[1],  # Number of time-shifted variables
+    degree=2,  # Maximum polynomial degree
 )
 
 poly_terms = make_poly_features(time_shift_vars, poly_ids)
@@ -105,7 +110,7 @@
 from fastcan import FastCan
 
 selector = FastCan(
-    n_features_to_select=4, # 4 terms should be selected
+    n_features_to_select=4,  # 4 terms should be selected
 ).fit(poly_terms, y)
 
 support = selector.get_support()
@@ -124,7 +129,7 @@
 
 narx_model = NARX(
     time_shift_ids=time_shift_ids,
-    poly_ids = selected_poly_ids,
+    poly_ids=selected_poly_ids,
 )
 
 narx_model.fit(X, y)
@@ -158,7 +163,7 @@
 
 y_pred = narx_model.predict(
     X[:100],
-    y_init=y[:narx_model.max_delay_] # Set the initial values of the prediction to
+    y_init=y[: narx_model.max_delay_],  # Set the initial values of the prediction to
     # the true values
 )
 

diff --git a/examples/plot_narx_msa.py b/examples/plot_narx_msa.py
@@ -8,7 +8,7 @@
 In this example, we will compare one-step-ahead NARX and multi-step-ahead NARX.
 """
 
-# Authors: Sikai Zhang
+# Authors: The fastcan developers
 # SPDX-License-Identifier: MIT
 
 # %%