FIX minibatch SSC score not matching ground truth (#200)

MatthewSZhang · web-flow · commit b9c9d8a0f8a9 · 2025-11-12T14:50:28.000+08:00
diff --git a/asv_benchmarks/benchmarks/fastcan.py b/asv_benchmarks/benchmarks/fastcan.py
@@ -37,11 +37,7 @@ def setup_cache(self):
             else:
                 eta = False
                 beam_width = 10
-            estimator = FastCan(
-                n_features_to_select=20,
-                eta=eta,
-                beam_width=beam_width
-            )
+            estimator = FastCan(n_features_to_select=20, eta=eta, beam_width=beam_width)
             estimator.fit(X, y)
 
             est_path = get_estimator_path(self, params)
diff --git a/fastcan/_beam.py b/fastcan/_beam.py
@@ -45,18 +45,14 @@ def _beam_search(
 
     for i in range(n_features_to_select - n_inclusions):
         if i == 0:
-            mask, X_selected = _prepare_candidates(
-                X, mask_exclude, indices_include
-            )
+            mask, X_selected = _prepare_candidates(X, mask_exclude, indices_include)
             if X_selected.shape[1] == 0:
                 beams_scores = np.sum((X.T @ V) ** 2, axis=1)
                 beams_scores[mask] = 0
             else:
                 W_selected = orth(X_selected)
                 selected_score = np.sum((W_selected.T @ V) ** 2)
-                beams_scores = _mgs_ssc(
-                    X, V, W_selected, mask, selected_score, tol
-                )
+                beams_scores = _mgs_ssc(X, V, W_selected, mask, selected_score, tol)
             beams_selected_ids = [indices_include for _ in range(beam_width)]
             beams_selected_ids, top_k_scores = _select_top_k(
                 beams_scores[None, :],
diff --git a/fastcan/_minibatch.py b/fastcan/_minibatch.py
@@ -5,13 +5,15 @@
 # Authors: The fastcan developers
 # SPDX-License-Identifier: MIT
 
+import warnings
 from numbers import Integral, Real
 
 import numpy as np
 from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
 from sklearn.utils._param_validation import Interval, validate_params
 from sklearn.utils.validation import check_X_y
 
+from ._beam import _safe_normalize
 from ._cancorr_fast import _greedy_search  # type: ignore[attr-defined]
 from ._fastcan import _prepare_search
 
@@ -101,11 +103,18 @@ def minibatch(X, y, n_features_to_select=1, batch_size=1, tol=0.01, verbose=1):
         )
     )
     X_transformed_ = X - X.mean(0)
-    y_transformed_ = y - y.mean(0)
+    y_transformed_, const_mask = _safe_normalize(y - y.mean(0))
+    if const_mask.any():
+        warnings.warn(
+            f"Contain constant targets, whose indices are {np.where(const_mask)[0]}.",
+            UserWarning,
+        )
     indices_include = np.zeros(0, dtype=int)  # just an empty array
     indices_select = np.zeros(0, dtype=int)
 
     for i in range(n_outputs):
+        if const_mask[i]:
+            continue
         y_i = y_transformed_[:, [i]]
         n_selected_i = 0
         while n_to_select_split[i] > n_selected_i:
@@ -137,7 +146,9 @@ def minibatch(X, y, n_features_to_select=1, batch_size=1, tol=0.01, verbose=1):
             n_selected_i += batch_size_temp
             if verbose == 1:
                 print(
-                    f"Progress: {indices_select.size}/{n_features_to_select}", end="\r"
+                    f"Progress: {indices_select.size}/{n_features_to_select}, "
+                    f"Batch SSC: {scores.sum():.5f}",
+                    end="\r",
                 )
     if verbose == 1:
         print()
diff --git a/pixi.toml b/pixi.toml
@@ -84,8 +84,8 @@ scikit-learn = ">=1.6.0"
 fastcan = { path = ".", editable = true }
 
 [tasks]
-time-h = "python -m timeit -n 5 -s 'import numpy as np; from fastcan import FastCan; X = np.random.rand(3000, 100); y = np.random.rand(3000, 20)' 's = FastCan(100, verbose=0).fit(X, y)'"
-time-eta = "python -m timeit -n 5 -s 'import numpy as np; from fastcan import FastCan; X = np.random.rand(3000, 100); y = np.random.rand(3000, 20)' 's = FastCan(100, eta=True, verbose=0).fit(X, y)'"
+time-h = "python -m timeit -n 5 -s 'import numpy as np; from fastcan import FastCan; X = np.random.rand(30000, 100); y = np.random.rand(30000, 20)' 's = FastCan(100, verbose=0).fit(X, y)'"
+time-eta = "python -m timeit -n 5 -s 'import numpy as np; from fastcan import FastCan; X = np.random.rand(30000, 100); y = np.random.rand(30000, 20)' 's = FastCan(100, eta=True, verbose=0).fit(X, y)'"
 profile-minibatch = { cmd = '''python -c "import cProfile; import numpy as np; from fastcan import minibatch; X = np.random.rand(100, 3000); y = np.random.rand(100, 20); cProfile.run('minibatch(X, y, 1000, 10, verbose=0)', sort='{{ SORT }}')"''', args = [{ arg = "SORT", default = "cumtime" }] }
 time-narx = '''python -m timeit -n 1 -s "import numpy as np; from fastcan.narx import make_narx; rng = np.random.default_rng(5); X = rng.random((1000, 10)); y = rng.random((1000, 2)); m = make_narx(X, y, 10, max_delay=2, poly_degree=2, verbose=0)" "m.fit(X, y, coef_init='one_step_ahead', verbose=1)"'''
 profile-narx = { cmd = '''python -c "import cProfile; import numpy as np; from fastcan.narx import make_narx; rng = np.random.default_rng(8); X = rng.random((3000, 3)); y = rng.random((3000, 3)); m = make_narx(X, y, 10, max_delay=10, poly_degree=2, verbose=0); cProfile.run('m.fit(X, y, coef_init=[0]*33)', sort='{{ SORT }}')"''', args = [{ arg = "SORT", default = "tottime" }] }
diff --git a/tests/test_minibatch.py b/tests/test_minibatch.py
@@ -3,10 +3,11 @@
 import numpy as np
 import pytest
 from sklearn.cluster import KMeans
-from sklearn.datasets import load_iris, make_classification
+from sklearn.datasets import load_iris, make_classification, make_regression
 from sklearn.preprocessing import OneHotEncoder
 
 from fastcan import minibatch
+from fastcan.utils import ssc
 
 
 def test_data_pruning():
@@ -60,7 +61,7 @@ def test_select_minibatch_cls():
     assert indices.size == n_to_select
 
 
-def test_minibatch_error():
+def test_minibatch_error_warning():
     # Test refine raise error.
     n_samples = 200
     n_features = 20
@@ -83,3 +84,37 @@ def test_minibatch_error():
 
     with pytest.raises(ValueError, match=r"n_features_to_select .*"):
         _ = minibatch(X, y, n_features + 1, batch_size=3)
+
+    Y = OneHotEncoder(sparse_output=False).fit_transform(y.reshape(-1, 1))
+    Y[:, 0] = 1
+    with pytest.warns(
+        UserWarning, match=r"Contain constant targets, whose indices are .*"
+    ):
+        _ = minibatch(X, Y, 5, batch_size=3)
+
+
+def test_minibatch_ssc_aligned(capsys):
+    # Test whether ssc of minibatch aligns with the true ssc score
+    n_features = 20
+    n_targets = 5
+    n_to_select = 10
+    X, y = make_regression(
+        n_samples=100,
+        n_features=n_features,
+        n_informative=10,
+        n_targets=n_targets,
+        noise=0.1,
+        random_state=0,
+    )
+
+    # The last batch of features are selected for the last target.
+    # The number of features selected per target is n_to_select // n_targets
+    n_features_per_target = n_to_select // n_targets
+    indices = minibatch(X, y, n_to_select, batch_size=n_features_per_target + 1)
+    captured = capsys.readouterr()
+
+    gtruth_ssc = ssc(X[:, indices[-n_features_per_target:]], y[:, [-1]])
+    assert (
+        f"Progress: {n_to_select}/{n_to_select}, "
+        f"Batch SSC: {gtruth_ssc:.5f}" in captured.out
+    )