FEAT add beam search (#187)

MatthewSZhang · web-flow · commit 085a225f9081 · 2025-10-25T21:07:47.000+08:00
diff --git a/.github/workflows/asv.yml b/.github/workflows/asv.yml
@@ -42,7 +42,7 @@ jobs:
           pixi run asv-build ${{ matrix.os }}
 
       - name: Upload benchmark results
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v5
         with:
           name: asv-results-${{ matrix.os }}
           path: asv_benchmarks/results
@@ -82,7 +82,7 @@ jobs:
           cp -r gh-pages/results/* asv_benchmarks/results/ 2>/dev/null || true
 
       - name: Download all benchmark results
-        uses: actions/download-artifact@v5
+        uses: actions/download-artifact@v6
         with:
           pattern: asv-results-*
 
diff --git a/.github/workflows/emscripten.yml b/.github/workflows/emscripten.yml
@@ -13,7 +13,7 @@ jobs:
         env:
           CIBW_PLATFORM: pyodide
       - name: Upload package
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v5
         with:
           name: wasm_wheel
           path: ./wheelhouse/*_wasm32.whl
diff --git a/.github/workflows/publish-pypi.yml b/.github/workflows/publish-pypi.yml
@@ -18,7 +18,7 @@ jobs:
       id-token: write
     steps:
       - name: Download artifacts
-        uses: actions/download-artifact@v5
+        uses: actions/download-artifact@v6
         with:
           path: dist/
           merge-multiple: true
@@ -29,7 +29,7 @@ jobs:
         uses: pypa/gh-action-pypi-publish@release/v1
 
       - name: get wasm dist artifacts
-        uses: actions/download-artifact@v5
+        uses: actions/download-artifact@v6
         with:
           name: wasm_wheel
           path: wasm/
diff --git a/.github/workflows/wheel.yml b/.github/workflows/wheel.yml
@@ -19,7 +19,7 @@ jobs:
         run: |
           pixi run build-sdist
       - name: Store artifacts
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v5
         with:
           name: cibw-sdist
           path: dist/*.tar.gz
@@ -43,7 +43,7 @@ jobs:
           # Include free-threaded support
           CIBW_ENABLE: cpython-freethreading
       - name: Upload package
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v5
         with:
           name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }}
           path: ./wheelhouse/*.whl
diff --git a/fastcan/_beam.py b/fastcan/_beam.py
@@ -0,0 +1,155 @@
+"""
+Beam search.
+"""
+
+# Authors: The fastcan developers
+# SPDX-License-Identifier: MIT
+
+import numpy as np
+from scipy.linalg import orth
+
+
+def _beam_search(
+    X, V, n_features_to_select, beam_width, indices_include, mask_exclude, tol, verbose
+):
+    """
+    Perform beam search to find the best subset of features.
+
+    Parameters:
+    X : np.ndarray
+        The transformed input feature matrix.
+    V : np.ndarray
+        The transformed target variable.
+    n_features_to_select : int
+        The total number of features to select.
+    beam_width : int
+        The number of top candidates to keep at each step.
+    indices_include : list
+        The indices of features that must be included in the selection.
+    mask_exclude : np.ndarray, dtype=bool
+        A boolean mask indicating which features to exclude.
+    tol : float
+        Tolerance for numerical stability in Gram-Schmidt process.
+    verbose : bool
+        If True, print progress information.
+
+    Returns:
+    indices : np.ndarray, dtype=np.int32
+        The indices of the selected features.
+    """
+
+    n_features = X.shape[1]
+    n_inclusions = len(indices_include)
+
+    X, _ = _safe_normalize(X)
+
+    for i in range(n_features_to_select - n_inclusions):
+        if i == 0:
+            X_support, X_selected = _prepare_candidates(
+                X, mask_exclude, indices_include
+            )
+            beams_selected_ids = [indices_include for _ in range(beam_width)]
+            W_selected = orth(X_selected)
+            selected_score = np.sum((W_selected.T @ V) ** 2)
+            beams_scores = _gram_schmidt(
+                X, X_support, X_selected, selected_score, V, tol
+            )
+            beams_selected_ids, top_k_scores = _select_top_k(
+                beams_scores[None, :],
+                beams_selected_ids,
+                beam_width,
+            )
+            continue
+        beams_scores = np.zeros((beam_width, n_features))
+        for beam_idx in range(beam_width):
+            X_support, X_selected = _prepare_candidates(
+                X, mask_exclude, beams_selected_ids[beam_idx]
+            )
+            beams_scores[beam_idx] = _gram_schmidt(
+                X, X_support, X_selected, top_k_scores[beam_idx], V, tol
+            )
+        beams_selected_ids, top_k_scores = _select_top_k(
+            beams_scores,
+            beams_selected_ids,
+            beam_width,
+        )
+        if verbose:
+            print(
+                f"Beam Search: {i + 1 + n_inclusions}/{n_features_to_select}, "
+                f"Best Beam: {np.argmax(top_k_scores)}, "
+                f"Beam SSC: {top_k_scores.max():.5f}",
+                end="\r",
+            )
+    if verbose:
+        print()
+    best_beam = np.argmax(top_k_scores)
+    indices = beams_selected_ids[best_beam]
+    return np.array(indices, dtype=np.int32, order="F")
+
+
+def _prepare_candidates(X, mask_exclude, indices_selected):
+    X_support = np.copy(~mask_exclude)
+    X_support[indices_selected] = False
+    X_selected = X[:, indices_selected]
+    return X_support, X_selected
+
+
+def _select_top_k(
+    beams_scores,
+    ids_selected,
+    beam_width,
+):
+    # For explore wider: make each feature in each selection iteration can
+    # only be selected once.
+    # For explore deeper: allow different beams to select the same feature
+    # at the different selection iteration.
+    n_features = beams_scores.shape[1]
+    beams_max = np.argmax(beams_scores, axis=0)
+    scores_max = beams_scores[beams_max, np.arange(n_features)]
+    n_valid = np.sum(beams_scores.any(axis=0))
+    n_selected = len(ids_selected[0])
+    if n_valid < beam_width:
+        raise ValueError(
+            "Beam Search: Not enough valid candidates to select "
+            f"beam width number of features, when selecting feature {n_selected + 1}. "
+            "Please decrease beam_width or n_features_to_select."
+        )
+
+    top_k_ids = np.argpartition(scores_max, -beam_width)[-beam_width:]
+    new_ids_selected = [[] for _ in range(beam_width)]
+    for k, beam_k in enumerate(beams_max[top_k_ids]):
+        new_ids_selected[k] = ids_selected[beam_k] + [top_k_ids[k]]
+    top_k_scores = scores_max[top_k_ids]
+    return new_ids_selected, top_k_scores
+
+
+def _gram_schmidt(X, X_support, X_selected, selected_score, V, tol, modified=True):
+    X = np.copy(X)
+    if modified:
+        # Change to Modified Gram-Schmidt
+        W_selected = orth(X_selected)
+    scores = np.zeros(X.shape[1])
+    for i, support in enumerate(X_support):
+        if not support:
+            continue
+        xi = X[:, i : i + 1]
+        for j in range(W_selected.shape[1]):
+            proj = W_selected[:, j : j + 1].T @ xi
+            xi -= proj * W_selected[:, j : j + 1]
+        wi, X_support[i] = _safe_normalize(xi)
+        if not X_support[i]:
+            continue
+        if np.any(np.abs(wi.T @ W_selected) > tol):
+            X_support[i] = False
+            continue
+        scores[i] = np.sum((wi.T @ V) ** 2)
+    scores += selected_score
+    scores[~X_support] = 0
+    return scores
+
+
+def _safe_normalize(X):
+    norm = np.linalg.norm(X, axis=0)
+    non_zero_support = norm != 0
+    norm[~non_zero_support] = 1
+    return X / norm, non_zero_support
diff --git a/fastcan/_cancorr_fast.pyx b/fastcan/_cancorr_fast.pyx
@@ -125,7 +125,7 @@ cdef void _mgsvv(
 
 
 @final
-cpdef int _forward_search(
+cpdef int _greedy_search(
     floating[::1, :] X,               # IN/OUT
     floating[::1, :] V,               # IN
     int t,                            # IN
diff --git a/fastcan/_fastcan.py b/fastcan/_fastcan.py
@@ -5,7 +5,6 @@
 # Authors: The fastcan developers
 # SPDX-License-Identifier: MIT
 
-from copy import deepcopy
 from numbers import Integral, Real
 
 import numpy as np
@@ -17,7 +16,8 @@
 from sklearn.utils._param_validation import Interval
 from sklearn.utils.validation import check_is_fitted, validate_data
 
-from ._cancorr_fast import _forward_search  # type: ignore[attr-defined]
+from ._beam import _beam_search
+from ._cancorr_fast import _greedy_search  # type: ignore[attr-defined]
 
 
 class FastCan(SelectorMixin, BaseEstimator):
@@ -46,6 +46,13 @@ class FastCan(SelectorMixin, BaseEstimator):
         the feature `x` is linear dependent to the selected features,
         and `mask` for that feature will True.
 
+    beam_width : int, default=1
+        The beam width for beam search.
+        When `beam_width` = 1, use greedy search.
+        When `beam_width` > 1, use beam search.
+
+        .. versionadded:: 0.5
+
     verbose : int, default=1
         The verbosity level.
 
@@ -114,6 +121,9 @@ class FastCan(SelectorMixin, BaseEstimator):
         "indices_exclude": [None, "array-like"],
         "eta": ["boolean"],
         "tol": [Interval(Real, 0, None, closed="neither")],
+        "beam_width": [
+            Interval(Integral, 1, None, closed="left"),
+        ],
         "verbose": ["verbose"],
     }
 
@@ -125,13 +135,15 @@ def __init__(
         indices_exclude=None,
         eta=False,
         tol=0.01,
+        beam_width=1,
         verbose=1,
     ):
         self.n_features_to_select = n_features_to_select
         self.indices_include = indices_include
         self.indices_exclude = indices_exclude
         self.eta = eta
         self.tol = tol
+        self.beam_width = beam_width
         self.verbose = verbose
 
     def fit(self, X, y):
@@ -204,15 +216,16 @@ def fit(self, X, y):
                 "`indices_include` and `indices_exclude` should not have intersection."
             )
 
-        n_candidates = (
-            n_features - self.indices_exclude_.size - self.n_features_to_select
-        )
-        if n_candidates < 0:
+        if (
+            n_features - self.indices_exclude_.size
+            < self.n_features_to_select + self.beam_width - 1
+        ):
             raise ValueError(
-                "n_features - n_features_to_select - n_exclusions should >= 0."
+                "n_features - n_exclusions should >= "
+                "n_features_to_select + beam_width - 1."
             )
-        if self.n_features_to_select - self.indices_include_.size < 0:
-            raise ValueError("n_features_to_select - n_inclusions should >= 0.")
+        if self.n_features_to_select < self.indices_include_.size:
+            raise ValueError("n_features_to_select should >= n_inclusions.")
 
         if self.eta:
             xy_hstack = np.hstack((X, y))
@@ -235,9 +248,28 @@ def fit(self, X, y):
             self.indices_exclude_,
         )
 
+        if self.beam_width > 1:
+            indices = _beam_search(
+                X=self.X_transformed_.copy(order="F"),
+                V=self.y_transformed_,
+                n_features_to_select=self.n_features_to_select,
+                beam_width=self.beam_width,
+                indices_include=list(self.indices_include_.copy()),
+                mask_exclude=mask.astype(bool, copy=True),
+                tol=self.tol,
+                verbose=self.verbose,
+            )
+
+            indices, scores, mask = _prepare_search(
+                n_features,
+                self.n_features_to_select,
+                indices,
+                self.indices_exclude_,
+            )
+
         n_threads = _openmp_effective_n_threads()
-        _forward_search(
-            X=deepcopy(self.X_transformed_),
+        _greedy_search(
+            X=self.X_transformed_.copy(order="F"),
             V=self.y_transformed_,
             t=self.n_features_to_select,
             tol=self.tol,
diff --git a/fastcan/_minibatch.py b/fastcan/_minibatch.py
@@ -5,15 +5,14 @@
 # Authors: The fastcan developers
 # SPDX-License-Identifier: MIT
 
-from copy import deepcopy
 from numbers import Integral, Real
 
 import numpy as np
 from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
 from sklearn.utils._param_validation import Interval, validate_params
 from sklearn.utils.validation import check_X_y
 
-from ._cancorr_fast import _forward_search  # type: ignore[attr-defined]
+from ._cancorr_fast import _greedy_search  # type: ignore[attr-defined]
 from ._fastcan import _prepare_search
 
 
@@ -118,8 +117,8 @@ def minibatch(X, y, n_features_to_select=1, batch_size=1, tol=0.01, verbose=1):
                 indices_select,
             )
             try:
-                _forward_search(
-                    X=deepcopy(X_transformed_),
+                _greedy_search(
+                    X=np.copy(X_transformed_, order="F"),
                     V=y_i,
                     t=batch_size_temp,
                     tol=tol,
@@ -130,7 +129,7 @@ def minibatch(X, y, n_features_to_select=1, batch_size=1, tol=0.01, verbose=1):
                     scores=scores,
                 )
             except RuntimeError:
-                # If the batch size is too large, _forward_search cannot find enough
+                # If the batch size is too large, _greedy_search cannot find enough
                 # samples to form a non-singular matrix. Then, reduce the batch size.
                 indices = indices[indices != -1]
                 batch_size_temp = indices.size
diff --git a/fastcan/_refine.py b/fastcan/_refine.py
diff --git a/tests/test_beam.py b/tests/test_beam.py
diff --git a/tests/test_fastcan.py b/tests/test_fastcan.py