Merge pull request #22 from MatthewSZhang/extend

MatthewSZhang · web-flow · commit 7537dd03f1d7 · 2024-11-19T17:02:56.000+08:00
FEAT add extend by mini-batch
diff --git a/doc/index.rst b/doc/index.rst
@@ -19,6 +19,7 @@ API Reference
 
    FastCan
    refine
+   extend
    ssc
    ols
 
diff --git a/fastcan/__init__.py b/fastcan/__init__.py
@@ -2,6 +2,7 @@
 The :mod:`fastcan` module implements algorithms, including
 """
 
+from ._extend import extend
 from ._fastcan import FastCan
 from ._refine import refine
 from ._utils import ols, ssc
@@ -11,4 +12,5 @@
     "ssc",
     "ols",
     "refine",
+    "extend",
 ]
diff --git a/fastcan/_cancorr_fast.pyx b/fastcan/_cancorr_fast.pyx
@@ -194,6 +194,10 @@ cpdef int _forward_search(
 
             # Find max scores and update indices, X, mask, and scores
             index = _iamax(n_features, &r2[0], 1)
+            if r2[index] == 0:
+                raise RuntimeError(
+                    f"No improvement can be found when selecting the {i}th feature."
+                )
             indices[i] = index
             scores[i] = r2[index]
 
diff --git a/fastcan/_extend.py b/fastcan/_extend.py
@@ -0,0 +1,120 @@
+"""
+Extend feature selection
+"""
+
+import math
+from copy import deepcopy
+from numbers import Integral
+
+import numpy as np
+from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
+from sklearn.utils._param_validation import Interval, validate_params
+from sklearn.utils.validation import check_is_fitted
+
+from ._cancorr_fast import _forward_search  # type: ignore
+from ._fastcan import FastCan, _prepare_search
+
+
+@validate_params(
+    {
+        "selector": [FastCan],
+        "n_features_to_select": [
+            Interval(Integral, 1, None, closed="left"),
+        ],
+        "batch_size": [
+            Interval(Integral, 1, None, closed="left"),
+        ],
+    },
+    prefer_skip_nested_validation=False,
+)
+def extend(selector, n_features_to_select=1, batch_size=1):
+    """Extend FastCan with mini batches.
+
+    It is suitable for selecting a very large number of features,
+    even larger than the number of samples.
+
+    Similar to the correlation filter, which selects each feature without
+    considering the redundancy, the function selects features in mini-batches
+    and the redundancy between different mini-batches is ignored.
+
+    Parameters
+    ----------
+    selector : FastCan
+        Fitted FastCan selector.
+
+    n_features_to_select : int, default=1
+        The absolute number of features to select, counting the features
+        already selected by `selector`. It must be greater than
+        `selector.n_features_to_select` and not greater than the number of
+        features seen during fit.
+
+    batch_size : int, default=1
+        The number of features in a mini-batch. The included (prerequisite)
+        features of `selector` count towards every mini-batch, so it must be
+        greater than the number of included features.
+
+    Returns
+    -------
+    indices : ndarray of shape (n_features_to_select,), dtype=int
+        The indices of the selected features.
+
+    Raises
+    ------
+    ValueError
+        If `n_features_to_select` exceeds the number of features, if it does
+        not exceed the number of features already selected by `selector`, or
+        if `batch_size` does not exceed the number of included features.
+
+    Examples
+    --------
+    >>> from fastcan import FastCan, extend
+    >>> X = [[1, 1, 0], [0.01, 0, 0], [-1, 0, 1], [0, 0, 0]]
+    >>> y = [1, 0, -1, 0]
+    >>> selector = FastCan(1, verbose=0).fit(X, y)
+    >>> print(f"Indices: {selector.indices_}")
+    Indices: [0]
+    >>> indices = extend(selector, 3, batch_size=2)
+    >>> print(f"Indices: {indices}")
+    Indices: [0 2 1]
+    """
+    check_is_fitted(selector)
+    n_inclusions = selector.indices_include_.size
+    n_features = selector.n_features_in_
+    # Number of extra features to add on top of the fitted selection.
+    n_to_select = n_features_to_select - selector.n_features_to_select
+    # Free slots per mini-batch once the included features are accounted for.
+    batch_size_to_select = batch_size - n_inclusions
+
+    if n_features_to_select > n_features:
+        raise ValueError(
+            f"n_features_to_select {n_features_to_select} "
+            f"must be <= n_features {n_features}."
+        )
+    # The message fragments are joined by implicit concatenation (no comma),
+    # so the exception carries one readable string instead of a tuple.
+    if n_to_select <= 0:
+        raise ValueError(
+            f"The number of features to select ({n_to_select}) "
+            "must be greater than 0."
+        )
+    if batch_size_to_select <= 0:
+        raise ValueError(
+            "The size of mini batch without included indices "
+            f"({batch_size_to_select}) must be greater than 0."
+        )
+
+    # Work on a copy so the fitted selector's data is left untouched
+    # (the forward search is documented as updating X).
+    X_transformed_ = deepcopy(selector.X_transformed_)
+
+    indices_include = selector.indices_include_
+    indices_exclude = selector.indices_exclude_
+    indices_select = selector.indices_[n_inclusions:]
+
+    n_threads = _openmp_effective_n_threads()
+
+    for i in range(math.ceil(n_to_select / batch_size_to_select)):
+        if i == 0:
+            # The first batch takes the remainder so that every later batch
+            # is a full one and the total adds up to exactly n_to_select.
+            batch_size_i = (n_to_select - 1) % batch_size_to_select + 1 + n_inclusions
+        else:
+            batch_size_i = batch_size
+        # Features picked so far are excluded from this batch's search, which
+        # keeps the final indices unique across batches.
+        indices, scores, mask = _prepare_search(
+            n_features,
+            batch_size_i,
+            indices_include,
+            np.r_[indices_exclude, indices_select],
+        )
+        _forward_search(
+            X=X_transformed_,
+            V=selector.y_transformed_,
+            t=batch_size_i,
+            tol=selector.tol,
+            num_threads=n_threads,
+            verbose=0,
+            mask=mask,
+            indices=indices,
+            scores=scores,
+        )
+        # Drop the leading included indices; only the new picks are appended.
+        indices_select = np.r_[indices_select, indices[n_inclusions:]]
+    return np.r_[indices_include, indices_select]
diff --git a/fastcan/_fastcan.py b/fastcan/_fastcan.py
@@ -77,6 +77,12 @@ class FastCan(SelectorMixin, BaseEstimator):
         When h-correlation method is used, `n_samples_` = n_samples.
         When eta-cosine method is used, `n_samples_` = n_features+n_outputs.
 
+    indices_include_ : ndarray of shape (n_inclusions,), dtype=int
+        The indices of the prerequisite features.
+
+    indices_exclude_ : array-like of shape (n_exclusions,), dtype=int
+        The indices of the excluded features.
+
     References
     ----------
     * Zhang, S., & Lang, Z. Q. (2022).
diff --git a/fastcan/_refine.py b/fastcan/_refine.py
@@ -93,6 +93,7 @@ def refine(selector, drop=1, max_iter=None, verbose=1):
 
     n_inclusions = indices_include.size
     n_selections = n_features_to_select - n_inclusions
+    n_threads = _openmp_effective_n_threads()
 
     if drop == "all":
         drop = np.arange(1, n_selections)
@@ -126,7 +127,6 @@ def refine(selector, drop=1, max_iter=None, verbose=1):
                 rolled_indices[:-drop_n],
                 indices_exclude,
             )
-            n_threads = _openmp_effective_n_threads()
             _forward_search(
                 X=X_transformed_,
                 V=selector.y_transformed_,
diff --git a/tests/test_extend.py b/tests/test_extend.py
@@ -0,0 +1,86 @@
+"""Test feature selection extend"""
+import numpy as np
+import pytest
+from numpy.testing import (
+    assert_array_equal,
+)
+from sklearn.datasets import make_classification
+
+from fastcan import FastCan, extend
+
+
+def test_select_extend_cls():
+    """Check that extend grows a fitted selection with unique features."""
+    n_to_select = 18
+    n_features_to_select = 2
+
+    X, y = make_classification(
+        n_samples=200,
+        n_features=30,
+        n_informative=20,
+        n_repeated=5,
+        n_classes=8,
+        n_clusters_per_class=1,
+        flip_y=0.0,
+        class_sep=10,
+        shuffle=False,
+        random_state=0,
+    )
+
+    # Plain selector, selector with a prerequisite feature, and selector with
+    # both a prerequisite feature and an excluded feature.
+    selector_options = (
+        {},
+        {"indices_include": [10]},
+        {"indices_include": [10], "indices_exclude": [0]},
+    )
+
+    # Fit and extend every configuration first, then run the checks.
+    outcomes = []
+    for options in selector_options:
+        fitted = FastCan(n_features_to_select, **options).fit(X, y)
+        outcomes.append((fitted, extend(fitted, n_to_select, batch_size=3)))
+
+    for fitted, extended in outcomes:
+        # All selected indices are distinct ...
+        assert np.unique(extended).size == n_to_select
+        # ... and the original selection is kept as the leading part.
+        assert_array_equal(extended[:n_features_to_select], fitted.indices_)
+
+    # The excluded feature never shows up in the extended selection.
+    _, extended_exc = outcomes[-1]
+    assert not np.isin(0, extended_exc)
+
+
+def test_extend_error():
+    """Check that extend rejects invalid selection and batch sizes."""
+    n_features = 20
+    n_features_to_select = 2
+
+    X, y = make_classification(
+        n_samples=200,
+        n_features=n_features,
+        n_informative=10,
+        n_repeated=5,
+        n_classes=8,
+        n_clusters_per_class=1,
+        flip_y=0.0,
+        class_sep=10,
+        shuffle=False,
+        random_state=0,
+    )
+
+    selector = FastCan(n_features_to_select, indices_include=[0]).fit(X, y)
+
+    # (requested number of features, batch size, expected message pattern)
+    failing_calls = (
+        # More features requested than exist.
+        (n_features + 1, 3, r"n_features_to_select .*"),
+        # Nothing left to add on top of the fitted selection.
+        (n_features_to_select, 3, r"The number of features to select .*"),
+        # Batch too small to hold anything besides the included feature.
+        (n_features, 1, r"The size of mini batch without .*"),
+    )
+    for n_requested, size, pattern in failing_calls:
+        with pytest.raises(ValueError, match=pattern):
+            _ = extend(selector, n_requested, batch_size=size)
diff --git a/tests/test_refine.py b/tests/test_refine.py
@@ -5,7 +5,7 @@
 from fastcan import FastCan, refine
 
 
-def test_select_refine_random_cls():
+def test_select_refine_cls():
     # Test whether refine work correctly with random samples.
     n_samples = 200
     n_features = 20

-Original file line number
+Diff line change
    FastCan
    refine
 +   extend
    ssc
    ols