Merge pull request #16 from CangyuanLi/selection

CangyuanLi · web-flow · commit b2ba0a754e50 · 2026-01-02T18:10:42.000-08:00
diff --git a/python/rapidstats/_corr.py b/python/rapidstats/_corr.py
@@ -1,8 +1,8 @@
 import itertools
 from typing import Literal, Optional, Union
 
-import narwhals as nw
-import narwhals.typing as nwt
+import narwhals.stable.v1 as nw
+import narwhals.stable.v1.typing as nwt
 import polars as pl
 
 CorrelationMethod = Literal["pearson", "spearman"]
diff --git a/python/rapidstats/selection.py b/python/rapidstats/selection.py
@@ -2,6 +2,7 @@
 
 import copy
 import inspect
+import logging
 import math
 import pickle
 from collections.abc import Iterable
@@ -14,8 +15,11 @@
 from polars.series.series import ArrayLike
 from tqdm.auto import tqdm
 
+from ._corr import correlation_matrix
 from .metrics import roc_auc
 
+logger = logging.getLogger(__name__)
+
 
 class Estimator(Protocol):
     def fit(self, X, y, **kwargs): ...
@@ -179,7 +183,7 @@ def __init__(
         n_features_to_select: float = 1,
         step: float = 1,
         importance: Callable[[RFEState], Iterable[float]] = _rfe_get_feature_importance,
-        callbacks: Optional[Iterable[Callable[[RFEState]]]] = None,
+        callbacks: Optional[Iterable[Callable[[RFEState], Any]]] = None,
         quiet: bool = False,
     ):
         self.unfit_estimator = estimator
@@ -235,14 +239,14 @@ def fit(
                     **fit_kwargs,
                 )
 
-                state = {
-                    "estimator": est,
-                    "X": X_loop,
-                    "y": y,
-                    "eval_set": fit_kwargs.get("eval_set", None),
-                    "features": features,
-                    "iteration": iteration,
-                }
+                state = RFEState(
+                    estimator=est,
+                    X=X_loop,
+                    y=y,
+                    eval_set=fit_kwargs.get("eval_set", None),
+                    features=features,
+                    iteration=iteration,
+                )
 
                 for callback in self.callbacks:
                     callback(state)
@@ -261,6 +265,9 @@ def fit(
                 real_step = _get_step(len_features, step)
                 k = len_features - real_step
 
+                if k <= 0:
+                    break
+
                 remaining_features = (
                     pl.LazyFrame(
                         {"importance": self.importance(state), "feature": features}
@@ -277,38 +284,15 @@ def fit(
                 pbar.update(1)
 
         self.estimator_ = est
-        self.selected_features_ = features
+        self.selected_features_ = sorted(features)
 
         return self
 
-    def transform(
-        self,
-        X: Optional[nwt.IntoDataFrame] = None,
-        y: Optional[Any] = None,
-        **fit_kwargs,
-    ) -> Any:
-        if X is None or y is None:
-            return self.estimator_
-
-        if "eval_set" in fit_kwargs:
-            fit_kwargs["eval_set"] = [
-                (
-                    nw.from_native(X_val).select(self.selected_features_).to_native(),
-                    y_val,
-                )
-                for X_val, y_val in fit_kwargs["eval_set"]
-            ]
-
-        return self.unfit_estimator.fit(
-            nw.from_native(X, eager_only=True)
-            .select(self.selected_features_)
-            .to_native(),
-            y,
-            **fit_kwargs,
-        )
+    def transform(self, X: nwt.IntoFrameT) -> nwt.IntoFrameT:
+        return nw.from_native(X).select(self.selected_features_).to_native()
 
     def fit_transform(self, X, y, **fit_kwargs) -> Any:
-        return self.fit(X, y, **fit_kwargs).transform()
+        return self.fit(X, y, **fit_kwargs).transform(X)
 
 
 class NFEState(TypedDict):
@@ -328,7 +312,7 @@ def __init__(
         self,
         estimator: Estimator,
         importance: Callable[[NFEState], ArrayLike] = _nfe_get_feature_importance,
-        seed: Optional[int] = None,
+        seed: Optional[int] = 208,
     ):
         self.unfit_estimator = estimator
         self.importance = importance
@@ -347,7 +331,6 @@ def _add_noise(self, df: nw.DataFrame) -> nw.DataFrame:
         )
 
     def fit(self, X: nwt.IntoDataFrame, y: Any, **fit_kwargs):
-
         X_nw = nw.from_native(X, eager_only=True).pipe(self._add_noise)
 
         if "eval_set" in fit_kwargs:
@@ -364,7 +347,7 @@ def fit(self, X: nwt.IntoDataFrame, y: Any, **fit_kwargs):
         X_train = X_nw.to_native()
         est = self.unfit_estimator.fit(X_train, y, **fit_kwargs)
 
-        state = {"estimator": est, "X": X_train, "y": y}
+        state = NFEState(estimator=est, X=X_train, y=y)
 
         nfe_features = (
             pl.LazyFrame(
@@ -377,35 +360,118 @@ def fit(self, X: nwt.IntoDataFrame, y: Any, **fit_kwargs):
                 )
             )
             .collect()["feature"]
+            .sort()
             .to_list()
         )
 
         self.selected_features_ = nfe_features
 
         return self
 
-    def transform(
-        self,
-        X: nwt.IntoDataFrame,
-        y: Any,
-        **fit_kwargs,
-    ) -> Any:
-        if "eval_set" in fit_kwargs:
-            fit_kwargs["eval_set"] = [
-                (
-                    nw.from_native(X_val).select(self.selected_features_).to_native(),
-                    y_val,
-                )
-                for X_val, y_val in fit_kwargs["eval_set"]
-            ]
+    def transform(self, X: nwt.IntoFrameT) -> nwt.IntoFrameT:
+        return nw.from_native(X).select(self.selected_features_).to_native()
+
+    def fit_transform(
+        self, X: nwt.IntoDataFrameT, y: Any, **fit_kwargs
+    ) -> nwt.IntoDataFrameT:
+        return self.fit(X, y, **fit_kwargs).transform(X)
+
+
+class CFE:
+    """Drop features until no pair has absolute correlation >= ``threshold``."""
+
+    def __init__(self, threshold: float = 0.99, seed: Optional[int] = 208):
+        self.threshold = threshold
+        self.seed = seed
+
+    @staticmethod
+    def _find_drop(corr_mat: nw.DataFrame, seed: Optional[int]) -> tuple[str, int]:
+        """Return the feature found in the most rows of ``corr_mat`` and its count."""
+        f1_counts = corr_mat.group_by("f1").agg(nw.len().alias("count_f1"))
+        f2_counts = corr_mat.group_by("f2").agg(nw.len().alias("count_f2"))
+
+        counts = (
+            f1_counts.join(f2_counts, left_on="f1", right_on="f2", how="full")
+            .with_columns(
+                nw.coalesce("f1", "f2").alias("feature"),
+                nw.sum_horizontal("count_f1", "count_f2").alias("count"),
+            )
+            .select("feature", "count")
+            .filter(nw.col("count") == nw.col("count").max())
+            # We need to sort by "feature" because the order after the join is not
+            # always the same, making multiple runs even with the same seed not
+            # reproducible without the sort.
+            .sort("feature")
+            # We could take the first or last, but let's sample so that we don't
+            # introduce bias based on the alphabetical order.
+            .sample(1, seed=seed)
+        )
+
+        return (counts["feature"].item(), counts["count"].item())
+
+    def fit_from_correlation_matrix(
+        self, corr_mat: nwt.IntoFrame, index: str = "", transform: bool = True
+    ):
+        """Select features from ``corr_mat`` and return ``self``; ``transform=True``
+        unpivots a wide matrix on ``index``, else f1/f2/correlation columns are used."""
+        cm_nw = nw.from_native(corr_mat).lazy()
 
-        return self.unfit_estimator.fit(
-            nw.from_native(X, eager_only=True)
-            .select(self.selected_features_)
-            .to_native(),
-            y,
-            **fit_kwargs,
+        if transform:
+            cm_nw = cm_nw.unpivot(index=index).rename(
+                {index: "f1", "variable": "f2", "value": "correlation"}
+            )
+
+        features = (
+            nw.concat(
+                [
+                    cm_nw.select("f1").rename({"f1": "x"}),
+                    cm_nw.select("f2").rename({"f2": "x"}),
+                ],
+                how="vertical",
+            )
+            .unique()
+            .collect()["x"]
+            .to_list()
+        )
+
+        cm_nw = (
+            cm_nw.with_columns(nw.col("correlation").abs())
+            .filter(
+                nw.col("f1") != nw.col("f2"),
+                ~nw.col("correlation").is_null(),
+                ~nw.col("correlation").is_nan(),
+                nw.col("correlation") >= self.threshold,
+            )
+            .collect()
         )
 
-    def fit_transform(self, X: nwt.IntoDataFrame, y: Any, **fit_kwargs) -> Any:
-        return self.fit(X, y, **fit_kwargs).transform(X, y, **fit_kwargs)
+        drop_list = []
+        i = 0
+        while cm_nw.shape[0] > 0:
+            to_drop, count = self._find_drop(cm_nw, self.seed)
+
+            # Lazy %-style arguments: formatted only if an INFO record is emitted.
+            msg = "Iteration %s: Dropping %s, correlated with %s other features"
+            logger.info(msg, i, to_drop, count)
+
+            # Discard every remaining pair that involves the dropped feature.
+            cm_nw = cm_nw.filter((nw.col("f1") != to_drop) & (nw.col("f2") != to_drop))
+
+            drop_list.append(to_drop)
+            i += 1
+
+        self.selected_features_ = sorted(set(features) - set(drop_list))
+
+        return self
+
+    def fit(self, X: nwt.IntoFrame):
+        """Compute the correlation matrix of ``X`` and select features from it."""
+        return self.fit_from_correlation_matrix(correlation_matrix(X))
+
+    def transform(self, X: nwt.IntoFrameT) -> nwt.IntoFrameT:
+        """Return ``X`` restricted to ``selected_features_``."""
+        return nw.from_native(X).select(self.selected_features_).to_native()
+
+    def fit_transform(self, X: nwt.IntoFrameT) -> nwt.IntoFrameT:
+        """Fit on ``X`` and return it restricted to the selected features."""
+        return self.fit(X).transform(X)
diff --git a/tests/test_selection.py b/tests/test_selection.py
@@ -94,6 +94,30 @@ def test_rfe(estimator):
     )
 
     assert rfe.selected_features_ == ["f0.99"]
+    assert rfe.transform(X).columns == ["f0.99"]
+
+
+@pytest.mark.parametrize("estimator", ESTIMATORS)
+def test_rfe_early_stopping(estimator):
+    fit_kwargs = {}
+    if "eval_set" in inspect.signature(estimator.fit).parameters:
+        fit_kwargs["eval_set"] = [(X, y)]
+
+    def _roc_auc(est, X, y) -> float:
+        return rs.metrics.roc_auc(y, est.predict(X))
+
+    early_stopping_kwargs = {}
+    # getmembers() yields (name, value) pairs, so the old bare-name membership
+    # test was always true; hasattr() checks for predict_proba directly.
+    if not hasattr(estimator, "predict_proba"):
+        early_stopping_kwargs["metric"] = _roc_auc
+
+    rs.selection.RFE(
+        estimator=estimator,
+        step=3,
+        quiet=True,
+        callbacks=[rs.selection.EarlyStopping(**early_stopping_kwargs)],
+    ).fit(X, y, **fit_kwargs)
 
 
 @pytest.mark.parametrize("estimator", ESTIMATORS)
@@ -105,3 +129,77 @@ def test_nfe(estimator):
     nfe = rs.selection.NFE(estimator=estimator, seed=SEED).fit(X, y, **fit_kwargs)
 
     assert "f0.99" in nfe.selected_features_
+    assert "f0.99" in nfe.transform(X).columns
+
+
+def test_cfe():
+    # Wide matrix keyed by the "" label column; NaN and null cells are ignored.
+    wide = pl.DataFrame(
+        {
+            "": ["a", "b", "c"],
+            "a": [1.0, 0.5, 0.7],
+            "b": [-0.99, 1, 0.98],
+            "c": [float("nan"), None, 1],
+        }
+    )
+    # The same matrix in long form, as accepted with transform=False.
+    long = wide.unpivot(index="").rename(
+        {"": "f1", "variable": "f2", "value": "correlation"}
+    )
+    selector = rs.selection.CFE(threshold=0.95)
+
+    # "b" takes part in both pairs at or above 0.95 in absolute value, so it is
+    # the feature that gets dropped.
+    from_wide = selector.fit_from_correlation_matrix(wide).selected_features_
+    assert from_wide == ["a", "c"]
+
+    from_long = selector.fit_from_correlation_matrix(
+        long, transform=False
+    ).selected_features_
+    assert from_long == ["a", "c"]
+
+
+def test_cfe_identity_no_drop():
+    # The diagonal (corr(x, x) = 1) must never count as a correlated pair, so
+    # with no off-diagonal value at or above 0.99 every feature is kept.
+    diagonal_only = pl.DataFrame(
+        {
+            "": ["a", "b", "c"],
+            "a": [1.0, 0.5, 0.7],
+            "b": [0.5, 1.0, 0.98],
+            "c": [float("nan"), None, 1],
+        }
+    )
+    selector = rs.selection.CFE(threshold=0.99)
+
+    kept = selector.fit_from_correlation_matrix(diagonal_only).selected_features_
+    assert kept == ["a", "b", "c"]
+
+
+def test_cfe_corr_1_is_removed():
+    # An off-diagonal correlation of exactly 1 (here between "b" and "c") is a
+    # genuine pair, so one of its members has to be eliminated.
+    with_perfect_pair = pl.DataFrame(
+        {
+            "": ["a", "b", "c"],
+            "a": [1.0, 0.5, 0.7],
+            "b": [0.5, 1.0, 1.0],
+            "c": [float("nan"), None, 1],
+        }
+    )
+    selector = rs.selection.CFE(threshold=0.99)
+
+    kept = selector.fit_from_correlation_matrix(with_perfect_pair).selected_features_
+    assert kept == ["a", "c"]
+
+
+def test_cfe_repro():
+    # Independent uniform columns never approach the default 0.99 threshold, so
+    # a low threshold is used to make the seeded elimination loop actually run.
+    n_cols = 50
+    data = np.random.default_rng(0).random((1_000, n_cols))
+    df = pl.DataFrame(data, schema=[f"col_{i}" for i in range(n_cols)])
+
+    first = rs.selection.CFE(threshold=0.05).fit(df).selected_features_
+    assert 0 < len(first) < n_cols
+    assert first == rs.selection.CFE(threshold=0.05).fit(df).selected_features_