Fix estimator type (#36)

jonathan-taylor · web-flow · commit 85853d29bb38 · 2026-02-02T18:16:21.000-08:00
* loosening numpy and pandas versions

* fixes to build requirements

* simplifying setup.py, logic in pyproject.toml

* fixing _estimator_type flags

* feat: Add scikit-learn estimator type tags

Adds the `__sklearn_tags__` method to the `sklearn_sm` and `sklearn_selected` wrappers. This allows scikit-learn to correctly identify the estimator type (regressor or classifier) based on the statsmodels model.

This change enables the use of scikit-learn's cross-validation and model selection tools with these wrappers.

Tests have been added to verify that OLS and GLM Binomial models are correctly identified.

* removing redundant setup.py

* unused pkg_resources import
diff --git a/ISLP/models/generic_selector.py b/ISLP/models/generic_selector.py
@@ -28,7 +28,10 @@
 import scipy as sp
 
 from sklearn.metrics import get_scorer
-from sklearn.base import (clone, MetaEstimatorMixin)
+from sklearn.base import (clone,
+                          MetaEstimatorMixin,
+                          is_classifier,
+                          is_regressor)
 from sklearn.model_selection import cross_val_score
 from joblib import Parallel, delayed
 
@@ -149,13 +152,13 @@ def __init__(self,
         self.scoring = scoring
 
         if scoring is None:
-            if self.est_._estimator_type == 'classifier':
+            if is_classifier(self.est_):
                 scoring = 'accuracy'
-            elif self.est_._estimator_type == 'regressor':
+            elif is_regressor(self.est_):
                 scoring = 'r2'
             else:
-                raise AttributeError('Estimator must '
-                                     'be a Classifier or Regressor.')
+                scoring = None
+                
         if isinstance(scoring, str):
             self.scorer = get_scorer(scoring)
         else:
@@ -166,7 +169,7 @@ def __init__(self,
         # don't mess with this unless testing
         self._TESTING_INTERRUPT_MODE = False
 
-    def fit(self, X, y, groups=None, **params):
+    def fit(self, X, y, groups=None, **fit_params):
         """Perform feature selection and learn model from training data.
 
         Parameters
@@ -183,7 +186,7 @@ def fit(self, X, y, groups=None, **params):
         groups: array-like, with shape (n_samples,), optional
             Group labels for the samples used while splitting the dataset into
             train/test set. Passed to the fit method of the cross-validator.
-        params: various, optional
+        fit_params: various, optional
             Additional parameters that are being passed to the estimator.
             For example, `sample_weights=weights`.
 
@@ -218,7 +221,7 @@ def fit(self, X, y, groups=None, **params):
                                       groups=groups,
                                       cv=self.cv,
                                       pre_dispatch=self.pre_dispatch,
-                                      **params)
+                                      **fit_params)
 
         # keep a running track of the best state
 
@@ -242,7 +245,7 @@ def fit(self, X, y, groups=None, **params):
                                             X,
                                             y,
                                             groups=groups,
-                                            **params)
+                                            **fit_params)
                 iteration += 1
                 cur, best_, self.finished_ = self.update_results_check(results_,
                                                                        self.path_,
@@ -287,7 +290,7 @@ def fit_transform(self,
                       X,
                       y,
                       groups=None,
-                      **params):
+                      **fit_params):
         """Fit to training data then reduce X to its most important features.
 
         Parameters
@@ -304,7 +307,7 @@ def fit_transform(self,
         groups: array-like, with shape (n_samples,), optional
             Group labels for the samples used while splitting the dataset into
             train/test set. Passed to the fit method of the cross-validator.
-        params: various, optional
+        fit_params: various, optional
             Additional parameters that are being passed to the estimator.
             For example, `sample_weights=weights`.
 
@@ -313,7 +316,7 @@ def fit_transform(self,
         Reduced feature subset of X, shape={n_samples, k_features}
 
         """
-        self.fit(X, y, groups=groups, **params)
+        self.fit(X, y, groups=groups, **fit_params)
         return self.transform(X)
 
     def get_metric_dict(self, confidence_interval=0.95):
@@ -368,7 +371,7 @@ def _batch(self,
                X,
                y,
                groups=None,
-               **params):
+               **fit_params):
 
         results = []
 
@@ -388,7 +391,7 @@ def _batch(self,
                              groups=groups,
                              cv=self.cv,
                              pre_dispatch=self.pre_dispatch,
-                             **params)
+                             **fit_params)
                             for state in candidates)
 
             for state, scores in work:
@@ -484,8 +487,11 @@ def _calc_score(estimator,
                 groups=None,
                 cv=None,
                 pre_dispatch='2*n_jobs',
-                **params):
+                **fit_params):
     
+    if scorer is None:
+        scorer = lambda estimator, X, y: estimator.score(X, y)
+
     X_state = build_submodel(X, state)
 
     if cv:
@@ -497,11 +503,11 @@ def _calc_score(estimator,
                                  scoring=scorer,
                                  n_jobs=1,
                                  pre_dispatch=pre_dispatch,
-                                 params=params)
+                                 fit_params=fit_params)
     else:
         estimator.fit(X_state,
                       y,
-                      **params)
+                      **fit_params)
         scores = np.array([scorer(estimator,
                                   X_state,
                                   y)])
diff --git a/ISLP/models/sklearn_wrap.py b/ISLP/models/sklearn_wrap.py
@@ -49,7 +49,17 @@ def __init__(self,
         self.model_type = model_type
         self.model_spec = model_spec
         self.model_args = model_args
-        
+
+    def __sklearn_tags__(self):    
+        tags = super().__sklearn_tags__()
+        if self.model_type == sm.OLS:
+            tags.estimator_type = 'regressor'
+        elif (issubclass(self.model_type, sm.GLM) and
+              'family' in self.model_args and
+              isinstance(self.model_args.get('family', None), sm.families.Binomial)):
+            tags.estimator_type = 'classifier'
+        return tags
+
     def fit(self, X, y):
         """
         Fit a statsmodel model
@@ -171,6 +181,9 @@ def __init__(self,
         self.cv = cv
         self.scoring = scoring
 
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        return tags
                                      
     def fit(self, X, y):
         """
diff --git a/ISLP/torch/imdb.py b/ISLP/torch/imdb.py
@@ -12,7 +12,6 @@
 import torch
 from torch.utils.data import TensorDataset
 from scipy.sparse import load_npz
-from pkg_resources import resource_filename
 from pickle import load as load_pickle
 import urllib
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -2,7 +2,7 @@
 name = "ISLP"
 dependencies = ["numpy>=1.7.1",
                "scipy>=0.9",
-               "pandas>=0.20",
+               "pandas>=1.5",
                "lxml", # pandas needs this for html
                "scikit-learn>=1.2",
                "joblib",
@@ -15,7 +15,7 @@ dependencies = ["numpy>=1.7.1",
                ]
 description  = "Library for ISLP labs"
 readme = "README.md"
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 license = {file = "LICENSE"}
 keywords = []
 authors = [
@@ -38,6 +38,23 @@ classifiers = ["Development Status :: 3 - Alpha",
 	       ]
 dynamic = ["version"]
 
+[tool.setuptools]
+packages = [
+    "ISLP",
+    "ISLP.models", 
+    "ISLP.bart",
+    "ISLP.torch",
+    "ISLP.data"
+]
+include-package-data = true
+
+[tool.setuptools.package-data]
+ISLP = ["data/*.csv", "data/*.npy", "data/*.data"]
+
+[tool.setuptools.dynamic]
+version = {attr = "ISLP.__version__"}  # setuptools reads the package version from ISLP.__version__ at build time
+
+
 [project.urls]  # Optional
 "Homepage" = "https://github.com/intro-stat-learning/ISLP"
 "Bug Reports" = "https://github.com/intro-stat-learning/ISLP/issues"
@@ -51,8 +68,14 @@ doc = ['Sphinx>=3.0']
 [build-system]
 requires = ["setuptools>=42",
             "wheel",
-	    "versioneer[toml]",
-	    "Sphinx>=1.0"
+	    "Sphinx>=1.0",
+            "numpy",
+            "pandas",
+            "scipy",
+            "scikit-learn",
+            "joblib",
+            "statsmodels",
+	    "versioneer[toml]"
 	    ]
 build-backend = "setuptools.build_meta"
 
diff --git a/setup.py b/setup.py
diff --git a/tests/models/test_sklearn_wrap.py b/tests/models/test_sklearn_wrap.py
@@ -0,0 +1,46 @@
+
+import numpy as np
+import pandas as pd
+import statsmodels.api as sm
+from sklearn.base import is_classifier, is_regressor
+import pytest
+
+from ISLP.models.sklearn_wrap import sklearn_sm, sklearn_selected
+from ISLP.models.model_spec import ModelSpec
+from ISLP.models.strategy import min_max
+
+@pytest.fixture
+def model_setup():
+    X = pd.DataFrame({'X1': np.random.rand(10), 'X2': np.random.rand(10), 'X3': np.random.rand(10)})
+    y = pd.Series(np.random.randint(0, 2, 10)) # For classifier
+    model_spec_dummy = ModelSpec(['X1', 'X2', 'X3']).fit(X)
+    min_max_strategy_dummy = min_max(model_spec_dummy, min_terms=1, max_terms=2)
+    return X, y, model_spec_dummy, min_max_strategy_dummy
+
+def test_OLS_is_regressor():
+    model = sklearn_sm(sm.OLS)
+    assert model.__sklearn_tags__().estimator_type == 'regressor'
+    assert is_regressor(model)
+
+def test_GLM_binomial_is_classifier():
+    model = sklearn_sm(sm.GLM, model_args={'family': sm.families.Binomial()})
+    assert model.__sklearn_tags__().estimator_type == 'classifier'
+    assert is_classifier(model)
+
+def test_GLM_binomial_probit_is_classifier():
+    model = sklearn_sm(sm.GLM, model_args={'family': sm.families.Binomial(link=sm.families.links.Probit())})
+    assert model.__sklearn_tags__().estimator_type == 'classifier'
+    assert is_classifier(model)
+
+
+def test_selected_OLS_is_regressor(model_setup):
+    X, y, model_spec_dummy, min_max_strategy_dummy = model_setup
+    model = sklearn_selected(sm.OLS, strategy=min_max_strategy_dummy)
+    assert model.__sklearn_tags__().estimator_type == 'regressor'
+    assert is_regressor(model)
+
+def test_selected_GLM_binomial_is_classifier(model_setup):
+    X, y, model_spec_dummy, min_max_strategy_dummy = model_setup
+    model = sklearn_selected(sm.GLM, strategy=min_max_strategy_dummy, model_args={'family': sm.families.Binomial()})
+    assert model.__sklearn_tags__().estimator_type == 'classifier'
+    assert is_classifier(model)