
Commit f1bfe22

Fix support for sklearn<1.0
1 parent b95f594 commit f1bfe22

4 files changed: 121 additions, 43 deletions


econml/sklearn_extensions/linear_model.py

Lines changed: 51 additions & 0 deletions
@@ -38,6 +38,51 @@
 from joblib import Parallel, delayed


+# TODO: once we drop support for sklearn < 1.0, we can remove this
+def _add_normalize(to_wrap):
+    """
+    Add a fictitious "normalize" argument to linear model initializer signatures.
+
+    This is necessary for their get_params to play nicely with some other sklearn-internal methods.
+
+    Note that directly adding a **params argument to the ordinary initializer will not work,
+    because get_params explicitly looks only at the initializer signature arguments that are not
+    varargs or varkeywords, so we need to modify the signature of the initializer to include the
+    "normalize" argument.
+    """
+    # if we're decorating a class, just update the __init__ method,
+    # so that the result is still a class instead of a wrapper method
+    if isinstance(to_wrap, type):
+        import sklearn
+        from packaging import version
+
+        if version.parse(sklearn.__version__) >= version.parse("1.0"):
+            # normalize was deprecated or removed; don't need to do anything
+            return to_wrap
+
+        else:
+            from inspect import Parameter, signature
+            from functools import wraps
+
+            old_init = to_wrap.__init__
+
+            @wraps(old_init)
+            def new_init(self, *args, normalize=False, **kwargs):
+                if normalize is not False:
+                    warnings.warn("normalize is deprecated and will be ignored", stacklevel=2)
+                return old_init(self, *args, **kwargs)
+
+            sig = signature(old_init)
+            sig = sig.replace(parameters=[*sig.parameters.values(),
+                                          Parameter("normalize", kind=Parameter.KEYWORD_ONLY, default=False)])
+
+            new_init.__signature__ = sig
+            to_wrap.__init__ = new_init
+            return to_wrap
+    else:
+        raise ValueError("This decorator was applied to a method, but is intended to be applied only to types.")
+
+
 def _weighted_check_cv(cv=5, y=None, classifier=False, random_state=None):
     cv = 5 if cv is None else cv
     if isinstance(cv, numbers.Integral):
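
For context on the docstring's claim: sklearn discovers constructor parameters by inspecting the __init__ signature and skipping varargs/varkeywords, which is why a plain **kwargs catch-all cannot surface "normalize" but a patched __signature__ can. A minimal sketch of that introspection (sketch_get_param_names is an illustrative stand-in for BaseEstimator._get_param_names, not econml or sklearn code):

# Rough re-creation of sklearn's parameter discovery, assuming its documented
# behavior: only named, non-variadic __init__ parameters are reported.
from inspect import Parameter, signature

def sketch_get_param_names(cls):
    params = [p for p in signature(cls.__init__).parameters.values()
              if p.name != "self" and p.kind not in (Parameter.VAR_POSITIONAL,
                                                     Parameter.VAR_KEYWORD)]
    return sorted(p.name for p in params)

class Model:
    def __init__(self, alpha=1.0, **kwargs):  # **kwargs is invisible below
        self.alpha = alpha

print(sketch_get_param_names(Model))  # ['alpha'] -- no "normalize" without a signature patch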
@@ -131,6 +176,7 @@ def _fit_weighted_linear_model(self, X, y, sample_weight, check_input=None):
         super().fit(**fit_params)


+@_add_normalize
 class WeightedLasso(WeightedModelMixin, Lasso):
     """Version of sklearn Lasso that accepts weights.

@@ -236,6 +282,7 @@ def fit(self, X, y, sample_weight=None, check_input=True):
         return self


+@_add_normalize
 class WeightedMultiTaskLasso(WeightedModelMixin, MultiTaskLasso):
     """Version of sklearn MultiTaskLasso that accepts weights.

@@ -325,6 +372,7 @@ def fit(self, X, y, sample_weight=None):
         return self


+@_add_normalize
 class WeightedLassoCV(WeightedModelMixin, LassoCV):
     """Version of sklearn LassoCV that accepts weights.

@@ -443,6 +491,7 @@ def fit(self, X, y, sample_weight=None):
         return self


+@_add_normalize
 class WeightedMultiTaskLassoCV(WeightedModelMixin, MultiTaskLassoCV):
     """Version of sklearn MultiTaskLassoCV that accepts weights.

@@ -582,6 +631,7 @@ def _get_theta_coefs_and_tau_sq(i, X, sample_weight, alpha_cov, n_alphas_cov, ma
     return coefs, tausq


+@_add_normalize
 class DebiasedLasso(WeightedLasso):
     """Debiased Lasso model.

@@ -927,6 +977,7 @@ def _get_unscaled_coef_var(self, X, theta_hat, sample_weight):
         return _unscaled_coef_var


+@_add_normalize
 class MultiOutputDebiasedLasso(MultiOutputRegressor):
     """Debiased MultiOutputLasso model.
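
Taken together, the decorated classes accept and ignore a no-op normalize keyword when running on sklearn < 1.0, while on 1.0+ the decorator leaves them untouched. A hedged usage sketch of the sklearn < 1.0 behavior only:

import warnings
from econml.sklearn_extensions.linear_model import WeightedLasso

est = WeightedLasso(alpha=0.1)
est.set_params(normalize=False)  # accepted: the patched signature puts "normalize" in get_params()
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    WeightedLasso(alpha=0.1, normalize=True)  # any non-False value triggers the deprecation warning
assert any("normalize" in str(w.message) for w in caught)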

econml/tests/test_dml.py

Lines changed: 31 additions & 21 deletions
@@ -7,7 +7,7 @@
 from sklearn.linear_model import LinearRegression, Lasso, LassoCV, LogisticRegression
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, PolynomialFeatures
-from sklearn.model_selection import KFold, GroupKFold
+from sklearn.model_selection import KFold, GroupKFold, check_cv
 from econml.dml import DML, LinearDML, SparseLinearDML, KernelDML, CausalForestDML
 from econml.dml import NonParamDML
 import numpy as np
@@ -1141,27 +1141,37 @@ def test_groups(self):
         est.fit(y, t, groups=groups)

         # test nested grouping
-        class NestedModel(LassoCV):
-            def __init__(self, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True,
-                         precompute='auto', max_iter=1000, tol=1e-4,
-                         copy_X=True, cv=None, verbose=False, n_jobs=None,
-                         positive=False, random_state=None, selection='cyclic'):
-
-                super().__init__(
-                    eps=eps, n_alphas=n_alphas, alphas=alphas,
-                    fit_intercept=fit_intercept,
-                    precompute=precompute, max_iter=max_iter, tol=tol, copy_X=copy_X,
-                    cv=cv, verbose=verbose, n_jobs=n_jobs, positive=positive,
-                    random_state=random_state, selection=selection)
+        class NestedModel:
+            def __init__(self, cv):
+                self.model = LassoCV(cv=cv)
+
+            # DML nested CV works via a 'cv' attribute
+            @property
+            def cv(self):
+                return self.model.cv
+
+            @cv.setter
+            def cv(self, value):
+                self.model.cv = value

             def fit(self, X, y):
-                # ensure that the grouping has worked correctly and we get all 10 copies of the items in
-                # whichever groups we saw
-                (yvals, cts) = np.unique(y, return_counts=True)
-                for (yval, ct) in zip(yvals, cts):
-                    if ct != 10:
-                        raise Exception("Grouping failed; received {0} copies of {1} instead of 10".format(ct, yval))
-                return super().fit(X, y)
+                for (train, test) in check_cv(self.cv, y).split(X, y):
+                    (yvals, cts) = np.unique(y[train], return_counts=True)
+                    # with 2-fold outer and 2-fold inner grouping, and six total groups,
+                    # we should get 1 or 2 groups per split
+                    if len(yvals) > 2:
+                        raise Exception(f"Grouping failed: received {len(yvals)} groups instead of at most 2")
+
+                    # ensure that the grouping has worked correctly and we get all 10 copies of the items in
+                    # whichever groups we see
+                    for (yval, ct) in zip(yvals, cts):
+                        if ct != 10:
+                            raise Exception(f"Grouping failed; received {ct} copies of {yval} instead of 10")
+                self.model.fit(X, y)
+                return self
+
+            def predict(self, X):
+                return self.model.predict(X)

         # test nested grouping
         est = LinearDML(model_y=NestedModel(cv=2), model_t=NestedModel(cv=2), cv=GroupKFold(2))
@@ -1170,6 +1180,6 @@ def fit(self, X, y):

         # by default, we use 5 split cross-validation for our T and Y models
         # but we don't have enough groups here to split both the outer and inner samples with grouping
         # TODO: does this imply we should change some defaults to make this more likely to succeed?
-        est = LinearDML(cv=GroupKFold(2))
+        est = LinearDML(model_y=LassoCV(cv=5), model_t=LassoCV(cv=5), cv=GroupKFold(2))
         with pytest.raises(Exception):
             est.fit(y, t, groups=groups)
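
The rewritten NestedModel leans on two contracts: the first-stage model only needs fit/predict plus a readable and settable cv attribute (as the inline comment notes), and sklearn's check_cv turns whatever that attribute holds, an int or a splitter, into a splitter object. A standalone sketch of the check_cv half (the data here is made up for illustration):

import numpy as np
from sklearn.model_selection import check_cv

X, y = np.arange(20).reshape(10, 2), np.arange(10.0)
cv = check_cv(2, y)  # for regression targets, an int becomes KFold(n_splits=2)
for train, test in cv.split(X, y):
    print(len(train), len(test))  # 5 5 on each of the two folds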

econml/tests/test_drlearner.py

Lines changed: 34 additions & 22 deletions
@@ -13,8 +13,8 @@
 from sklearn.base import TransformerMixin
 from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor, RandomForestRegressor
 from sklearn.exceptions import DataConversionWarning
-from sklearn.linear_model import LinearRegression, Lasso, LassoCV, LogisticRegression
-from sklearn.model_selection import KFold, GroupKFold
+from sklearn.linear_model import LinearRegression, Lasso, LassoCV, LogisticRegression, LogisticRegressionCV
+from sklearn.model_selection import KFold, GroupKFold, check_cv
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, PolynomialFeatures

@@ -799,27 +799,37 @@ def test_groups(self):
         est.fit(y, t, W=w, groups=groups)

         # test nested grouping
-        class NestedModel(LassoCV):
-            def __init__(self, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True,
-                         precompute='auto', max_iter=1000, tol=1e-4,
-                         copy_X=True, cv=None, verbose=False, n_jobs=None,
-                         positive=False, random_state=None, selection='cyclic'):
-
-                super().__init__(
-                    eps=eps, n_alphas=n_alphas, alphas=alphas,
-                    fit_intercept=fit_intercept,
-                    precompute=precompute, max_iter=max_iter, tol=tol, copy_X=copy_X,
-                    cv=cv, verbose=verbose, n_jobs=n_jobs, positive=positive,
-                    random_state=random_state, selection=selection)
+        class NestedModel:
+            def __init__(self, cv):
+                self.model = LassoCV(cv=cv)
+
+            # DML nested CV works via a 'cv' attribute
+            @property
+            def cv(self):
+                return self.model.cv
+
+            @cv.setter
+            def cv(self, value):
+                self.model.cv = value

             def fit(self, X, y):
-                # ensure that the grouping has worked correctly and we get all 10 copies of the items in
-                # whichever groups we saw
-                (yvals, cts) = np.unique(y, return_counts=True)
-                for (yval, ct) in zip(yvals, cts):
-                    if ct != 10:
-                        raise Exception("Grouping failed; received {0} copies of {1} instead of 10".format(ct, yval))
-                return super().fit(X, y)
+                for (train, test) in check_cv(self.cv, y).split(X, y):
+                    (yvals, cts) = np.unique(y[train], return_counts=True)
+                    # with 2-fold outer and 2-fold inner grouping, and six total groups,
+                    # we should get 1 or 2 groups per split
+                    if len(yvals) > 2:
+                        raise Exception(f"Grouping failed: received {len(yvals)} groups instead of at most 2")
+
+                    # ensure that the grouping has worked correctly and we get all 10 copies of the items in
+                    # whichever groups we see
+                    for (yval, ct) in zip(yvals, cts):
+                        if ct != 10:
+                            raise Exception(f"Grouping failed; received {ct} copies of {yval} instead of 10")
+                self.model.fit(X, y)
+                return self
+
+            def predict(self, X):
+                return self.model.predict(X)

         # test nested grouping
         est = LinearDRLearner(model_propensity=LogisticRegression(),
@@ -829,7 +839,9 @@ def fit(self, X, y):

         # by default, we use 5 split cross-validation for our T and Y models
         # but we don't have enough groups here to split both the outer and inner samples with grouping
         # TODO: does this imply we should change some defaults to make this more likely to succeed?
-        est = LinearDRLearner(cv=GroupKFold(2))
+        est = LinearDRLearner(model_propensity=LogisticRegressionCV(cv=5),
+                              model_regression=LassoCV(cv=5),
+                              cv=GroupKFold(2))
         with pytest.raises(Exception):
             est.fit(y, t, W=w, groups=groups)

econml/tests/test_linear_model.py

Lines changed: 5 additions & 0 deletions
@@ -51,6 +51,11 @@ def setUpClass(cls):
         cls.y_2D_consistent = np.concatenate((TestLassoExtensions.y_simple.reshape(-1, 1),
                                               TestLassoExtensions.y2_full.reshape(-1, 1)), axis=1)

+    def test_can_clone(self):
+        for model in [WeightedLasso(), WeightedLassoCV(), WeightedMultiTaskLassoCV(),
+                      WeightedLassoCVWrapper(), DebiasedLasso(), MultiOutputDebiasedLasso()]:
+            clone(model)
+
     #################
     # WeightedLasso #
     #################
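
For reference, what clone demands (a rough sketch of sklearn's contract, not its actual implementation): an estimator is rebuilt from its own reported constructor parameters, so every key in get_params() must be accepted by __init__, which is precisely what _add_normalize restores for the shimmed "normalize" key on older sklearn:

# Roughly what sklearn.base.clone does; the real version also deep-clones
# nested estimators and verifies the parameters survive the round-trip.
def sketch_clone(estimator):
    return type(estimator)(**estimator.get_params(deep=False))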
