
Commit ce2f2b5

some small fixes to the debiased lasso (#358)
* some small fixes to the debiased lasso
* added parallelism across rows of the design matrix so each LassoCV runs in parallel; added n_jobs param to DebiasedLasso and to SparseLinearDML
* added n_jobs to MultiOutputDebiasedLasso
* added separate alpha options for the covariance matrix estimation
* added the extra alpha options to SparseLinearDRLearner and SparseLinearDML
1 parent bb042d5 commit ce2f2b5

5 files changed: +218 -128 lines changed

econml/dml/dml.py

Lines changed: 20 additions & 0 deletions
@@ -653,6 +653,18 @@ class SparseLinearDML(DebiasedLassoCateEstimatorMixin, DML):
         CATE L1 regularization applied through the debiased lasso in the final model.
         'auto' corresponds to a CV form of the :class:`MultiOutputDebiasedLasso`.
 
+    n_alphas : int, optional, default 100
+        How many alphas to try if alpha='auto'.
+
+    alpha_cov : string | float, optional, default 'auto'
+        The regularization alpha used when constructing the pseudo inverse of
+        the covariance matrix Theta, used for correcting the final stage lasso coefficients
+        in the debiased lasso. Each such regression corresponds to the regression of one feature
+        on the remainder of the features.
+
+    n_alphas_cov : int, optional, default 10
+        How many alpha_cov to try if alpha_cov='auto'.
+
     max_iter : int, optional, default=1000
         The maximum number of iterations in the Debiased Lasso
 
@@ -707,8 +719,12 @@ class SparseLinearDML(DebiasedLassoCateEstimatorMixin, DML):
     def __init__(self,
                  model_y='auto', model_t='auto',
                  alpha='auto',
+                 n_alphas=100,
+                 alpha_cov='auto',
+                 n_alphas_cov=10,
                  max_iter=1000,
                  tol=1e-4,
+                 n_jobs=None,
                  featurizer=None,
                  fit_cate_intercept=True,
                  linear_first_stages=True,
@@ -718,9 +734,13 @@ def __init__(self,
                  random_state=None):
         model_final = MultiOutputDebiasedLasso(
             alpha=alpha,
+            n_alphas=n_alphas,
+            alpha_cov=alpha_cov,
+            n_alphas_cov=n_alphas_cov,
             fit_intercept=False,
             max_iter=max_iter,
             tol=tol,
+            n_jobs=n_jobs,
             random_state=random_state)
         super().__init__(model_y=model_y,
                          model_t=model_t,
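
For context, a minimal sketch of how the new knobs surface on the estimator; the synthetic data and parameter values are illustrative only:

    import numpy as np
    from econml.dml import SparseLinearDML

    # Synthetic data, purely for illustration
    X = np.random.normal(size=(500, 10))
    T = np.random.normal(size=(500,))
    y = 2 * T * X[:, 0] + np.random.normal(size=(500,))

    # n_alphas tunes the CV grid of the final-stage debiased lasso;
    # alpha_cov/n_alphas_cov tune the nodewise covariance regressions;
    # n_jobs parallelizes those regressions via joblib.
    est = SparseLinearDML(alpha='auto', n_alphas=100,
                          alpha_cov='auto', n_alphas_cov=10,
                          n_jobs=-1, random_state=123)
    est.fit(y, T, X=X)
    print(est.effect(X[:3]))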

econml/drlearner.py

Lines changed: 28 additions & 8 deletions
@@ -853,6 +853,18 @@ class SparseLinearDRLearner(DebiasedLassoCateEstimatorDiscreteMixin, DRLearner):
         CATE L1 regularization applied through the debiased lasso in the final model.
         'auto' corresponds to a CV form of the :class:`DebiasedLasso`.
 
+    n_alphas : int, optional, default 100
+        How many alphas to try if alpha='auto'.
+
+    alpha_cov : string | float, optional, default 'auto'
+        The regularization alpha used when constructing the pseudo inverse of
+        the covariance matrix Theta, used for correcting the final stage lasso coefficients
+        in the debiased lasso. Each such regression corresponds to the regression of one feature
+        on the remainder of the features.
+
+    n_alphas_cov : int, optional, default 10
+        How many alpha_cov to try if alpha_cov='auto'.
+
     max_iter : int, optional, default 1000
         The maximum number of iterations in the Debiased Lasso
 
@@ -910,17 +922,17 @@ class SparseLinearDRLearner(DebiasedLassoCateEstimatorDiscreteMixin, DRLearner):
     est.fit(y, T, X=X, W=None)
 
     >>> est.effect(X[:3])
-    array([ 0.418400..., 0.306400..., -0.130733...])
+    array([ 0.41..., 0.31..., -0.12...])
     >>> est.effect_interval(X[:3])
-    (array([ 0.056783..., -0.206438..., -0.739296...]), array([0.780017..., 0.819239..., 0.477828...]))
+    (array([ 0.04..., -0.19..., -0.73...]), array([0.77..., 0.82..., 0.47...]))
     >>> est.coef_(T=1)
-    array([0.449779..., 0.004807..., 0.061954...])
+    array([ 0.45..., -0.00..., 0.06...])
     >>> est.coef__interval(T=1)
-    (array([ 0.242194..., -0.190825..., -0.139646...]), array([0.657365..., 0.200440..., 0.263556...]))
+    (array([ 0.24..., -0.19..., -0.13...]), array([0.65..., 0.19..., 0.26...]))
     >>> est.intercept_(T=1)
-    0.88436847...
+    0.88...
     >>> est.intercept__interval(T=1)
-    (0.68683788..., 1.08189907...)
+    (0.68..., 1.08...)
 
     Attributes
     ----------
@@ -942,17 +954,25 @@ def __init__(self,
                  featurizer=None,
                  fit_cate_intercept=True,
                  alpha='auto',
+                 n_alphas=100,
+                 alpha_cov='auto',
+                 n_alphas_cov=10,
                  max_iter=1000,
                  tol=1e-4,
                  min_propensity=1e-6,
                  categories='auto',
-                 n_splits=2, random_state=None):
+                 n_splits=2,
+                 random_state=None):
         self.fit_cate_intercept = fit_cate_intercept
         model_final = DebiasedLasso(
             alpha=alpha,
+            n_alphas=n_alphas,
+            alpha_cov=alpha_cov,
+            n_alphas_cov=n_alphas_cov,
             fit_intercept=fit_cate_intercept,
             max_iter=max_iter,
-            tol=tol)
+            tol=tol,
+            random_state=random_state)
         super().__init__(model_propensity=model_propensity,
                          model_regression=model_regression,
                          model_final=model_final,
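
A similar hedged usage sketch for the DR learner (synthetic data; values illustrative). Note that random_state now also reaches the final-stage DebiasedLasso, which is what makes runs like the doctest above reproducible:

    import numpy as np
    from econml.drlearner import SparseLinearDRLearner

    X = np.random.normal(size=(1000, 3))
    T = np.random.binomial(1, 0.5, size=(1000,))  # binary treatment
    y = (1 + 0.5 * X[:, 0]) * T + np.random.normal(size=(1000,))

    est = SparseLinearDRLearner(alpha='auto', n_alphas=50,
                                alpha_cov='auto', n_alphas_cov=5,
                                random_state=123)
    est.fit(y, T, X=X, W=None)
    print(est.effect(X[:3]))
    print(est.coef_(T=1), est.intercept_(T=1))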

econml/sklearn_extensions/linear_model.py

Lines changed: 89 additions & 40 deletions
@@ -35,6 +35,7 @@
 from statsmodels.tools.tools import add_constant
 from statsmodels.api import RLM
 import statsmodels
+from joblib import Parallel, delayed
 
 
 def _weighted_check_cv(cv=5, y=None, classifier=False):
@@ -539,6 +540,34 @@ def fit(self, X, y, sample_weight=None):
         return self
 
 
+def _get_theta_coefs_and_tau_sq(i, X, sample_weight, alpha_cov, n_alphas_cov, max_iter, tol, random_state):
+    n_samples, n_features = X.shape
+    y = X[:, i]
+    X_reduced = X[:, list(range(i)) + list(range(i + 1, n_features))]
+    # Call weighted lasso on reduced design matrix
+    if alpha_cov == 'auto':
+        local_wlasso = WeightedLassoCV(cv=3, n_alphas=n_alphas_cov,
+                                       fit_intercept=False,
+                                       max_iter=max_iter,
+                                       tol=tol, n_jobs=1,
+                                       random_state=random_state)
+    else:
+        local_wlasso = WeightedLasso(alpha=alpha_cov,
+                                     fit_intercept=False,
+                                     max_iter=max_iter,
+                                     tol=tol,
+                                     random_state=random_state)
+    local_wlasso.fit(X_reduced, y, sample_weight=sample_weight)
+    coefs = local_wlasso.coef_
+    # Weighted tau
+    if sample_weight is not None:
+        y_weighted = y * sample_weight / np.sum(sample_weight)
+    else:
+        y_weighted = y / n_samples
+    tausq = np.dot(y - local_wlasso.predict(X_reduced), y_weighted)
+    return coefs, tausq
+
+
 class DebiasedLasso(WeightedLasso):
     """Debiased Lasso model.
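
What each parallel task computes: the i-th nodewise regression of the debiased lasso regresses column i of the design matrix on the remaining columns; its coefficients fill row i of C_hat and the weighted residual inner product gives tau_i^2. A standalone sketch of the unweighted case, using plain scikit-learn Lasso as a stand-in for the weighted variants:

    import numpy as np
    from sklearn.linear_model import Lasso

    X = np.random.normal(size=(200, 5))
    i = 0
    y = X[:, i]
    X_reduced = np.delete(X, i, axis=1)  # drop column i

    # Stand-in for WeightedLasso/WeightedLassoCV when sample_weight is None
    m = Lasso(alpha=0.1, fit_intercept=False).fit(X_reduced, y)
    coefs = m.coef_                                        # row i of C_hat (negated off-diagonal)
    tausq = np.dot(y - m.predict(X_reduced), y) / len(y)   # tau_i^2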
@@ -555,6 +584,18 @@ class DebiasedLasso(WeightedLasso):
         reasons, using ``alpha = 0`` with the ``Lasso`` object is not advised.
         Given this, you should use the :class:`.LinearRegression` object.
 
+    n_alphas : int, optional, default 100
+        How many alphas to try if alpha='auto'.
+
+    alpha_cov : string | float, optional, default 'auto'
+        The regularization alpha used when constructing the pseudo inverse of
+        the covariance matrix Theta, used for correcting the lasso coefficients. Each
+        such regression corresponds to the regression of one feature on the remainder
+        of the features.
+
+    n_alphas_cov : int, optional, default 10
+        How many alpha_cov to try if alpha_cov='auto'.
+
     fit_intercept : boolean, optional, default True
         Whether to calculate the intercept for this model. If set
         to False, no intercept will be used in calculations
@@ -597,6 +638,9 @@ class DebiasedLasso(WeightedLasso):
         (setting to 'random') often leads to significantly faster convergence,
         especially when tol is higher than 1e-4.
 
+    n_jobs : int or None, default None
+        How many jobs to use whenever parallelism is invoked.
+
     Attributes
     ----------
     coef_ : array, shape (n_features,)
@@ -620,10 +664,14 @@ class DebiasedLasso(WeightedLasso):
     """
 
-    def __init__(self, alpha='auto', fit_intercept=True,
-                 precompute=False, copy_X=True, max_iter=1000,
+    def __init__(self, alpha='auto', n_alphas=100, alpha_cov='auto', n_alphas_cov=10,
+                 fit_intercept=True, precompute=False, copy_X=True, max_iter=1000,
                  tol=1e-4, warm_start=False,
-                 random_state=None, selection='cyclic'):
+                 random_state=None, selection='cyclic', n_jobs=None):
+        self.n_jobs = n_jobs
+        self.n_alphas = n_alphas
+        self.alpha_cov = alpha_cov
+        self.n_alphas_cov = n_alphas_cov
         super().__init__(
             alpha=alpha, fit_intercept=fit_intercept,
             precompute=precompute, copy_X=copy_X,
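
A minimal sketch of the updated constructor in use (synthetic data; the interval call relies on predict_interval as refactored below, which I assume returns a (lower, upper) pair):

    import numpy as np
    from econml.sklearn_extensions.linear_model import DebiasedLasso

    X = np.random.normal(size=(300, 20))
    y = X[:, 0] - 2 * X[:, 3] + np.random.normal(size=(300,))

    # alpha/n_alphas drive the main lasso CV; alpha_cov/n_alphas_cov drive
    # the nodewise covariance regressions; n_jobs fans the latter out via joblib.
    dl = DebiasedLasso(alpha='auto', n_alphas=100,
                       alpha_cov='auto', n_alphas_cov=10,
                       n_jobs=-1, random_state=0)
    dl.fit(X, y)
    y_pred = dl.predict(X[:5])
    y_lower, y_upper = dl.predict_interval(X[:5], alpha=0.1)  # 90% intervals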
@@ -747,18 +795,8 @@ def predict_interval(self, X, alpha=0.1):
         lower = alpha / 2
         upper = 1 - alpha / 2
         y_pred = self.predict(X)
-        y_lower = np.empty(y_pred.shape)
-        y_upper = np.empty(y_pred.shape)
-        # Note that in the case of no intercept, X_offset is 0
-        if self.fit_intercept:
-            X = X - self._X_offset
-        # Calculate the variance of the predictions
-        var_pred = np.sum(np.matmul(X, self._coef_variance) * X, axis=1)
-        if self.fit_intercept:
-            var_pred += self._mean_error_variance
-
         # Calculate prediction confidence intervals
-        sd_pred = np.sqrt(var_pred)
+        sd_pred = self.prediction_stderr(X)
         y_lower = y_pred + \
             np.apply_along_axis(lambda s: norm.ppf(
                 lower, scale=s), 0, sd_pred)
@@ -810,20 +848,25 @@ def intercept__interval(self, alpha=0.1):
 
     def _get_coef_correction(self, X, y, y_pred, sample_weight, theta_hat):
         # Assumes flattened y
-        n_samples, n_features = X.shape
+        n_samples, _ = X.shape
         y_res = np.ndarray.flatten(y) - y_pred
         # Compute weighted residuals
         if sample_weight is not None:
             y_res_scaled = y_res * sample_weight / np.sum(sample_weight)
         else:
             y_res_scaled = y_res / n_samples
         delta_coef = np.matmul(
-            np.matmul(theta_hat, X.T), y_res_scaled)
+            theta_hat, np.matmul(X.T, y_res_scaled))
         return delta_coef
 
     def _get_optimal_alpha(self, X, y, sample_weight):
         # To be done once per target. Assumes y can be flattened.
-        cv_estimator = WeightedLassoCV(cv=5, fit_intercept=self.fit_intercept)
+        cv_estimator = WeightedLassoCV(cv=5, n_alphas=self.n_alphas, fit_intercept=self.fit_intercept,
+                                       precompute=self.precompute, copy_X=True,
+                                       max_iter=self.max_iter, tol=self.tol,
+                                       random_state=self.random_state,
+                                       selection=self.selection,
+                                       n_jobs=self.n_jobs)
         cv_estimator.fit(X, y.flatten(), sample_weight=sample_weight)
         return cv_estimator.alpha_
@@ -835,27 +878,15 @@ def _get_theta_hat(self, X, sample_weight):
             C_hat = np.ones((1, 1))
             tausq = (X.T @ X / n_samples).flatten()
             return np.diag(1 / tausq) @ C_hat
-        coefs = np.empty((n_features, n_features - 1))
-        tausq = np.empty(n_features)
         # Compute Lasso coefficients for the columns of the design matrix
-        for i in range(n_features):
-            y = X[:, i]
-            X_reduced = X[:, list(range(i)) + list(range(i + 1, n_features))]
-            # Call weighted lasso on reduced design matrix
-            # Inherit some parameters from the parent
-            local_wlasso = WeightedLasso(
-                alpha=self.alpha,
-                fit_intercept=False,
-                max_iter=self.max_iter,
-                tol=self.tol
-            ).fit(X_reduced, y, sample_weight=sample_weight)
-            coefs[i] = local_wlasso.coef_
-            # Weighted tau
-            if sample_weight is not None:
-                y_weighted = y * sample_weight / np.sum(sample_weight)
-            else:
-                y_weighted = y / n_samples
-            tausq[i] = np.dot(y - local_wlasso.predict(X_reduced), y_weighted)
+        results = Parallel(n_jobs=self.n_jobs)(
+            delayed(_get_theta_coefs_and_tau_sq)(i, X, sample_weight,
+                                                 self.alpha_cov, self.n_alphas_cov,
+                                                 self.max_iter, self.tol, self.random_state)
+            for i in range(n_features))
+        coefs, tausq = zip(*results)
+        coefs = np.array(coefs)
+        tausq = np.array(tausq)
         # Compute C_hat
         C_hat = np.diag(np.ones(n_features))
         C_hat[0][1:] = -coefs[0]
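
The joblib fan-out pattern used here, in isolation (illustrative task function; n_jobs=None keeps execution sequential under joblib's defaults, n_jobs=-1 uses all cores):

    from joblib import Parallel, delayed

    def task(i):
        # Stand-in for _get_theta_coefs_and_tau_sq(i, ...)
        return i, i * i

    # Results come back in input order, so the unzip below is safe.
    results = Parallel(n_jobs=-1)(delayed(task)(i) for i in range(8))
    idx, squares = zip(*results)  # same unzip as in _get_theta_hat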
@@ -893,6 +924,18 @@ class MultiOutputDebiasedLasso(MultiOutputRegressor):
         reasons, using ``alpha = 0`` with the ``Lasso`` object is not advised.
         Given this, you should use the :class:`LinearRegression` object.
 
+    n_alphas : int, optional, default 100
+        How many alphas to try if alpha='auto'.
+
+    alpha_cov : string | float, optional, default 'auto'
+        The regularization alpha used when constructing the pseudo inverse of
+        the covariance matrix Theta, used for correcting the lasso coefficients. Each
+        such regression corresponds to the regression of one feature on the remainder
+        of the features.
+
+    n_alphas_cov : int, optional, default 10
+        How many alpha_cov to try if alpha_cov='auto'.
+
     fit_intercept : boolean, optional, default True
         Whether to calculate the intercept for this model. If set
         to False, no intercept will be used in calculations
@@ -935,6 +978,9 @@ class MultiOutputDebiasedLasso(MultiOutputRegressor):
         (setting to 'random') often leads to significantly faster convergence,
         especially when tol is higher than 1e-4.
 
+    n_jobs : int or None, default None
+        How many jobs to use whenever parallelism is invoked.
+
     Attributes
     ----------
     coef_ : array, shape (n_targets, n_features) or (n_features,)
@@ -954,14 +1000,17 @@ class MultiOutputDebiasedLasso(MultiOutputRegressor):
     """
 
-    def __init__(self, alpha='auto', fit_intercept=True,
+    def __init__(self, alpha='auto', n_alphas=100, alpha_cov='auto', n_alphas_cov=10,
+                 fit_intercept=True,
                  precompute=False, copy_X=True, max_iter=1000,
                  tol=1e-4, warm_start=False,
                  random_state=None, selection='cyclic', n_jobs=None):
-        self.estimator = DebiasedLasso(alpha=alpha, fit_intercept=fit_intercept,
+        self.estimator = DebiasedLasso(alpha=alpha, n_alphas=n_alphas, alpha_cov=alpha_cov, n_alphas_cov=n_alphas_cov,
+                                       fit_intercept=fit_intercept,
                                        precompute=precompute, copy_X=copy_X, max_iter=max_iter,
                                        tol=tol, warm_start=warm_start,
-                                       random_state=random_state, selection=selection)
+                                       random_state=random_state, selection=selection,
+                                       n_jobs=n_jobs)
         super().__init__(estimator=self.estimator, n_jobs=n_jobs)
 
     def fit(self, X, y, sample_weight=None):
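
A sketch of the multi-output wrapper with the new arguments (synthetic data). As the diff shows, n_jobs is used twice: by MultiOutputRegressor across targets and by each inner DebiasedLasso across its nodewise regressions:

    import numpy as np
    from econml.sklearn_extensions.linear_model import MultiOutputDebiasedLasso

    X = np.random.normal(size=(300, 10))
    Y = np.column_stack([X[:, 0], -X[:, 1]]) + np.random.normal(size=(300, 2))

    model = MultiOutputDebiasedLasso(alpha='auto', n_alphas=100,
                                     alpha_cov='auto', n_alphas_cov=10,
                                     n_jobs=2)
    model.fit(X, Y)
    print(model.coef_.shape)  # (n_targets, n_features)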

econml/tests/test_dml.py

Lines changed: 5 additions & 4 deletions
@@ -967,9 +967,10 @@ def test_categories(self):
         dmls = [LinearDML, SparseLinearDML]
         for ctor in dmls:
             dml1 = ctor(LinearRegression(), LogisticRegression(C=1000),
-                        fit_cate_intercept=False, discrete_treatment=True)
+                        fit_cate_intercept=False, discrete_treatment=True, random_state=123)
             dml2 = ctor(LinearRegression(), LogisticRegression(C=1000),
-                        fit_cate_intercept=False, discrete_treatment=True, categories=['c', 'b', 'a'])
+                        fit_cate_intercept=False, discrete_treatment=True, categories=['c', 'b', 'a'],
+                        random_state=123)
 
             # create a simple artificial setup where effect of moving from treatment
             # a -> b is 2,
@@ -1003,9 +1004,9 @@ def test_categories(self):
             # but const_marginal_effect should be reordered based on the explicit categories
             cme1 = dml1.const_marginal_effect(np.ones((1, 1))).reshape(-1)
             cme2 = dml2.const_marginal_effect(np.ones((1, 1))).reshape(-1)
-            self.assertAlmostEqual(cme1[1], -cme2[1], places=4)  # 1->3 in original ordering; 3->1 in new ordering
+            self.assertAlmostEqual(cme1[1], -cme2[1], places=3)  # 1->3 in original ordering; 3->1 in new ordering
             # 1-> 2 in original ordering; combination of 3->1 and 3->2
-            self.assertAlmostEqual(cme1[0], -cme2[1] + cme2[0], places=4)
+            self.assertAlmostEqual(cme1[0], -cme2[1] + cme2[0], places=3)
 
     def test_groups(self):
         groups = [1, 2, 3, 4, 5, 6] * 10
