Skip to content

Commit 5d6c8eb

Browse files
committed
doctest clean and added to github actions
1 parent 2220ed0 commit 5d6c8eb

File tree

7 files changed: +124 additions, −57 deletions

.github/workflows/ci.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@ jobs:
3434
uses: actions/setup-python@v3
3535
with:
3636
python-version: ${{ matrix.python-version }}
37+
- name: Run doctests
38+
run: |
39+
pip install -e .[test]
40+
pytest --doctest-modules causalpy/
3741
- name: Run tests
3842
run: |
3943
pip install -e .[test]

causalpy/data/simulate_data.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,9 @@ def generate_synthetic_control_data(
5656
5757
Example
5858
--------
59+
>>> from causalpy.data.simulate_data import generate_synthetic_control_data
5960
>>> df, weightings_true = generate_synthetic_control_data(
60-
... treatment_time=treatment_time
61+
... treatment_time=70
6162
... )
6263
"""
6364

@@ -196,6 +197,7 @@ def generate_did():
196197
197198
Example
198199
--------
200+
>>> from causalpy.data.simulate_data import generate_did
199201
>>> df = generate_did()
200202
"""
201203
# true parameters
@@ -249,6 +251,7 @@ def generate_regression_discontinuity_data(
249251
Example
250252
--------
251253
>>> import pathlib
254+
>>> from causalpy.data.simulate_data import generate_regression_discontinuity_data
252255
>>> df = generate_regression_discontinuity_data(true_treatment_threshold=0.5)
253256
>>> df.to_csv(pathlib.Path.cwd() / 'regression_discontinuity.csv', index=False)
254257
"""
@@ -278,9 +281,10 @@ def generate_ancova_data(
278281
Example
279282
--------
280283
>>> import pathlib
284+
>>> from causalpy.data.simulate_data import generate_ancova_data
281285
>>> df = generate_ancova_data(
282286
... N=200,
283-
... pre_treatment_threshold=np.array([10, 12]),
287+
... pre_treatment_means=np.array([10, 12]),
284288
... treatment_effect=2,
285289
... sigma=1
286290
... )

causalpy/pymc_experiments.py

Lines changed: 65 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -359,7 +359,7 @@ def summary(self) -> None:
359359
... ),
360360
... )
361361
>>> result.summary()
362-
===============================Synthetic Control===============================
362+
==================================Pre-Post Fit==================================
363363
Formula: actual ~ 0 + a + b + c + d + e + f + g
364364
Model coefficients:
365365
a 0.33, 94% HDI [0.30, 0.38]
@@ -757,7 +757,7 @@ def _plot_causal_impact_arrow(self, ax):
757757
def _causal_impact_summary_stat(self) -> str:
758758
"""Computes the mean and 94% credible interval bounds for the causal impact."""
759759
percentiles = self.causal_impact.quantile([0.03, 1 - 0.03]).values
760-
ci = r"$CI_{94\%}$" + f"[{percentiles[0]:.2f}, {percentiles[1]:.2f}]"
760+
ci = "$CI_{94%}$" + f"[{percentiles[0]:.2f}, {percentiles[1]:.2f}]"
761761
causal_impact = f"{self.causal_impact.mean():.2f}, "
762762
return f"Causal impact = {causal_impact + ci}"
763763

@@ -767,16 +767,31 @@ def summary(self) -> None:
767767
768768
Example
769769
--------
770-
Assuming `result` is a DiD experiment
771-
770+
>>> import causalpy as cp
771+
>>> df = cp.load_data("did")
772+
>>> seed = 42
773+
>>> result = cp.pymc_experiments.DifferenceInDifferences(
774+
... df,
775+
... formula="y ~ 1 + group*post_treatment",
776+
... time_variable_name="t",
777+
... group_variable_name="group",
778+
... model=cp.pymc_models.LinearRegression(
779+
... sample_kwargs={
780+
... "target_accept": 0.95,
781+
... "random_seed": seed,
782+
... "progressbar": False,
783+
... }
784+
... )
785+
... )
772786
>>> result.summary()
773-
==========================Difference in Differences=========================
787+
===========================Difference in Differences============================
774788
Formula: y ~ 1 + group*post_treatment
789+
<BLANKLINE>
775790
Results:
776791
Causal impact = 0.51, $CI_{94%}$[0.41, 0.61]
777792
Model coefficients:
778793
Intercept 1.08, 94% HDI [1.03, 1.13]
779-
post_treatment[T.True] 0.98, 94% HDI [0.91, 1.06]
794+
post_treatment[T.True] 0.98, 94% HDI [0.92, 1.05]
780795
group 0.16, 94% HDI [0.09, 0.23]
781796
group:post_treatment[T.True] 0.51, 94% HDI [0.41, 0.61]
782797
sigma 0.08, 94% HDI [0.07, 0.10]
@@ -995,19 +1010,35 @@ def summary(self) -> None:
9951010
9961011
Example
9971012
--------
1013+
>>> import causalpy as cp
1014+
>>> df = cp.load_data("rd")
1015+
>>> seed = 42
1016+
>>> result = cp.pymc_experiments.RegressionDiscontinuity(
1017+
... df,
1018+
... formula="y ~ 1 + x + treated + x:treated",
1019+
... model=cp.pymc_models.LinearRegression(
1020+
... sample_kwargs={
1021+
... "target_accept": 0.95,
1022+
... "random_seed": seed,
1023+
... "progressbar": False,
1024+
... },
1025+
... ),
1026+
... treatment_threshold=0.5,
1027+
... )
9981028
>>> result.summary()
999-
============================Regression Discontinuity==========================
1029+
============================Regression Discontinuity============================
10001030
Formula: y ~ 1 + x + treated + x:treated
10011031
Running variable: x
10021032
Threshold on running variable: 0.5
1033+
<BLANKLINE>
10031034
Results:
1004-
Discontinuity at threshold = 0.92
1035+
Discontinuity at threshold = 0.91
10051036
Model coefficients:
1006-
Intercept 0.09, 94% HDI [0.00, 0.17]
1007-
treated[T.True] 2.48, 94% HDI [1.66, 3.27]
1037+
Intercept 0.09, 94% HDI [-0.00, 0.17]
1038+
treated[T.True] 2.45, 94% HDI [1.66, 3.28]
10081039
x 1.32, 94% HDI [1.14, 1.50]
1009-
x:treated[T.True] -3.12, 94% HDI [-4.17, -2.05]
1010-
sigma 0.35, 94% HDI [0.31, 0.41]
1040+
x:treated[T.True] -3.08, 94% HDI [-4.17, -2.05]
1041+
sigma 0.36, 94% HDI [0.31, 0.41]
10111042
"""
10121043

10131044
print(f"{self.expt_type:=^80}")
@@ -1182,7 +1213,7 @@ def plot(self):
11821213
def _causal_impact_summary_stat(self) -> str:
11831214
"""Computes the mean and 94% credible interval bounds for the causal impact."""
11841215
percentiles = self.causal_impact.quantile([0.03, 1 - 0.03]).values
1185-
ci = r"$CI_{94\%}$" + f"[{percentiles[0]:.2f}, {percentiles[1]:.2f}]"
1216+
ci = r"$CI_{94%}$" + f"[{percentiles[0]:.2f}, {percentiles[1]:.2f}]"
11861217
causal_impact = f"{self.causal_impact.mean():.2f}, "
11871218
return f"Causal impact = {causal_impact + ci}"
11881219

@@ -1192,14 +1223,31 @@ def summary(self) -> None:
11921223
11931224
Example
11941225
--------
1226+
>>> import causalpy as cp
1227+
>>> df = cp.load_data("anova1")
1228+
>>> seed = 42
1229+
>>> result = cp.pymc_experiments.PrePostNEGD(
1230+
... df,
1231+
... formula="post ~ 1 + C(group) + pre",
1232+
... group_variable_name="group",
1233+
... pretreatment_variable_name="pre",
1234+
... model=cp.pymc_models.LinearRegression(
1235+
... sample_kwargs={
1236+
... "target_accept": 0.95,
1237+
... "random_seed": seed,
1238+
... "progressbar": False,
1239+
... }
1240+
... )
1241+
... )
11951242
>>> result.summary()
1196-
=================Pretest/posttest Nonequivalent Group Design================
1243+
==================Pretest/posttest Nonequivalent Group Design===================
11971244
Formula: post ~ 1 + C(group) + pre
1245+
<BLANKLINE>
11981246
Results:
1199-
Causal impact = 1.89, $CI_{94%}$[1.70, 2.07]
1247+
Causal impact = 1.88, $CI_{94%}$[1.69, 2.07]
12001248
Model coefficients:
1201-
Intercept -0.46, 94% HDI [-1.17, 0.22]
1202-
C(group)[T.1] 1.89, 94% HDI [1.70, 2.07]
1249+
Intercept -0.47, 94% HDI [-1.16, 0.24]
1250+
C(group)[T.1] 1.88, 94% HDI [1.69, 2.07]
12031251
pre 1.05, 94% HDI [0.98, 1.12]
12041252
sigma 0.51, 94% HDI [0.46, 0.56]
12051253

causalpy/pymc_models.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ def fit(self, X, y, coords: Optional[Dict[str, Any]] = None) -> None:
9999
>>> model = MyToyModel(
100100
... sample_kwargs={"chains": 2, "draws": 2, "progressbar": False}
101101
... )
102-
>>> model.fit(X, y) # doctest: +ELLIPSIS
102+
>>> model.fit(X, y)
103103
Inference ...
104104
"""
105105
self.build_model(X, y, coords)
@@ -139,10 +139,10 @@ def predict(self, X):
139139
>>> model = MyToyModel(
140140
... sample_kwargs={"chains": 2, "draws": 2, "progressbar": False}
141141
... )
142-
>>> model.fit(X, y) # doctest: +ELLIPSIS
142+
>>> model.fit(X, y)
143143
Inference...
144144
>>> X_new = rng.normal(loc=0, scale=1, size=(20,2))
145-
>>> model.predict(X_new) # doctest: +ELLIPSIS
145+
>>> model.predict(X_new)
146146
Inference...
147147
"""
148148

@@ -177,17 +177,16 @@ def score(self, X, y) -> pd.Series:
177177
... mu = pm.Deterministic("mu", pm.math.dot(X_, beta))
178178
... pm.Normal("y_hat", mu=mu, sigma=sigma, observed=y_)
179179
>>> rng = np.random.default_rng(seed=42)
180-
>>> X = rng.normal(loc=0, scale=1, size=(20, 2))
181-
>>> y = rng.normal(loc=0, scale=1, size=(20,))
180+
>>> X = rng.normal(loc=0, scale=1, size=(200, 2))
181+
>>> y = rng.normal(loc=0, scale=1, size=(200,))
182182
>>> model = MyToyModel(
183-
... sample_kwargs={"chains": 2, "draws": 200, "progressbar": False}
183+
... sample_kwargs={"chains": 2, "draws": 2000, "progressbar": False}
184184
... )
185-
>>> model.fit(X, y) # doctest: +ELLIPSIS
185+
>>> model.fit(X, y)
186186
Inference...
187-
>>> model.score(X, y)
188-
Sampling: [y_hat]
189-
r2 0.376489
190-
r2_std 0.081305
187+
>>> round(model.score(X, y),2) # using round() to simplify doctest
188+
r2 0.34
189+
r2_std 0.02
191190
dtype: float64
192191
"""
193192
yhat = self.predict(X)
@@ -223,7 +222,8 @@ class WeightedSumFitter(ModelBuilder):
223222
>>> X = sc[['a', 'b', 'c', 'd', 'e', 'f', 'g']]
224223
>>> y = np.asarray(sc['actual']).reshape((sc.shape[0], 1))
225224
>>> wsf = WeightedSumFitter(sample_kwargs={"progressbar": False})
226-
>>> _ = wsf.fit(X,y)
225+
>>> wsf.fit(X,y)
226+
Inference ...
227227
"""
228228

229229
def build_model(self, X, y, coords):
@@ -279,7 +279,7 @@ class LinearRegression(ModelBuilder):
279279
... 'coeffs': ['x', 'treated'],
280280
... 'obs_indx': np.arange(rd.shape[0])
281281
... },
282-
... ) # doctest: +ELLIPSIS
282+
... )
283283
Inference...
284284
"""
285285

causalpy/skl_experiments.py

Lines changed: 31 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,6 @@ class PrePostFit(ExperimentalDesign):
5959
... formula="actual ~ 0 + a + b + c + d + e + f + g",
6060
... model = cp.skl_models.WeightedProportion()
6161
... )
62-
6362
"""
6463

6564
def __init__(
@@ -181,10 +180,18 @@ def get_coeffs(self):
181180
182181
Example
183182
--------
183+
>>> from sklearn.linear_model import LinearRegression
184+
>>> import causalpy as cp
185+
>>> df = cp.load_data("sc")
186+
>>> treatment_time = 70
187+
>>> result = cp.skl_experiments.PrePostFit(
188+
... df,
189+
... treatment_time,
190+
... formula="actual ~ 0 + a + b + c + d + e + f + g",
191+
... model = cp.skl_models.WeightedProportion()
192+
... )
184193
>>> result.get_coeffs()
185-
array([3.97370896e-01, 1.53881980e-01, 4.48747123e-01, 1.04639857e-16,
186-
0.00000000e+00, 0.00000000e+00, 2.92931287e-16])
187-
194+
array(...)
188195
"""
189196
return np.squeeze(self.model.coef_)
190197

@@ -262,7 +269,6 @@ class SyntheticControl(PrePostFit):
262269
... formula="actual ~ 0 + a + b + c + d + e + f + g",
263270
... model = cp.skl_models.WeightedProportion()
264271
... )
265-
266272
"""
267273

268274
def plot(self, plot_predictors=False, **kwargs):
@@ -293,21 +299,22 @@ class DifferenceInDifferences(ExperimentalDesign):
293299
:param group_variable_name:
294300
Name of the data column for the group variable
295301
:param model:
296-
A PyMC model for difference in differences
302+
An skl model for difference in differences
297303
298304
Example
299305
--------
306+
>>> import causalpy as cp
307+
>>> from sklearn.linear_model import LinearRegression
300308
>>> df = cp.load_data("did")
301309
>>> result = cp.skl_experiments.DifferenceInDifferences(
302-
... data,
310+
... df,
303311
... formula="y ~ 1 + group*post_treatment",
304312
... time_variable_name="t",
305313
... group_variable_name="group",
306314
... treated=1,
307315
... untreated=0,
308316
... model=LinearRegression(),
309317
... )
310-
311318
"""
312319

313320
def __init__(
@@ -497,14 +504,15 @@ class RegressionDiscontinuity(ExperimentalDesign):
497504
498505
Example
499506
--------
507+
>>> import causalpy as cp
508+
>>> from sklearn.linear_model import LinearRegression
500509
>>> data = cp.load_data("rd")
501510
>>> result = cp.skl_experiments.RegressionDiscontinuity(
502511
... data,
503512
... formula="y ~ 1 + x + treated",
504513
... model=LinearRegression(),
505514
... treatment_threshold=0.5,
506515
... )
507-
508516
"""
509517

510518
def __init__(
@@ -640,18 +648,27 @@ def summary(self):
640648
641649
Example
642650
--------
643-
>>> result.summary()
651+
>>> import causalpy as cp
652+
>>> from sklearn.linear_model import LinearRegression
653+
>>> data = cp.load_data("rd")
654+
>>> result = cp.skl_experiments.RegressionDiscontinuity(
655+
... data,
656+
... formula="y ~ 1 + x + treated",
657+
... model=LinearRegression(),
658+
... treatment_threshold=0.5,
659+
... )
660+
>>> result.summary() # doctest: +NORMALIZE_WHITESPACE
644661
Difference in Differences experiment
645662
Formula: y ~ 1 + x + treated
646663
Running variable: x
647664
Threshold on running variable: 0.5
665+
<BLANKLINE>
648666
Results:
649667
Discontinuity at threshold = 0.19
650668
Model coefficients:
651-
Intercept 0.0
652-
treated[T.True] 0.19034196317793994
653-
x 1.229600855360073
654-
669+
Intercept 0.0
670+
treated[T.True] 0.19034196317793994
671+
x 1.229600855360073
655672
"""
656673
print("Difference in Differences experiment")
657674
print(f"Formula: {self.formula}")

Comments (0)