pymc-labs
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
Lines changed: 7 additions & 0 deletions
diff --git a/AGENTS.md b/AGENTS.md
Lines changed: 10 additions & 0 deletions
diff --git a/causalpy/data/datasets.py b/causalpy/data/datasets.py
Lines changed: 17 additions & 4 deletions
diff --git a/causalpy/data/simulate_data.py b/causalpy/data/simulate_data.py
Lines changed: 72 additions & 37 deletions
@@ -48,3 +48,10 @@ repos:
         additional_dependencies:
           # Support pyproject.toml configuration
           - tomli
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.18.2
+    hooks:
+      - id: mypy
+        args: [--ignore-missing-imports]
+        files: ^causalpy/
+        additional_dependencies: [numpy>=1.20, pandas-stubs]
@@ -37,3 +37,13 @@
 - **Formulas**: Use patsy for formula parsing (via `dmatrices()`)
 - **Custom exceptions**: Use project-specific exceptions from `causalpy.custom_exceptions`: `FormulaException`, `DataException`, `BadIndexException`
 - **File organization**: Experiments in `causalpy/experiments/`, PyMC models in `causalpy/pymc_models.py`, scikit-learn models in `causalpy/skl_models.py`
+
+## Type Checking
+
+- **Tool**: MyPy
+- **Configuration**: Integrated as a pre-commit hook.
+- **Scope**: Checks Python files within the `causalpy/` directory.
+- **Settings**:
+    - `--ignore-missing-imports`: Enabled so that imports of third-party libraries without type stubs are not reported as errors, which allows gradual adoption of type hints.
+    - `additional_dependencies`: Includes `numpy` and `pandas-stubs` to provide type information for these libraries.
+- **Execution**: Runs automatically on commit; to run it manually, use `pre-commit run --all-files`.
@@ -43,15 +43,28 @@
 }
 
 
-def _get_data_home() -> pathlib.PosixPath:
+def _get_data_home() -> pathlib.Path:
     """Return the path of the data directory"""
     return pathlib.Path(cp.__file__).parents[1] / "causalpy" / "data"
 
 
-def load_data(dataset: str = None) -> pd.DataFrame:
-    """Loads the requested dataset and returns a pandas DataFrame.
+def load_data(dataset: str | None = None) -> pd.DataFrame:
+    """Load the requested dataset and return a pandas DataFrame.
 
-    :param dataset: The desired dataset to load
+    Parameters
+    ----------
+    dataset : str, optional
+        The desired dataset to load. If None, raises ValueError.
+
+    Returns
+    -------
+    pd.DataFrame
+        The loaded dataset as a pandas DataFrame.
+
+    Raises
+    ------
+    ValueError
+        If the requested dataset is not found.
     """
 
     if dataset in DATASETS:
 
@@ -20,16 +20,19 @@
 from scipy.stats import dirichlet, gamma, norm, uniform
 from statsmodels.nonparametric.smoothers_lowess import lowess
 
-default_lowess_kwargs = {"frac": 0.2, "it": 0}
-RANDOM_SEED = 8927
-rng = np.random.default_rng(RANDOM_SEED)
+default_lowess_kwargs: dict[str, float | int] = {"frac": 0.2, "it": 0}
+RANDOM_SEED: int = 8927
+rng: np.random.Generator = np.random.default_rng(RANDOM_SEED)
 
 
 def _smoothed_gaussian_random_walk(
-    gaussian_random_walk_mu, gaussian_random_walk_sigma, N, lowess_kwargs
-):
+    gaussian_random_walk_mu: float,
+    gaussian_random_walk_sigma: float,
+    N: int,
+    lowess_kwargs: dict,
+) -> tuple[np.ndarray, np.ndarray]:
     """
-    Generates Gaussian random walk data and applies LOWESS
+    Generates Gaussian random walk data and applies LOWESS.
 
     :param gaussian_random_walk_mu:
         Mean of the random walk
@@ -48,12 +51,12 @@ def _smoothed_gaussian_random_walk(
 
 
 def generate_synthetic_control_data(
-    N=100,
-    treatment_time=70,
-    grw_mu=0.25,
-    grw_sigma=1,
-    lowess_kwargs=default_lowess_kwargs,
-):
+    N: int = 100,
+    treatment_time: int = 70,
+    grw_mu: float = 0.25,
+    grw_sigma: float = 1,
+    lowess_kwargs: dict = default_lowess_kwargs,
+) -> tuple[pd.DataFrame, np.ndarray]:
     """
     Generates data for synthetic control example.
 
@@ -73,7 +76,6 @@ def generate_synthetic_control_data(
     >>> from causalpy.data.simulate_data import generate_synthetic_control_data
     >>> df, weightings_true = generate_synthetic_control_data(treatment_time=70)
     """
-
     # 1. Generate non-treated variables
     df = pd.DataFrame(
         {
@@ -108,8 +110,12 @@ def generate_synthetic_control_data(
 
 
 def generate_time_series_data(
-    N=100, treatment_time=70, beta_temp=-1, beta_linear=0.5, beta_intercept=3
-):
+    N: int = 100,
+    treatment_time: int = 70,
+    beta_temp: float = -1,
+    beta_linear: float = 0.5,
+    beta_intercept: float = 3,
+) -> pd.DataFrame:
     """
     Generates interrupted time series example data
 
@@ -155,7 +161,9 @@ def generate_time_series_data(
     return df
 
 
-def generate_time_series_data_seasonal(treatment_time):
+def generate_time_series_data_seasonal(
+    treatment_time: pd.Timestamp,
+) -> pd.DataFrame:
     """
     Generates 10 years of monthly data with seasonality
     """
@@ -169,11 +177,13 @@ def generate_time_series_data_seasonal(treatment_time):
         t=df.index,
     ).set_index("date", drop=True)
     month_effect = np.array([11, 13, 12, 15, 19, 23, 21, 28, 20, 17, 15, 12])
-    df["y"] = 0.2 * df["t"] + 2 * month_effect[df.month.values - 1]
+    df["y"] = 0.2 * df["t"] + 2 * month_effect[np.asarray(df.month.values) - 1]
 
     N = df.shape[0]
     idx = np.arange(N)[df.index > treatment_time]
-    df["causal effect"] = 100 * gamma(10).pdf(np.arange(0, N, 1) - np.min(idx))
+    df["causal effect"] = 100 * gamma(10).pdf(
+        np.array(np.arange(0, N, 1)) - int(np.min(idx))
+    )
 
     df["y"] += df["causal effect"]
     df["y"] += norm(0, 2).rvs(N)
@@ -183,7 +193,9 @@ def generate_time_series_data_seasonal(treatment_time):
     return df
 
 
-def generate_time_series_data_simple(treatment_time, slope=0.0):
+def generate_time_series_data_simple(
+    treatment_time: pd.Timestamp, slope: float = 0.0
+) -> pd.DataFrame:
     """Generate simple interrupted time series data, with no seasonality or temporal
     structure.
     """
@@ -205,7 +217,7 @@ def generate_time_series_data_simple(treatment_time, slope=0.0):
     return df
 
 
-def generate_did():
+def generate_did() -> pd.DataFrame:
     """
     Generate Difference in Differences data
 
@@ -223,8 +235,14 @@ def generate_did():
 
     # local functions
     def outcome(
-        t, control_intercept, treat_intercept_delta, trend, Δ, group, post_treatment
-    ):
+        t: np.ndarray,
+        control_intercept: float,
+        treat_intercept_delta: float,
+        trend: float,
+        Δ: float,
+        group: np.ndarray,
+        post_treatment: np.ndarray,
+    ) -> np.ndarray:
         """Compute the outcome of each unit"""
         return (
             control_intercept
@@ -244,21 +262,21 @@ def outcome(
     df["post_treatment"] = df["t"] > intervention_time
 
     df["y"] = outcome(
-        df["t"],
+        np.asarray(df["t"]),
         control_intercept,
         treat_intercept_delta,
         trend,
         Δ,
-        df["group"],
-        df["post_treatment"],
+        np.asarray(df["group"]),
+        np.asarray(df["post_treatment"]),
     )
     df["y"] += rng.normal(0, 0.1, df.shape[0])
     return df
 
 
 def generate_regression_discontinuity_data(
-    N=100, true_causal_impact=0.5, true_treatment_threshold=0.0
-):
+    N: int = 100, true_causal_impact: float = 0.5, true_treatment_threshold: float = 0.0
+) -> pd.DataFrame:
     """
     Generate regression discontinuity example data
 
@@ -272,12 +290,12 @@ def generate_regression_discontinuity_data(
     ... )  # doctest: +SKIP
     """
 
-    def is_treated(x):
+    def is_treated(x: np.ndarray) -> np.ndarray:
         """Check if x was treated"""
         return np.greater_equal(x, true_treatment_threshold)
 
-    def impact(x):
-        """Assign true_causal_impact to all treaated entries"""
+    def impact(x: np.ndarray) -> np.ndarray:
+        """Assign true_causal_impact to all treated entries"""
         y = np.zeros(len(x))
         y[is_treated(x)] = true_causal_impact
         return y
@@ -289,8 +307,11 @@ def impact(x):
 
 
 def generate_ancova_data(
-    N=200, pre_treatment_means=np.array([10, 12]), treatment_effect=2, sigma=1
-):
+    N: int = 200,
+    pre_treatment_means: np.ndarray = np.array([10, 12]),
+    treatment_effect: int = 2,
+    sigma: int = 1,
+) -> pd.DataFrame:
     """
     Generate ANCOVA example data
 
@@ -310,7 +331,7 @@ def generate_ancova_data(
     return df
 
 
-def generate_geolift_data():
+def generate_geolift_data() -> pd.DataFrame:
     """Generate synthetic data for a geolift example. This will consist of 6 untreated
     countries. The treated unit `Denmark` is a weighted combination of the untreated
     units. We additionally specify a treatment effect which takes effect after the
@@ -360,7 +381,7 @@ def generate_geolift_data():
     return df
 
 
-def generate_multicell_geolift_data():
+def generate_multicell_geolift_data() -> pd.DataFrame:
     """Generate synthetic data for a geolift example. This will consist of 6 untreated
     countries. The treated unit `Denmark` is a weighted combination of the untreated
     units. We additionally specify a treatment effect which takes effect after the
@@ -422,7 +443,9 @@ def generate_multicell_geolift_data():
 # -----------------
 
 
-def generate_seasonality(n=12, amplitude=1, length_scale=0.5):
+def generate_seasonality(
+    n: int = 12, amplitude: int = 1, length_scale: float = 0.5
+) -> np.ndarray:
     """Generate monthly seasonality by sampling from a Gaussian process with a
     Gaussian kernel, using numpy code"""
     # Generate the covariance matrix
@@ -436,14 +459,26 @@ def generate_seasonality(n=12, amplitude=1, length_scale=0.5):
     return seasonality
 
 
-def periodic_kernel(x1, x2, period=1, length_scale=1, amplitude=1):
+def periodic_kernel(
+    x1: np.ndarray,
+    x2: np.ndarray,
+    period: int = 1,
+    length_scale: float = 1.0,
+    amplitude: int = 1,
+) -> np.ndarray:
     """Generate a periodic kernel for gaussian process"""
     return amplitude**2 * np.exp(
         -2 * np.sin(np.pi * np.abs(x1 - x2) / period) ** 2 / length_scale**2
     )
 
 
-def create_series(n=52, amplitude=1, length_scale=2, n_years=4, intercept=3):
+def create_series(
+    n: int = 52,
+    amplitude: int = 1,
+    length_scale: int = 2,
+    n_years: int = 4,
+    intercept: int = 3,
+) -> np.ndarray:
     """
     Returns numpy tile with generated seasonality data repeated over
     multiple years