Skip to content

Commit 4cab835

Browse files
authored
Merge pull request #119 from pymc-labs/test-data-loading
Add more tests: loading of package datasets and integration tests of pymc examples
2 parents 7b83c45 + 164368d commit 4cab835

File tree

7 files changed

+305
-77
lines changed

README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,6 @@ df = (
4949
cp.load_data("drinking")
5050
.rename(columns={"agecell": "age"})
5151
.assign(treated=lambda df_: df_.age > 21)
52-
.dropna(axis=0)
5352
)
5453

5554
# Run the analysis

causalpy/data/drinking.csv

Lines changed: 49 additions & 51 deletions
Large diffs are not rendered by default.

causalpy/tests/test_data_loading.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
import pandas as pd
2+
import pytest
3+
4+
import causalpy as cp
5+
6+
# Dataset names fed to ``cp.load_data`` by the parametrized test below.
tests = ["banks", "brexit", "covid", "did", "drinking", "its", "its simple", "rd", "sc"]


@pytest.mark.parametrize("dataset_name", tests)
def test_data_loading(dataset_name):
    """Load one packaged dataset and check it is a gap-free DataFrame."""
    loaded = cp.load_data(dataset_name)
    assert isinstance(loaded, pd.DataFrame)
    # Sum NaN flags per column, then across columns; any gap makes this non-zero.
    missing_total = loaded.isnull().sum().sum()
    assert missing_total == 0
causalpy/tests/test_integration_pymc_examples.py

Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
import pandas as pd
2+
import pytest
3+
4+
import causalpy as cp
5+
6+
# Sampler settings handed to every PyMC model built in the tests below; the
# tests also compare the posterior's chain and draw counts against these values.
sample_kwargs = {"tune": 20, "draws": 20, "chains": 2, "cores": 2}
7+
8+
9+
@pytest.mark.integration
def test_did():
    """Run a PyMC difference-in-differences experiment on the "did" dataset.

    Verifies the type of the returned experiment object and that the fitted
    posterior has the chain and draw counts requested in ``sample_kwargs``.
    """
    df = cp.load_data("did")
    model = cp.pymc_models.LinearRegression(sample_kwargs=sample_kwargs)
    result = cp.pymc_experiments.DifferenceInDifferences(
        df,
        formula="y ~ 1 + group + t + treated:group",
        time_variable_name="t",
        group_variable_name="group",
        treated=1,
        untreated=0,
        prediction_model=model,
    )

    assert isinstance(df, pd.DataFrame)
    assert isinstance(result, cp.pymc_experiments.DifferenceInDifferences)

    # The sampler must have produced exactly what was asked for.
    posterior_coords = result.prediction_model.idata.posterior.coords
    assert len(posterior_coords["chain"]) == sample_kwargs["chains"]
    assert len(posterior_coords["draw"]) == sample_kwargs["draws"]
31+
32+
33+
@pytest.mark.integration
def test_did_banks():
    """Run a PyMC difference-in-differences experiment on the "banks" dataset.

    The wide per-district columns are reduced to yearly medians, reshaped to
    long format, and restricted to 1930 and 1931 before fitting. Verifies the
    result type and the posterior's chain and draw counts.
    """
    district_names = {"bib6": "Sixth District", "bib8": "Eighth District"}
    df = (
        cp.load_data("banks")
        .filter(items=["bib6", "bib8", "year"])
        .rename(columns=district_names)
        .groupby("year")
        .median()
        .reset_index(level=0)
    )

    # One row per (year, district) pair, ordered by year.
    df_long = df.melt(
        id_vars=["year"],
        value_vars=["Sixth District", "Eighth District"],
        var_name="district",
        value_name="bib",
    ).sort_values("year")
    df_long["district"] = df_long["district"].astype("category")
    df_long["unit"] = df_long["district"]

    # Treatment applies only to the Sixth District from 1931 onwards.
    from_1931 = df_long.year >= 1931
    in_sixth_district = df_long.district == "Sixth District"
    df_long["treated"] = from_1931 & in_sixth_district

    two_year_window = df_long[df_long.year.isin([1930, 1931])]
    model = cp.pymc_models.LinearRegression(sample_kwargs=sample_kwargs)
    result = cp.pymc_experiments.DifferenceInDifferences(
        two_year_window,
        formula="bib ~ 1 + district + year + district:treated",
        time_variable_name="year",
        group_variable_name="district",
        treated="Sixth District",
        untreated="Eighth District",
        prediction_model=model,
    )

    assert isinstance(df, pd.DataFrame)
    assert isinstance(result, cp.pymc_experiments.DifferenceInDifferences)

    posterior_coords = result.prediction_model.idata.posterior.coords
    assert len(posterior_coords["chain"]) == sample_kwargs["chains"]
    assert len(posterior_coords["draw"]) == sample_kwargs["draws"]
72+
73+
74+
@pytest.mark.integration
def test_rd():
    """Run a PyMC regression-discontinuity experiment on the "rd" dataset.

    Uses a spline term in the formula and a treatment threshold of 0.5.
    Verifies the result type and the posterior's chain and draw counts.
    """
    df = cp.load_data("rd")
    model = cp.pymc_models.LinearRegression(sample_kwargs=sample_kwargs)
    result = cp.pymc_experiments.RegressionDiscontinuity(
        df,
        formula="y ~ 1 + bs(x, df=6) + treated",
        prediction_model=model,
        treatment_threshold=0.5,
    )

    assert isinstance(df, pd.DataFrame)
    assert isinstance(result, cp.pymc_experiments.RegressionDiscontinuity)

    posterior_coords = result.prediction_model.idata.posterior.coords
    assert len(posterior_coords["chain"]) == sample_kwargs["chains"]
    assert len(posterior_coords["draw"]) == sample_kwargs["draws"]
93+
94+
95+
@pytest.mark.integration
def test_rd_drinking():
    """Run a PyMC regression-discontinuity experiment on the "drinking" dataset.

    The ``agecell`` column is renamed to ``age`` and a boolean ``treated``
    column marks rows with age above 21, which is also the threshold passed to
    the experiment. Verifies the result type and the posterior's chain and
    draw counts.
    """
    renamed = cp.load_data("drinking").rename(columns={"agecell": "age"})
    df = renamed.assign(treated=renamed.age > 21)

    model = cp.pymc_models.LinearRegression(sample_kwargs=sample_kwargs)
    result = cp.pymc_experiments.RegressionDiscontinuity(
        df,
        formula="all ~ 1 + age + treated",
        running_variable_name="age",
        prediction_model=model,
        treatment_threshold=21,
    )

    assert isinstance(df, pd.DataFrame)
    assert isinstance(result, cp.pymc_experiments.RegressionDiscontinuity)

    posterior_coords = result.prediction_model.idata.posterior.coords
    assert len(posterior_coords["chain"]) == sample_kwargs["chains"]
    assert len(posterior_coords["draw"]) == sample_kwargs["draws"]
119+
120+
121+
@pytest.mark.integration
def test_its():
    """Fit the "its" dataset through ``cp.pymc_experiments.SyntheticControl``.

    The ``date`` column is parsed to datetimes and used as the index; the
    treatment time is 2017-01-01. Verifies the result type and the posterior's
    chain and draw counts.
    """
    df = (
        cp.load_data("its")
        .assign(date=lambda frame: pd.to_datetime(frame["date"]))
        .set_index("date")
    )
    treatment_time = pd.to_datetime("2017-01-01")

    model = cp.pymc_models.LinearRegression(sample_kwargs=sample_kwargs)
    result = cp.pymc_experiments.SyntheticControl(
        df,
        treatment_time,
        formula="y ~ 1 + t + C(month)",
        prediction_model=model,
    )

    assert isinstance(df, pd.DataFrame)
    assert isinstance(result, cp.pymc_experiments.SyntheticControl)

    posterior_coords = result.prediction_model.idata.posterior.coords
    assert len(posterior_coords["chain"]) == sample_kwargs["chains"]
    assert len(posterior_coords["draw"]) == sample_kwargs["draws"]
143+
144+
145+
@pytest.mark.integration
def test_its_covid():
    """Fit the "covid" dataset through ``cp.pymc_experiments.SyntheticControl``.

    The ``date`` column is parsed to datetimes and used as the index; the
    treatment time is 2020-01-01. Verifies the result type and the posterior's
    chain and draw counts.
    """
    raw = cp.load_data("covid")
    raw["date"] = pd.to_datetime(raw["date"])
    df = raw.set_index("date")
    treatment_time = pd.to_datetime("2020-01-01")

    # Kept as one literal so the formula stays searchable; hence the long line.
    covid_formula = "standardize(deaths) ~ 0 + standardize(t) + C(month) + standardize(temp)"  # noqa: E501
    model = cp.pymc_models.LinearRegression(sample_kwargs=sample_kwargs)
    result = cp.pymc_experiments.SyntheticControl(
        df,
        treatment_time,
        formula=covid_formula,
        prediction_model=model,
    )

    assert isinstance(df, pd.DataFrame)
    assert isinstance(result, cp.pymc_experiments.SyntheticControl)

    posterior_coords = result.prediction_model.idata.posterior.coords
    assert len(posterior_coords["chain"]) == sample_kwargs["chains"]
    assert len(posterior_coords["draw"]) == sample_kwargs["draws"]
167+
168+
169+
@pytest.mark.integration
def test_sc():
    """Run a PyMC synthetic-control experiment on the "sc" dataset.

    Uses ``WeightedSumFitter`` with columns ``a`` to ``g`` as predictors of
    ``actual`` and a treatment time of 70. Verifies the result type and the
    posterior's chain and draw counts.
    """
    df = cp.load_data("sc")
    treatment_time = 70

    model = cp.pymc_models.WeightedSumFitter(sample_kwargs=sample_kwargs)
    result = cp.pymc_experiments.SyntheticControl(
        df,
        treatment_time,
        formula="actual ~ 0 + a + b + c + d + e + f + g",
        prediction_model=model,
    )

    assert isinstance(df, pd.DataFrame)
    assert isinstance(result, cp.pymc_experiments.SyntheticControl)

    posterior_coords = result.prediction_model.idata.posterior.coords
    assert len(posterior_coords["chain"]) == sample_kwargs["chains"]
    assert len(posterior_coords["draw"]) == sample_kwargs["draws"]
189+
190+
191+
@pytest.mark.integration
def test_sc_brexit():
    """Run a PyMC synthetic-control experiment on the "brexit" dataset.

    The ``Time`` column becomes a datetime index, rows are limited to those
    whose index compares greater than "2009", and four country columns are
    dropped. The UK column is then modelled as a weighted sum of every
    remaining column, with 24 June 2016 as the treatment time. Verifies the
    result type and the posterior's chain and draw counts.
    """
    df = cp.load_data("brexit")
    df["Time"] = pd.to_datetime(df["Time"])
    df.set_index("Time", inplace=True)
    # The mask is label-based (built from the index), so select with .loc.
    df = df.loc[df.index > "2009", :]
    treatment_time = pd.to_datetime("2016 June 24")
    df = df.drop(["Japan", "Italy", "US", "Spain"], axis=1)

    target_country = "UK"
    # Index.difference leaves every column except the target as a predictor.
    other_countries = list(df.columns.difference({target_country}))
    formula = target_country + " ~ " + "0 + " + " + ".join(other_countries)

    result = cp.pymc_experiments.SyntheticControl(
        df,
        treatment_time,
        formula=formula,
        prediction_model=cp.pymc_models.WeightedSumFitter(sample_kwargs=sample_kwargs),
    )

    assert isinstance(df, pd.DataFrame)
    assert isinstance(result, cp.pymc_experiments.SyntheticControl)
    assert (
        len(result.prediction_model.idata.posterior.coords["chain"])
        == sample_kwargs["chains"]
    )
    assert (
        len(result.prediction_model.idata.posterior.coords["draw"])
        == sample_kwargs["draws"]
    )

causalpy/tests/test_integration_skl_examples.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@ def test_rd_drinking():
2626
cp.load_data("drinking")
2727
.rename(columns={"agecell": "age"})
2828
.assign(treated=lambda df_: df_.age > 21)
29-
.dropna(axis=0)
3029
)
3130
result = cp.skl_experiments.RegressionDiscontinuity(
3231
df,

docs/index.rst

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@ Quickstart
3737
cp.load_data("drinking")
3838
.rename(columns={"agecell": "age"})
3939
.assign(treated=lambda df_: df_.age > 21)
40-
.dropna(axis=0)
4140
)
4241
4342
# Run the analysis

docs/notebooks/rd_pymc_drinking.ipynb

Lines changed: 22 additions & 23 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)