Skip to content

Commit d5f0dad

Browse files
authored
Merge pull request #290 from CITCOM-project/gp-formulae
GP formulae
2 parents bfd0534 + ed832da commit d5f0dad

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+1924
-1512
lines changed

.github/workflows/lint-format.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,9 @@ jobs:
2525

2626
- name: Archive production artifacts
2727
if: ${{ success() }} || ${{ failure() }}
28-
uses: actions/upload-artifact@v2
28+
uses: actions/upload-artifact@v3
2929
with:
3030
name: MegaLinter reports
3131
path: |
3232
megalinter-reports
33-
mega-linter.log
33+
mega-linter.log

.pylintrc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -371,8 +371,8 @@ min-public-methods=2
371371
[EXCEPTIONS]
372372

373373
# Exceptions that will emit a warning when caught.
374-
overgeneral-exceptions=BaseException,
375-
Exception
374+
overgeneral-exceptions=builtins.BaseException,
375+
builtins.Exception
376376

377377

378378
[FORMAT]
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
"""This module contains the Estimator abstract class"""
2+
3+
import logging
4+
from abc import ABC, abstractmethod
5+
from typing import Any
6+
7+
import pandas as pd
8+
9+
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
10+
11+
12+
class Estimator(ABC):
    # pylint: disable=too-many-instance-attributes
    """An estimator contains all of the information necessary to compute a causal estimate for the effect of changing
    a set of treatment variables to a set of values.

    All estimators are expected to provide the following two methods:

    1) add_modelling_assumptions: The validity of a model-assisted causal inference result depends on whether
    the modelling assumptions imposed by a model actually hold. Therefore, for each model, it is important to state
    the modelling assumptions upon which the validity of the results depends. To achieve this, the estimator object
    maintains a list of modelling assumptions (as strings). If a user wishes to implement their own estimator, they
    must implement this method and add all assumptions to the list of modelling assumptions.

    2) estimate_ate: All estimators must be capable of returning the average treatment effect as a minimum. That is, the
    average effect of the intervention (changing treatment from control to treated value) on the outcome of interest
    adjusted for all confounders.

    Note: only ``add_modelling_assumptions`` is declared abstract in this class; ``estimate_ate`` is a convention that
    is not enforced here.
    """

    def __init__(
        # pylint: disable=too-many-arguments
        self,
        treatment: str,
        treatment_value: float,
        control_value: float,
        adjustment_set: set,
        outcome: str,
        df: pd.DataFrame = None,
        effect_modifiers: dict[str, Any] = None,
        alpha: float = 0.05,
        query: str = "",
    ):
        """
        Store the estimation configuration and collect the modelling assumptions.

        :param treatment: Name of the treatment variable.
        :param treatment_value: Value of the treatment variable under treatment.
        :param control_value: Value of the treatment variable under control.
        :param adjustment_set: The set of variables to adjust for.
        :param outcome: Name of the outcome variable.
        :param df: The data to estimate from. May be None.
        :param effect_modifiers: Mapping of effect modifiers to their values. Defaults to an empty dict.
        :param alpha: Stored on the instance as `self.alpha`; not otherwise used by this class.
        :param query: Optional query string. If non-empty, `df` is filtered with `DataFrame.query` and the query
                      itself is recorded as the first modelling assumption.
        """
        self.treatment = treatment
        self.treatment_value = treatment_value
        self.control_value = control_value
        self.adjustment_set = adjustment_set
        self.outcome = outcome
        self.alpha = alpha
        # Only filter when there is both a query and data to filter: `df` defaults to None, and calling
        # `.query` on None would raise an unhelpful AttributeError.
        self.df = df.query(query) if query and df is not None else df

        self.effect_modifiers = {} if effect_modifiers is None else effect_modifiers
        self.modelling_assumptions = []
        if query:
            # Restricting the data is itself an assumption on which the result depends.
            self.modelling_assumptions.append(query)
        # Subclasses append their own assumptions; this runs after the attributes above have been set.
        self.add_modelling_assumptions()
        logger.debug("Effect Modifiers: %s", self.effect_modifiers)

    @abstractmethod
    def add_modelling_assumptions(self):
        """
        Add modelling assumptions to the estimator. This is a list of strings which list the modelling assumptions that
        must hold if the resulting causal inference is to be considered valid.
        """

    def compute_confidence_intervals(self) -> list[float]:
        """
        Estimate the 95% Wald confidence intervals for the effect of changing the treatment from control values to
        treatment values on the outcome.

        The implementation in this base class has no body (it returns None); subclasses are expected to override it.

        :return: 95% Wald confidence intervals.
        """
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
"""This module contains the RegressionEstimator, which is an abstract class for concrete regression estimators."""
2+
3+
import logging
4+
from typing import Any
5+
from abc import abstractmethod
6+
7+
import pandas as pd
8+
from statsmodels.regression.linear_model import RegressionResultsWrapper
9+
from patsy import dmatrix # pylint: disable = no-name-in-module
10+
11+
from causal_testing.specification.variable import Variable
12+
from causal_testing.estimation.abstract_estimator import Estimator
13+
14+
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
15+
16+
17+
class RegressionEstimator(Estimator):
    """A Linear Regression Estimator is a parametric estimator which restricts the variables in the data to a linear
    combination of parameters and functions of the variables (note these functions need not be linear).
    """

    def __init__(
        # pylint: disable=too-many-arguments
        self,
        treatment: str,
        treatment_value: float,
        control_value: float,
        adjustment_set: set,
        outcome: str,
        df: pd.DataFrame = None,
        effect_modifiers: dict[Variable, Any] = None,
        formula: str = None,
        alpha: float = 0.05,
        query: str = "",
    ):
        """
        Configure the estimator and build the regression formula.

        :param treatment: Name of the treatment variable.
        :param treatment_value: Value of the treatment variable under treatment.
        :param control_value: Value of the treatment variable under control.
        :param adjustment_set: The variables to adjust for. None is treated as empty when building the formula.
        :param outcome: Name of the outcome variable.
        :param df: The data to fit the regression to.
        :param effect_modifiers: Mapping of effect modifiers to their values.
        :param formula: Formula to use. If None, one is generated as
                        `outcome ~ treatment + <sorted adjustment set> + <sorted effect modifiers>`.
        :param alpha: Forwarded to the base class, which stores it as `self.alpha`.
        :param query: Optional query string forwarded to the base class.
        """
        super().__init__(
            treatment=treatment,
            treatment_value=treatment_value,
            control_value=control_value,
            adjustment_set=adjustment_set,
            outcome=outcome,
            df=df,
            effect_modifiers=effect_modifiers,
            # Previously omitted, which silently replaced the caller's alpha with the base-class default.
            alpha=alpha,
            query=query,
        )

        self.model = None
        if effect_modifiers is None:
            effect_modifiers = []
        if adjustment_set is None:
            adjustment_set = []
        if formula is not None:
            self.formula = formula
        else:
            # Sorting keeps the generated formula deterministic regardless of set/dict ordering.
            terms = [treatment] + sorted(adjustment_set) + sorted(effect_modifiers)
            self.formula = f"{outcome} ~ {'+'.join(terms)}"

    @property
    @abstractmethod
    def regressor(self):
        """
        The regressor to use, e.g. ols or logit.
        This should be a property accessible with self.regressor.
        Define as `regressor = ...` outside of __init__, not as `self.regressor = ...`, otherwise
        you'll get a "cannot instantiate with abstract method" error.
        """

    def add_modelling_assumptions(self):
        """
        Add modelling assumptions to the estimator. This is a list of strings which list the modelling assumptions that
        must hold if the resulting causal inference is to be considered valid.
        """
        # Trailing spaces matter: adjacent string literals are concatenated with no separator.
        self.modelling_assumptions.append(
            "The variables in the data must fit a shape which can be expressed as a linear "
            "combination of parameters and functions of variables. Note that these functions "
            "do not need to be linear."
        )

    def _run_regression(self, data=None) -> RegressionResultsWrapper:
        """Fit `self.regressor` with `self.formula` to the data, store the fitted model on `self.model`, and return it.

        :param data: The data to fit to, defaults to `self.df`.

        :return: The model after fitting to data.
        """
        if data is None:
            data = self.df
        model = self.regressor(formula=self.formula, data=data).fit(disp=0)
        self.model = model
        return model

    def _predict(self, data=None, adjustment_config: dict = None) -> pd.DataFrame:
        """Estimate the outcomes under control and treatment.

        :param data: The data to use, defaults to `self.df`. Controllable for bootstrap sampling.
        :param adjustment_config: The values of the adjustment variables to use.

        :return: The prediction summary frame returned by `model.get_prediction(x).summary_frame()`. The first row
                 corresponds to the treatment value and the second to the control value.
        """
        if adjustment_config is None:
            adjustment_config = {}

        model = self._run_regression(data)

        # Build the two-row input (treatment, control) on an empty frame with the same columns as self.df.
        x = pd.DataFrame(columns=self.df.columns)
        x["Intercept"] = 1  # self.intercept
        x[self.treatment] = [self.treatment_value, self.control_value]

        for k, v in adjustment_config.items():
            x[k] = v
        for k, v in self.effect_modifiers.items():
            x[k] = v
        # Expand the right-hand side of the formula into a design matrix.
        x = dmatrix(self.formula.split("~")[1], x, return_type="dataframe")
        for col in x:
            if str(x.dtypes[col]) == "object":
                x = pd.get_dummies(x, columns=[col], drop_first=True)

        # This has to be here in case the treatment variable is in an I(...) block in the self.formula
        x[self.treatment] = [self.treatment_value, self.control_value]
        return model.get_prediction(x).summary_frame()
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
"""This module contains the CubicSplineRegressionEstimator class, for estimating
2+
continuous outcomes with changes in behaviour"""
3+
4+
import logging
5+
from typing import Any
6+
7+
import pandas as pd
8+
9+
from causal_testing.specification.variable import Variable
10+
from causal_testing.estimation.linear_regression_estimator import LinearRegressionEstimator
11+
12+
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
13+
14+
15+
class CubicSplineRegressionEstimator(LinearRegressionEstimator):
    """A Cubic Spline Regression Estimator is a parametric estimator which restricts the variables in the data to a
    combination of parameters and basis functions of the variables.
    """

    def __init__(
        # pylint: disable=too-many-arguments
        self,
        treatment: str,
        treatment_value: float,
        control_value: float,
        adjustment_set: set,
        outcome: str,
        basis: int,
        df: pd.DataFrame = None,
        effect_modifiers: dict[Variable, Any] = None,
        formula: str = None,
        alpha: float = 0.05,
        expected_relationship=None,
    ):
        """
        Configure the estimator and, unless a formula is supplied, build a `cr(...)` spline formula.

        :param treatment: Name of the treatment variable.
        :param treatment_value: Value of the treatment variable under treatment.
        :param control_value: Value of the treatment variable under control.
        :param adjustment_set: The variables to adjust for. None is treated as empty when building the formula.
        :param outcome: Name of the outcome variable.
        :param basis: Passed as the `df` argument of `cr(...)` in the generated formula.
        :param df: The data to fit the regression to.
        :param effect_modifiers: Mapping of effect modifiers to their values.
        :param formula: Formula to use. If None, `outcome ~ cr(<terms>, df=basis)` is generated.
        :param alpha: Forwarded to the parent class.
        :param expected_relationship: Stored on the instance as `self.expected_relationship`.
        """
        super().__init__(
            treatment, treatment_value, control_value, adjustment_set, outcome, df, effect_modifiers, formula, alpha
        )

        self.expected_relationship = expected_relationship

        if effect_modifiers is None:
            effect_modifiers = []
        # Tolerate a missing adjustment set rather than failing in sorted(None).
        if adjustment_set is None:
            adjustment_set = []

        if formula is None:
            # Replaces whatever formula the parent generated with a cubic regression spline over the same terms.
            terms = [treatment] + sorted(adjustment_set) + sorted(effect_modifiers)
            self.formula = f"{outcome} ~ cr({'+'.join(terms)}, df={basis})"

    def estimate_ate_calculated(self, adjustment_config: dict = None) -> pd.Series:
        """Estimate the ate effect of the treatment on the outcome. That is, the change in outcome caused
        by changing the treatment variable from the control value to the treatment value. Here, we actually
        calculate the expected outcomes under control and treatment and subtract the control outcome from the
        treatment outcome. This allows for custom terms to be put in such as squares, inverses, products, etc.

        :param adjustment_config: The configuration of the adjustment set as a dict mapping variable names to
                                  their values. N.B. Every variable in the adjustment set MUST have a value in
                                  order to estimate the outcome under control and treatment.

        :return: The average treatment effect.
        """
        model = self._run_regression()

        # Inputs for the prediction under treatment.
        x = {"Intercept": 1, self.treatment: self.treatment_value}
        if adjustment_config is not None:
            x.update(adjustment_config)
        if self.effect_modifiers is not None:
            x.update(self.effect_modifiers)

        treatment = model.predict(x).iloc[0]

        # Same inputs with only the treatment variable switched to its control value.
        x[self.treatment] = self.control_value
        control = model.predict(x).iloc[0]

        return pd.Series(treatment - control)

0 commit comments

Comments
 (0)