
Commit a316203

Merge branch 'instrumental-variables' into functional_form
2 parents: cd7ae1f + ac10df3

15 files changed (+233 additions, -72 deletions)


causal_testing/json_front/json_class.py

Lines changed: 6 additions & 6 deletions
@@ -208,12 +208,12 @@ def _setup_test(self, causal_test_case: CausalTestCase, estimator: Estimator) ->
         treatment_var = causal_test_case.treatment_variable
         minimal_adjustment_set = minimal_adjustment_set - {treatment_var}
         estimation_model = estimator(
-            (treatment_var.name,),
-            causal_test_case.treatment_value,
-            causal_test_case.control_value,
-            minimal_adjustment_set,
-            (causal_test_case.outcome_variable.name,),
-            causal_test_engine.scenario_execution_data_df,
+            treatment=treatment_var.name,
+            treatment_value=causal_test_case.treatment_value,
+            control_value=causal_test_case.control_value,
+            adjustment_set=minimal_adjustment_set,
+            outcome=causal_test_case.outcome_variable.name,
+            df=causal_test_engine.scenario_execution_data_df,
             effect_modifiers=causal_test_case.effect_modifier_configuration,
         )

causal_testing/specification/causal_dag.py

Lines changed: 30 additions & 0 deletions
@@ -138,6 +138,36 @@ def __init__(self, dot_path: str = None, **attr):
         if not self.is_acyclic():
             raise nx.HasACycle("Invalid Causal DAG: contains a cycle.")

+    def check_iv_assumptions(self, treatment, outcome, instrument) -> bool:
+        """
+        Checks the three instrumental variable assumptions, raising a
+        ValueError if any are violated.
+
+        :return Boolean True if the three IV assumptions hold.
+        """
+        # (i) Instrument is associated with treatment
+        if nx.d_separated(self.graph, {instrument}, {treatment}, set()):
+            raise ValueError(f"Instrument {instrument} is not associated with treatment {treatment} in the DAG")
+
+        # (ii) Instrument does not affect outcome except through its potential effect on treatment
+        if not all([treatment in path for path in nx.all_simple_paths(self.graph, source=instrument, target=outcome)]):
+            raise ValueError(
+                f"Instrument {instrument} affects the outcome {outcome} other than through the treatment {treatment}"
+            )
+
+        # (iii) Instrument and outcome do not share causes
+        if any(
+            [
+                cause
+                for cause in self.graph.nodes
+                if list(nx.all_simple_paths(self.graph, source=cause, target=instrument))
+                and list(nx.all_simple_paths(self.graph, source=cause, target=outcome))
+            ]
+        ):
+            raise ValueError(f"Instrument {instrument} and outcome {outcome} share common causes")
+
+        return True
+
     def add_edge(self, u_of_edge: Node, v_of_edge: Node, **attr):
         """Add an edge to the causal DAG.

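For orientation, a minimal usage sketch of the new check (not part of the commit). It assumes that constructing CausalDAG() without a dot_path yields an empty DAG, as the dot_path: str = None default suggests, and uses a toy graph with instrument Z, treatment X, outcome Y and a confounder U:

from causal_testing.specification.causal_dag import CausalDAG

dag = CausalDAG()          # assumed: no dot_path gives an empty DAG
dag.add_edge("Z", "X")     # (i) the instrument Z is associated with the treatment X
dag.add_edge("X", "Y")
dag.add_edge("U", "X")     # U confounds X and Y but does not reach Z,
dag.add_edge("U", "Y")     # so (ii) and (iii) still hold

assert dag.check_iv_assumptions(treatment="X", outcome="Y", instrument="Z")

dag.add_edge("U", "Z")     # now Z and Y share the common cause U, so the same call
# would raise ValueError("Instrument Z and outcome Y share common causes")
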
causal_testing/testing/causal_test_engine.py

Lines changed: 2 additions & 2 deletions
@@ -89,11 +89,11 @@ def execute_test_suite(self, test_suite: CausalTestSuite) -> list[CausalTestResu
             treatment_value = test.treatment_value
             control_value = test.control_value
             estimator = estimator_class(
-                (treatment_variable.name,),
+                treatment_variable.name,
                 treatment_value,
                 control_value,
                 minimal_adjustment_set,
-                (test.outcome_variable.name,),
+                test.outcome_variable.name,
             )
             if estimator.df is None:
                 estimator.df = self.scenario_execution_data_df

causal_testing/testing/causal_test_result.py

Lines changed: 4 additions & 0 deletions
@@ -83,12 +83,16 @@ def ci_low(self):
         """Return the lower bracket of the confidence intervals."""
         if not self.confidence_intervals:
             return None
+        if any([x is None for x in self.confidence_intervals]):
+            return None
         return min(self.confidence_intervals)

     def ci_high(self):
         """Return the higher bracket of the confidence intervals."""
         if not self.confidence_intervals:
             return None
+        if any([x is None for x in self.confidence_intervals]):
+            return None
         return max(self.confidence_intervals)

     def summary(self):

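The new guard matters because some estimators report missing confidence intervals as (None, None) — the InstrumentalVariableEstimator added below does exactly this — and min()/max() over a tuple containing None raises a TypeError. A standalone illustration of the pattern (not the library's own API):

def ci_low(confidence_intervals):
    """Mirror of the guarded accessor: None when intervals are absent or incomplete."""
    if not confidence_intervals:
        return None
    if any(x is None for x in confidence_intervals):
        return None
    return min(confidence_intervals)

print(ci_low((None, None)))  # None, rather than a TypeError from min()
print(ci_low((0.1, 0.9)))    # 0.1
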
causal_testing/testing/estimators.py

Lines changed: 79 additions & 25 deletions
@@ -135,20 +135,20 @@ def _run_logistic_regression(self, data) -> RegressionResultsWrapper:
         """
         # 1. Reduce dataframe to contain only the necessary columns
         reduced_df = data.copy()
-        necessary_cols = list(self.treatment) + list(self.adjustment_set) + list(self.outcome)
+        necessary_cols = [self.treatment] + list(self.adjustment_set) + [self.outcome]
         missing_rows = reduced_df[necessary_cols].isnull().any(axis=1)
         reduced_df = reduced_df[~missing_rows]
-        reduced_df = reduced_df.sort_values(list(self.treatment))
+        reduced_df = reduced_df.sort_values([self.treatment])
         logger.debug(reduced_df[necessary_cols])

         # 2. Add intercept
         reduced_df["Intercept"] = self.intercept

         # 3. Estimate the unit difference in outcome caused by unit difference in treatment
-        cols = list(self.treatment)
+        cols = [self.treatment]
         cols += [x for x in self.adjustment_set if x not in cols]
         treatment_and_adjustments_cols = reduced_df[cols + ["Intercept"]]
-        outcome_col = reduced_df[list(self.outcome)]
+        outcome_col = reduced_df[[self.outcome]]
         for col in treatment_and_adjustments_cols:
             if str(treatment_and_adjustments_cols.dtypes[col]) == "object":
                 treatment_and_adjustments_cols = pd.get_dummies(
@@ -167,7 +167,7 @@ def estimate(self, data: pd.DataFrame) -> RegressionResultsWrapper:
         self.model = model

         x = pd.DataFrame()
-        x[self.treatment[0]] = [self.treatment_value, self.control_value]
+        x[self.treatment] = [self.treatment_value, self.control_value]
         x["Intercept"] = self.intercept
         for k, v in self.effect_modifiers.items():
             x[k] = v
@@ -240,7 +240,7 @@ def estimate_ate(self, bootstrap_size=100) -> float:
         ci_high = bootstraps[bootstrap_size - bound]

         logger.info(
-            f"Changing {self.treatment[0]} from {self.control_value} to {self.treatment_value} gives an estimated "
+            f"Changing {self.treatment} from {self.control_value} to {self.treatment_value} gives an estimated "
             f"ATE of {ci_low} < {estimate} < {ci_high}"
         )
         assert ci_low < estimate < ci_high, f"Expecting {ci_low} < {estimate} < {ci_high}"
@@ -270,7 +270,7 @@ def estimate_risk_ratio(self, bootstrap_size=100) -> float:
         ci_high = bootstraps[bootstrap_size - bound]

         logger.info(
-            f"Changing {self.treatment[0]} from {self.control_value} to {self.treatment_value} gives an estimated "
+            f"Changing {self.treatment} from {self.control_value} to {self.treatment_value} gives an estimated "
             f"risk ratio of {ci_low} < {estimate} < {ci_high}"
         )
         assert ci_low < estimate < ci_high, f"Expecting {ci_low} < {estimate} < {ci_high}"
@@ -284,7 +284,7 @@ def estimate_unit_odds_ratio(self) -> float:
         :return: The odds ratio. Confidence intervals are not yet supported.
         """
         model = self._run_logistic_regression(self.df)
-        return np.exp(model.params[self.treatment[0]])
+        return np.exp(model.params[self.treatment])


 class LinearRegressionEstimator(Estimator):
@@ -385,7 +385,7 @@ def estimate_unit_ate(self) -> float:
         :return: The unit average treatment effect and the 95% Wald confidence intervals.
         """
         model = self._run_linear_regression()
-        unit_effect = model.params[list(self.treatment)].values[0]  # Unit effect is the coefficient of the treatment
+        unit_effect = model.params[[self.treatment]].values[0]  # Unit effect is the coefficient of the treatment
         [ci_low, ci_high] = self._get_confidence_intervals(model)

         return unit_effect * self.treatment_value - unit_effect * self.control_value, [ci_low, ci_high]
@@ -409,8 +409,8 @@ def estimate_ate(self) -> tuple[float, list[float, float], float]:

         # It is ABSOLUTELY CRITICAL that these go last, otherwise we can't index
         # the effect with "ate = t_test_results.effect[0]"
-        individuals.loc["control", list(self.treatment)] = self.control_value
-        individuals.loc["treated", list(self.treatment)] = self.treatment_value
+        individuals.loc["control", [self.treatment]] = self.control_value
+        individuals.loc["treated", [self.treatment]] = self.treatment_value

         # Perform a t-test to compare the predicted outcome of the control and treated individual (ATE)
         t_test_results = model.t_test(individuals.loc["treated"] - individuals.loc["control"])
@@ -431,7 +431,7 @@ def estimate_control_treatment(self, adjustment_config: dict = None) -> tuple[pd
         self.model = model

         x = pd.DataFrame()
-        x[self.treatment[0]] = [self.treatment_value, self.control_value]
+        x[self.treatment] = [self.treatment_value, self.control_value]
         x["Intercept"] = 1#self.intercept
         for k, v in adjustment_config.items():
             x[k] = v
@@ -487,7 +487,7 @@ def estimate_cates(self) -> tuple[float, list[float, float]]:
             self.effect_modifiers
         ), f"Must have at least one effect modifier to compute CATE - {self.effect_modifiers}."
         x = pd.DataFrame()
-        x[self.treatment[0]] = [self.treatment_value, self.control_value]
+        x[self.treatment] = [self.treatment_value, self.control_value]
         x["Intercept"] = 1#self.intercept
         for k, v in self.effect_modifiers.items():
             self.adjustment_set.add(k)
@@ -513,20 +513,20 @@ def _run_linear_regression(self) -> RegressionResultsWrapper:
         """
         # 1. Reduce dataframe to contain only the necessary columns
         reduced_df = self.df.copy()
-        necessary_cols = list(self.treatment) + list(self.adjustment_set) + list(self.outcome)
+        necessary_cols = [self.treatment] + list(self.adjustment_set) + [self.outcome]
         missing_rows = reduced_df[necessary_cols].isnull().any(axis=1)
         reduced_df = reduced_df[~missing_rows]
-        reduced_df = reduced_df.sort_values(list(self.treatment))
+        reduced_df = reduced_df.sort_values([self.treatment])
         logger.debug(reduced_df[necessary_cols])

         # 2. Add intercept
         reduced_df["Intercept"] = 1#self.intercept

         # 3. Estimate the unit difference in outcome caused by unit difference in treatment
-        cols = list(self.treatment)
+        cols = [self.treatment]
         cols += [x for x in self.adjustment_set if x not in cols]
         treatment_and_adjustments_cols = reduced_df[cols + ["Intercept"]]
-        outcome_col = reduced_df[list(self.outcome)]
+        outcome_col = reduced_df[[self.outcome]]
         for col in treatment_and_adjustments_cols:
             if str(treatment_and_adjustments_cols.dtypes[col]) == "object":
                 treatment_and_adjustments_cols = pd.get_dummies(
@@ -539,12 +539,66 @@ def _run_linear_regression(self) -> RegressionResultsWrapper:
     def _get_confidence_intervals(self, model):
         confidence_intervals = model.conf_int(alpha=0.05, cols=None)
         ci_low, ci_high = (
-            confidence_intervals[0][list(self.treatment)],
-            confidence_intervals[1][list(self.treatment)],
+            confidence_intervals[0][[self.treatment]],
+            confidence_intervals[1][[self.treatment]],
         )
         return [ci_low.values[0], ci_high.values[0]]


+class InstrumentalVariableEstimator(Estimator):
+    """
+    Carry out estimation using instrumental variable adjustment rather than conventional adjustment. This means we do
+    not need to observe all confounders in order to adjust for them. A key assumption here is linearity.
+    """
+
+    def __init__(
+        self,
+        treatment: str,
+        treatment_value: float,
+        control_value: float,
+        adjustment_set: set,
+        outcome: str,
+        instrument: str,
+        df: pd.DataFrame = None,
+        intercept: int = 1,
+        effect_modifiers: dict = None  # Not used (yet?). Needed for compatibility
+    ):
+        super().__init__(treatment, treatment_value, control_value, adjustment_set, outcome, df, None)
+        self.intercept = intercept
+        self.model = None
+        self.instrument = instrument
+
+    def add_modelling_assumptions(self):
+        """
+        Add modelling assumptions to the estimator. This is a list of strings which list the modelling assumptions that
+        must hold if the resulting causal inference is to be considered valid.
+        """
+        self.modelling_assumptions += """The instrument and the treatment, and the treatment and the outcome must be
+        related linearly in the form Y = aX + b."""
+        self.modelling_assumptions += """The three IV conditions must hold
+            (i) Instrument is associated with treatment
+            (ii) Instrument does not affect outcome except through its potential effect on treatment
+            (iii) Instrument and outcome do not share causes
+        """
+
+    def estimate_coefficient(self):
+        """
+        Estimate the linear regression coefficient of the treatment on the outcome.
+        """
+        # Estimate the total effect of instrument I on outcome Y = abI + c1
+        ab = sm.OLS(self.df[self.outcome], self.df[[self.instrument]]).fit().params[self.instrument]
+
+        # Estimate the direct effect of instrument I on treatment X = aI + c1
+        a = sm.OLS(self.df[self.treatment], self.df[[self.instrument]]).fit().params[self.instrument]
+
+        # Estimate the coefficient of the treatment on the outcome by cancelling: (ab) / a
+        return ab / a
+
+    def estimate_ate(self):
+        return (self.treatment_value - self.control_value) * self.estimate_coefficient(), (None, None)
+
+
 class CausalForestEstimator(Estimator):
     """A causal random forest estimator is a non-parametric estimator which recursively partitions the covariate space
     to learn a low-dimensional representation of treatment effect heterogeneity. This form of estimator is best suited
@@ -566,7 +620,7 @@ def estimate_ate(self) -> float:
         """
         # Remove any NA containing rows
         reduced_df = self.df.copy()
-        necessary_cols = list(self.treatment) + list(self.adjustment_set) + list(self.outcome)
+        necessary_cols = [self.treatment] + list(self.adjustment_set) + [self.outcome]
         missing_rows = reduced_df[necessary_cols].isnull().any(axis=1)
         reduced_df = reduced_df[~missing_rows]

@@ -577,8 +631,8 @@ def estimate_ate(self) -> float:
         else:
             effect_modifier_df = reduced_df[list(self.adjustment_set)]
         confounders_df = reduced_df[list(self.adjustment_set)]
-        treatment_df = np.ravel(reduced_df[list(self.treatment)])
-        outcome_df = np.ravel(reduced_df[list(self.outcome)])
+        treatment_df = np.ravel(reduced_df[[self.treatment]])
+        outcome_df = np.ravel(reduced_df[[self.outcome]])

         # Fit the model to the data using a gradient boosting regressor for both the treatment and outcome model
         model = CausalForestDML(
@@ -606,7 +660,7 @@ def estimate_cates(self) -> pd.DataFrame:

         # Remove any NA containing rows
         reduced_df = self.df.copy()
-        necessary_cols = list(self.treatment) + list(self.adjustment_set) + list(self.outcome)
+        necessary_cols = [self.treatment] + list(self.adjustment_set) + [self.outcome]
         missing_rows = reduced_df[necessary_cols].isnull().any(axis=1)
         reduced_df = reduced_df[~missing_rows]

@@ -620,8 +674,8 @@ def estimate_cates(self) -> pd.DataFrame:
             confounders_df = reduced_df[list(self.adjustment_set)]
         else:
             confounders_df = None
-        treatment_df = reduced_df[list(self.treatment)]
-        outcome_df = reduced_df[list(self.outcome)]
+        treatment_df = reduced_df[[self.treatment]]
+        outcome_df = reduced_df[[self.outcome]]

         # Fit a model to the data
         model = CausalForestDML(model_y=GradientBoostingRegressor(), model_t=GradientBoostingRegressor())

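The new estimator is the classic ratio (Wald) estimator: the instrument-to-outcome coefficient ab divided by the instrument-to-treatment coefficient a recovers the coefficient of the treatment on the outcome, even when the confounders of treatment and outcome are unobserved. A usage sketch on synthetic data (illustrative only; the column names, data-generating process, and the expected value of roughly 2 are assumptions, while the constructor arguments match the signature added above):

import numpy as np
import pandas as pd
from causal_testing.testing.estimators import InstrumentalVariableEstimator

# Synthetic data (assumed for illustration): Z -> X -> Y with an unobserved confounder U.
rng = np.random.default_rng(0)
n = 1000
z = rng.normal(size=n)                   # instrument
u = rng.normal(size=n)                   # unobserved confounder of X and Y
x = 0.5 * z + u + rng.normal(size=n)     # treatment
y = 2.0 * x + u + rng.normal(size=n)     # outcome; true unit effect of X on Y is 2
df = pd.DataFrame({"Z": z, "X": x, "Y": y})

iv_estimator = InstrumentalVariableEstimator(
    treatment="X",
    treatment_value=1,
    control_value=0,
    adjustment_set=set(),
    outcome="Y",
    instrument="Z",
    df=df,
)
ate, (ci_low, ci_high) = iv_estimator.estimate_ate()
print(ate)               # close to 2.0, despite the unobserved confounding
print(ci_low, ci_high)   # (None, None) -- confidence intervals are not computed yet
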
examples/.gitignore

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+*.pdf

examples/covasim_/doubling_beta/causal_test_beta.py

Lines changed: 6 additions & 6 deletions
@@ -47,19 +47,19 @@ def doubling_beta_CATE_on_csv(observational_data_path: str, simulate_counterfact
     past_execution_df = pd.read_csv(observational_data_path)
     _, causal_test_engine, causal_test_case = engine_setup(observational_data_path)

-    linear_regression_estimator = LinearRegressionEstimator(('beta',), 0.032, 0.016,
+    linear_regression_estimator = LinearRegressionEstimator('beta', 0.032, 0.016,
                                                             {'avg_age', 'contacts'},  # We use custom adjustment set
-                                                            ('cum_infections',),
+                                                            'cum_infections',
                                                             df=past_execution_df)

     # Add squared terms for beta, since it has a quadratic relationship with cumulative infections
     linear_regression_estimator.add_squared_term_to_df('beta')
     causal_test_result = causal_test_engine.execute_test(linear_regression_estimator, causal_test_case, 'ate')

     # Repeat for association estimate (no adjustment)
-    no_adjustment_linear_regression_estimator = LinearRegressionEstimator(('beta',), 0.032, 0.016,
+    no_adjustment_linear_regression_estimator = LinearRegressionEstimator('beta', 0.032, 0.016,
                                                                           set(),
-                                                                          ('cum_infections',),
+                                                                          'cum_infections',
                                                                           df=past_execution_df)
     no_adjustment_linear_regression_estimator.add_squared_term_to_df('beta')
     association_test_result = causal_test_engine.execute_test(no_adjustment_linear_regression_estimator, causal_test_case, 'ate')
@@ -79,9 +79,9 @@ def doubling_beta_CATE_on_csv(observational_data_path: str, simulate_counterfact
     # Repeat causal inference after deleting all rows with treatment value to obtain counterfactual inferences
     if simulate_counterfactuals:
         counterfactual_past_execution_df = past_execution_df[past_execution_df['beta'] != 0.032]
-        counterfactual_linear_regression_estimator = LinearRegressionEstimator(('beta',), 0.032, 0.016,
+        counterfactual_linear_regression_estimator = LinearRegressionEstimator('beta', 0.032, 0.016,
                                                                                {'avg_age', 'contacts'},
-                                                                               ('cum_infections',),
+                                                                               'cum_infections',
                                                                                df=counterfactual_past_execution_df)
         counterfactual_linear_regression_estimator.add_squared_term_to_df('beta')
         counterfactual_causal_test_result = causal_test_engine.execute_test(linear_regression_estimator, causal_test_case, 'ate')

examples/covasim_/vaccinating_elderly/causal_test_vaccine.py

Lines changed: 2 additions & 2 deletions
@@ -85,9 +85,9 @@ def experimental_causal_test_vaccinate_elderly(runs_per_test_per_config: int = 3
     minimal_adjustment_set = causal_dag.identification(base_test_case)

     # 9. Build statistical model
-    linear_regression_estimator = LinearRegressionEstimator((vaccine.name,), 1, 0,
+    linear_regression_estimator = LinearRegressionEstimator(vaccine.name, 1, 0,
                                                             minimal_adjustment_set,
-                                                            (outcome_variable.name,))
+                                                            outcome_variable.name)

     # 10. Execute test and save results in dict
     causal_test_result = causal_test_engine.execute_test(linear_regression_estimator, causal_test_case, 'ate')
