IV estimation

jmafoster1 · jmafoster1 · commit 48b2c6887319 · 2023-02-21T15:44:44.000Z
diff --git a/causal_testing/json_front/json_class.py b/causal_testing/json_front/json_class.py
@@ -208,12 +208,12 @@ def _setup_test(self, causal_test_case: CausalTestCase, estimator: Estimator) ->
         treatment_var = causal_test_case.treatment_variable
         minimal_adjustment_set = minimal_adjustment_set - {treatment_var}
         estimation_model = estimator(
-            (treatment_var.name,),
-            causal_test_case.treatment_value,
-            causal_test_case.control_value,
-            minimal_adjustment_set,
-            (causal_test_case.outcome_variable.name,),
-            causal_test_engine.scenario_execution_data_df,
+            treatment=treatment_var.name,
+            treatment_value=causal_test_case.treatment_value,
+            control_value=causal_test_case.control_value,
+            adjustment_set=minimal_adjustment_set,
+            outcome=causal_test_case.outcome_variable.name,
+            df=causal_test_engine.scenario_execution_data_df,
             effect_modifiers=causal_test_case.effect_modifier_configuration,
         )
 
diff --git a/causal_testing/testing/causal_test_engine.py b/causal_testing/testing/causal_test_engine.py
@@ -89,11 +89,11 @@ def execute_test_suite(self, test_suite: CausalTestSuite) -> list[CausalTestResu
                     treatment_value = test.treatment_value
                     control_value = test.control_value
                     estimator = estimator_class(
-                        (treatment_variable.name,),
+                        treatment_variable.name,
                         treatment_value,
                         control_value,
                         minimal_adjustment_set,
-                        (test.outcome_variable.name,),
+                        test.outcome_variable.name,
                     )
                     if estimator.df is None:
                         estimator.df = self.scenario_execution_data_df
diff --git a/causal_testing/testing/causal_test_result.py b/causal_testing/testing/causal_test_result.py
@@ -83,12 +83,16 @@ def ci_low(self):
         """Return the lower bracket of the confidence intervals."""
         if not self.confidence_intervals:
             return None
+        if any([x is None for x in self.confidence_intervals]):
+            return None
         return min(self.confidence_intervals)
 
     def ci_high(self):
         """Return the higher bracket of the confidence intervals."""
         if not self.confidence_intervals:
             return None
+        if any([x is None for x in self.confidence_intervals]):
+            return None
         return max(self.confidence_intervals)
 
     def summary(self):
diff --git a/causal_testing/testing/estimators.py b/causal_testing/testing/estimators.py
@@ -36,11 +36,11 @@ class Estimator(ABC):
 
     def __init__(
         self,
-        treatment: tuple,
+        treatment: str,
         treatment_value: float,
         control_value: float,
         adjustment_set: set,
-        outcome: tuple,
+        outcome: str,
         df: pd.DataFrame = None,
         effect_modifiers: dict[Variable:Any] = None,
     ):
@@ -93,11 +93,11 @@ class LogisticRegressionEstimator(Estimator):
 
     def __init__(
         self,
-        treatment: tuple,
+        treatment: str,
         treatment_value: float,
         control_value: float,
         adjustment_set: set,
-        outcome: tuple,
+        outcome: str,
         df: pd.DataFrame = None,
         effect_modifiers: dict[Variable:Any] = None,
         intercept: int = 1,
@@ -133,20 +133,20 @@ def _run_logistic_regression(self, data) -> RegressionResultsWrapper:
         """
         # 1. Reduce dataframe to contain only the necessary columns
         reduced_df = data.copy()
-        necessary_cols = list(self.treatment) + list(self.adjustment_set) + list(self.outcome)
+        necessary_cols = [self.treatment] + list(self.adjustment_set) + [self.outcome]
         missing_rows = reduced_df[necessary_cols].isnull().any(axis=1)
         reduced_df = reduced_df[~missing_rows]
-        reduced_df = reduced_df.sort_values(list(self.treatment))
+        reduced_df = reduced_df.sort_values([self.treatment])
         logger.debug(reduced_df[necessary_cols])
 
         # 2. Add intercept
         reduced_df["Intercept"] = self.intercept
 
         # 3. Estimate the unit difference in outcome caused by unit difference in treatment
-        cols = list(self.treatment)
+        cols = [self.treatment]
         cols += [x for x in self.adjustment_set if x not in cols]
         treatment_and_adjustments_cols = reduced_df[cols + ["Intercept"]]
-        outcome_col = reduced_df[list(self.outcome)]
+        outcome_col = reduced_df[[self.outcome]]
         for col in treatment_and_adjustments_cols:
             if str(treatment_and_adjustments_cols.dtypes[col]) == "object":
                 treatment_and_adjustments_cols = pd.get_dummies(
@@ -165,7 +165,7 @@ def estimate(self, data: pd.DataFrame) -> RegressionResultsWrapper:
         self.model = model
 
         x = pd.DataFrame()
-        x[self.treatment[0]] = [self.treatment_value, self.control_value]
+        x[self.treatment] = [self.treatment_value, self.control_value]
         x["Intercept"] = self.intercept
         for k, v in self.effect_modifiers.items():
             x[k] = v
@@ -238,7 +238,7 @@ def estimate_ate(self, bootstrap_size=100) -> float:
         ci_high = bootstraps[bootstrap_size - bound]
 
         logger.info(
-            f"Changing {self.treatment[0]} from {self.control_value} to {self.treatment_value} gives an estimated "
+            f"Changing {self.treatment} from {self.control_value} to {self.treatment_value} gives an estimated "
             f"ATE of {ci_low} < {estimate} < {ci_high}"
         )
         assert ci_low < estimate < ci_high, f"Expecting {ci_low} < {estimate} < {ci_high}"
@@ -268,7 +268,7 @@ def estimate_risk_ratio(self, bootstrap_size=100) -> float:
         ci_high = bootstraps[bootstrap_size - bound]
 
         logger.info(
-            f"Changing {self.treatment[0]} from {self.control_value} to {self.treatment_value} gives an estimated "
+            f"Changing {self.treatment} from {self.control_value} to {self.treatment_value} gives an estimated "
             f"risk ratio of {ci_low} < {estimate} < {ci_high}"
         )
         assert ci_low < estimate < ci_high, f"Expecting {ci_low} < {estimate} < {ci_high}"
@@ -282,7 +282,7 @@ def estimate_unit_odds_ratio(self) -> float:
         :return: The odds ratio. Confidence intervals are not yet supported.
         """
         model = self._run_logistic_regression(self.df)
-        return np.exp(model.params[self.treatment[0]])
+        return np.exp(model.params[self.treatment])
 
 
 class LinearRegressionEstimator(Estimator):
@@ -292,11 +292,11 @@ class LinearRegressionEstimator(Estimator):
 
     def __init__(
         self,
-        treatment: tuple,
+        treatment: str,
         treatment_value: float,
         control_value: float,
         adjustment_set: set,
-        outcome: tuple,
+        outcome: str,
         df: pd.DataFrame = None,
         effect_modifiers: dict[Variable:Any] = None,
         product_terms: list[tuple[Variable, Variable]] = None,
@@ -383,7 +383,7 @@ def estimate_unit_ate(self) -> float:
         :return: The unit average treatment effect and the 95% Wald confidence intervals.
         """
         model = self._run_linear_regression()
-        unit_effect = model.params[list(self.treatment)].values[0]  # Unit effect is the coefficient of the treatment
+        unit_effect = model.params[[self.treatment]].values[0]  # Unit effect is the coefficient of the treatment
         [ci_low, ci_high] = self._get_confidence_intervals(model)
 
         return unit_effect * self.treatment_value - unit_effect * self.control_value, [ci_low, ci_high]
@@ -407,8 +407,8 @@ def estimate_ate(self) -> tuple[float, list[float, float], float]:
 
         # It is ABSOLUTELY CRITICAL that these go last, otherwise we can't index
         # the effect with "ate = t_test_results.effect[0]"
-        individuals.loc["control", list(self.treatment)] = self.control_value
-        individuals.loc["treated", list(self.treatment)] = self.treatment_value
+        individuals.loc["control", [self.treatment]] = self.control_value
+        individuals.loc["treated", [self.treatment]] = self.treatment_value
 
         # Perform a t-test to compare the predicted outcome of the control and treated individual (ATE)
         t_test_results = model.t_test(individuals.loc["treated"] - individuals.loc["control"])
@@ -429,7 +429,7 @@ def estimate_control_treatment(self, adjustment_config: dict = None) -> tuple[pd
         self.model = model
 
         x = pd.DataFrame()
-        x[self.treatment[0]] = [self.treatment_value, self.control_value]
+        x[self.treatment] = [self.treatment_value, self.control_value]
         x["Intercept"] = self.intercept
         for k, v in adjustment_config.items():
             x[k] = v
@@ -485,7 +485,7 @@ def estimate_cates(self) -> tuple[float, list[float, float]]:
             self.effect_modifiers
         ), f"Must have at least one effect modifier to compute CATE - {self.effect_modifiers}."
         x = pd.DataFrame()
-        x[self.treatment[0]] = [self.treatment_value, self.control_value]
+        x[self.treatment] = [self.treatment_value, self.control_value]
         x["Intercept"] = self.intercept
         for k, v in self.effect_modifiers.items():
             self.adjustment_set.add(k)
@@ -511,20 +511,20 @@ def _run_linear_regression(self) -> RegressionResultsWrapper:
         """
         # 1. Reduce dataframe to contain only the necessary columns
         reduced_df = self.df.copy()
-        necessary_cols = list(self.treatment) + list(self.adjustment_set) + list(self.outcome)
+        necessary_cols = [self.treatment] + list(self.adjustment_set) + [self.outcome]
         missing_rows = reduced_df[necessary_cols].isnull().any(axis=1)
         reduced_df = reduced_df[~missing_rows]
-        reduced_df = reduced_df.sort_values(list(self.treatment))
+        reduced_df = reduced_df.sort_values([self.treatment])
         logger.debug(reduced_df[necessary_cols])
 
         # 2. Add intercept
         reduced_df["Intercept"] = self.intercept
 
         # 3. Estimate the unit difference in outcome caused by unit difference in treatment
-        cols = list(self.treatment)
+        cols = [self.treatment]
         cols += [x for x in self.adjustment_set if x not in cols]
         treatment_and_adjustments_cols = reduced_df[cols + ["Intercept"]]
-        outcome_col = reduced_df[list(self.outcome)]
+        outcome_col = reduced_df[[self.outcome]]
         for col in treatment_and_adjustments_cols:
             if str(treatment_and_adjustments_cols.dtypes[col]) == "object":
                 treatment_and_adjustments_cols = pd.get_dummies(
@@ -537,8 +537,8 @@ def _run_linear_regression(self) -> RegressionResultsWrapper:
     def _get_confidence_intervals(self, model):
         confidence_intervals = model.conf_int(alpha=0.05, cols=None)
         ci_low, ci_high = (
-            confidence_intervals[0][list(self.treatment)],
-            confidence_intervals[1][list(self.treatment)],
+            confidence_intervals[0][[self.treatment]],
+            confidence_intervals[1][[self.treatment]],
         )
         return [ci_low.values[0], ci_high.values[0]]
 
@@ -551,20 +551,22 @@ class InstrumentalVariableEstimator(Estimator):
 
     def __init__(
         self,
-        treatment: tuple,
+        treatment: str,
         treatment_value: float,
         control_value: float,
         adjustment_set: set,
-        outcome: tuple,
+        outcome: str,
         instrument: str,
         df: pd.DataFrame = None,
         intercept: int = 1,
+        effect_modifiers: dict=None # Currently unused (None is passed to the base class); accepted so callers can pass the same keyword arguments as for the other estimators
     ):
         super().__init__(treatment, treatment_value, control_value, adjustment_set, outcome, df, None)
         self.intercept = intercept
         self.model = None
         self.instrument = instrument
 
+
     def add_modelling_assumptions(self):
         """
         Add modelling assumptions to the estimator. This is a list of strings which list the modelling assumptions that
@@ -582,7 +584,6 @@ def estimate_coefficient(self):
         """
         Estimate the linear regression coefficient of the treatment on the outcome.
         """
-
         # Estimate the total effect of instrument I on outcome Y = abI + c1
         ab = sm.OLS(self.df[self.outcome], self.df[[self.instrument]]).fit().params[self.instrument]
 
@@ -617,7 +618,7 @@ def estimate_ate(self) -> float:
         """
         # Remove any NA containing rows
         reduced_df = self.df.copy()
-        necessary_cols = list(self.treatment) + list(self.adjustment_set) + list(self.outcome)
+        necessary_cols = [self.treatment] + list(self.adjustment_set) + [self.outcome]
         missing_rows = reduced_df[necessary_cols].isnull().any(axis=1)
         reduced_df = reduced_df[~missing_rows]
 
@@ -628,8 +629,8 @@ def estimate_ate(self) -> float:
         else:
             effect_modifier_df = reduced_df[list(self.adjustment_set)]
         confounders_df = reduced_df[list(self.adjustment_set)]
-        treatment_df = np.ravel(reduced_df[list(self.treatment)])
-        outcome_df = np.ravel(reduced_df[list(self.outcome)])
+        treatment_df = np.ravel(reduced_df[[self.treatment]])
+        outcome_df = np.ravel(reduced_df[[self.outcome]])
 
         # Fit the model to the data using a gradient boosting regressor for both the treatment and outcome model
         model = CausalForestDML(
@@ -657,7 +658,7 @@ def estimate_cates(self) -> pd.DataFrame:
 
         # Remove any NA containing rows
         reduced_df = self.df.copy()
-        necessary_cols = list(self.treatment) + list(self.adjustment_set) + list(self.outcome)
+        necessary_cols = [self.treatment] + list(self.adjustment_set) + [self.outcome]
         missing_rows = reduced_df[necessary_cols].isnull().any(axis=1)
         reduced_df = reduced_df[~missing_rows]
 
@@ -671,8 +672,8 @@ def estimate_cates(self) -> pd.DataFrame:
             confounders_df = reduced_df[list(self.adjustment_set)]
         else:
             confounders_df = None
-        treatment_df = reduced_df[list(self.treatment)]
-        outcome_df = reduced_df[list(self.outcome)]
+        treatment_df = reduced_df[[self.treatment]]
+        outcome_df = reduced_df[[self.outcome]]
 
         # Fit a model to the data
         model = CausalForestDML(model_y=GradientBoostingRegressor(), model_t=GradientBoostingRegressor())
diff --git a/tests/testing_tests/test_causal_test_engine.py b/tests/testing_tests/test_causal_test_engine.py
diff --git a/tests/testing_tests/test_estimators.py b/tests/testing_tests/test_estimators.py