Extra coverage

jmafoster1 · jmafoster1 · commit 8f32552bf929 · 2023-01-24T11:58:26.000Z
diff --git a/causal_testing/specification/variable.py b/causal_testing/specification/variable.py
@@ -201,7 +201,6 @@ def typestring(self) -> str:
         """
         return type(self).__name__
 
-    @abstractmethod
     def copy(self, name: str = None) -> Variable:
         """Return a new instance of the Variable with the given name, or with
         the original name if no name is supplied.
@@ -211,26 +210,18 @@ def copy(self, name: str = None) -> Variable:
         :rtype: Variable
 
         """
-        raise NotImplementedError("Method `copy` must be instantiated.")
+        if name:
+            return self.__class__(name, self.datatype, self.distribution)
+        return self.__class__(self.name, self.datatype, self.distribution)
 
 
 class Input(Variable):
     """An extension of the Variable class representing inputs."""
 
-    def copy(self, name=None) -> Input:
-        if name:
-            return Input(name, self.datatype, self.distribution)
-        return Input(self.name, self.datatype, self.distribution)
-
 
 class Output(Variable):
     """An extension of the Variable class representing outputs."""
 
-    def copy(self, name=None) -> Output:
-        if name:
-            return Output(name, self.datatype, self.distribution)
-        return Output(self.name, self.datatype, self.distribution)
-
 
 class Meta(Variable):
     """An extension of the Variable class representing metavariables. These are variables which are relevant to the
@@ -250,8 +241,3 @@ class Meta(Variable):
     def __init__(self, name: str, datatype: T, populate: Callable[[DataFrame], DataFrame]):
         super().__init__(name, datatype)
         self.populate = populate
-
-    def copy(self, name=None) -> Meta:
-        if name:
-            return Meta(name, self.datatype, self.distribution)
-        return Meta(self.name, self.datatype, self.distribution)
diff --git a/causal_testing/testing/estimators.py b/causal_testing/testing/estimators.py
@@ -57,6 +57,7 @@ def __init__(
         else:
             raise ValueError(f"Unsupported type for effect_modifiers {effect_modifiers}. Expected iterable")
         self.modelling_assumptions = []
+        self.add_modelling_assumptions()
         logger.debug("Effect Modifiers: %s", self.effect_modifiers)
 
     @abstractmethod
@@ -403,18 +404,23 @@ def estimate_ate(self) -> tuple[float, list[float, float], float]:
         confidence_intervals = list(t_test_results.conf_int().flatten())
         return ate, confidence_intervals
 
-    def estimate_control_treatment(self) -> tuple[pd.Series, pd.Series]:
+    def estimate_control_treatment(self, adjustment_config: dict = None) -> tuple[pd.Series, pd.Series]:
         """Estimate the outcomes under control and treatment.
 
         :return: The estimated outcome under control and treatment in the form
         (control_outcome, treatment_outcome).
         """
+        if adjustment_config is None:
+            adjustment_config = dict()
+
         model = self._run_linear_regression()
         self.model = model
 
         x = pd.DataFrame()
         x[self.treatment[0]] = [self.treatment_values, self.control_values]
         x["Intercept"] = self.intercept
+        for k, v in adjustment_config.items():
+            x[k] = v
         for k, v in self.effect_modifiers.items():
             x[k] = v
         for t in self.square_terms:
@@ -443,16 +449,15 @@ def estimate_risk_ratio(self) -> tuple[float, list[float, float]]:
 
         return (treatment_outcome["mean"] / control_outcome["mean"]), [ci_low, ci_high]
 
-    def estimate_ate_calculated(self) -> tuple[float, list[float, float]]:
+    def estimate_ate_calculated(self, adjustment_config: dict = None) -> tuple[float, list[float, float]]:
         """Estimate the ate effect of the treatment on the outcome. That is, the change in outcome caused
         by changing the treatment variable from the control value to the treatment value. Here, we actually
         calculate the expected outcomes under control and treatment and divide one by the other. This
         allows for custom terms to be put in such as squares, inverses, products, etc.
 
         :return: The average treatment effect and the 95% Wald confidence intervals.
         """
-        control_outcome, treatment_outcome = self.estimate_control_treatment()
-        assert False
+        control_outcome, treatment_outcome = self.estimate_control_treatment(adjustment_config=adjustment_config)
         ci_low = treatment_outcome["mean_ci_lower"] - control_outcome["mean_ci_upper"]
         ci_high = treatment_outcome["mean_ci_upper"] - control_outcome["mean_ci_lower"]
 
diff --git a/tests/data_collection_tests/test_observational_data_collector.py b/tests/data_collection_tests/test_observational_data_collector.py
@@ -5,43 +5,54 @@
 from causal_testing.specification.causal_specification import Scenario
 from causal_testing.specification.variable import Input, Output, Meta
 from scipy.stats import uniform, rv_discrete
+from enum import Enum
+import random
 from tests.test_helpers import create_temp_dir_if_non_existent, remove_temp_dir_if_existent
 
 
 class TestObservationalDataCollector(unittest.TestCase):
     def setUp(self) -> None:
+        class Color(Enum):
+            RED = "RED"
+            GREEN = "GREEN"
+            BLUE = "BLUE"
+
         temp_dir_path = create_temp_dir_if_non_existent()
         self.dag_dot_path = os.path.join(temp_dir_path, "dag.dot")
         self.observational_df_path = os.path.join(temp_dir_path, "observational_data.csv")
         # Y = 3*X1 + X2*X3 + 10
-        self.observational_df = pd.DataFrame({"X1": [1, 2, 3, 4], "X2": [5, 6, 7, 8], "X3": [10, 20, 30, 40]})
-        self.observational_df["Y"] = self.observational_df.apply(
+        self.observational_df = pd.DataFrame(
+            {"X1": [1, 2, 3, 4], "X2": [5, 6, 7, 8], "X3": [10, 20, 30, 40], "Y2": ["RED", "GREEN", "BLUE", "BLUE"]}
+        )
+        self.observational_df["Y1"] = self.observational_df.apply(
             lambda row: (3 * row.X1) + (row.X2 * row.X3) + 10, axis=1
         )
         self.observational_df.to_csv(self.observational_df_path)
+        self.observational_df["Y2"] = [Color[x] for x in self.observational_df["Y2"]]
         self.X1 = Input("X1", int, uniform(1, 4))
         self.X2 = Input("X2", int, rv_discrete(values=([7], [1])))
         self.X3 = Input("X3", int, uniform(10, 40))
         self.X4 = Input("X4", int, rv_discrete(values=([10], [1])))
-        self.Y = Output("Y", int)
+        self.Y1 = Output("Y1", int)
+        self.Y2 = Output("Y2", Color)
 
     def test_not_all_variables_in_data(self):
         scenario = Scenario({self.X1, self.X2, self.X3, self.X4})
         observational_data_collector = ObservationalDataCollector(scenario, self.observational_df_path)
         self.assertRaises(IndexError, observational_data_collector.collect_data)
 
     def test_all_variables_in_data(self):
-        scenario = Scenario({self.X1, self.X2, self.X3, self.Y})
+        scenario = Scenario({self.X1, self.X2, self.X3, self.Y1, self.Y2})
         observational_data_collector = ObservationalDataCollector(scenario, self.observational_df_path)
         df = observational_data_collector.collect_data(index_col=0)
-        assert df.equals(self.observational_df), f"{df}\nwas not equal to\n{self.observational_df}"
+        assert df.equals(self.observational_df), f"\n{df}\nwas not equal to\n{self.observational_df}"
 
     def test_data_constraints(self):
-        scenario = Scenario({self.X1, self.X2, self.X3, self.Y}, {self.X1.z3 > 2})
+        scenario = Scenario({self.X1, self.X2, self.X3, self.Y1, self.Y2}, {self.X1.z3 > 2})
         observational_data_collector = ObservationalDataCollector(scenario, self.observational_df_path)
         df = observational_data_collector.collect_data(index_col=0)
         expected = self.observational_df.loc[[2, 3]]
-        assert df.equals(expected), f"{df}\nwas not equal to\n{expected}"
+        assert df.equals(expected), f"\n{df}\nwas not equal to\n{expected}"
 
     def test_meta_population(self):
         def populate_m(data):
diff --git a/tests/generation_tests/test_abstract_test_case.py b/tests/generation_tests/test_abstract_test_case.py
@@ -1,12 +1,36 @@
 import unittest
 import os
 import pandas as pd
+import numpy as np
 from causal_testing.generation.abstract_causal_test_case import AbstractCausalTestCase
 from causal_testing.specification.causal_specification import Scenario
 from causal_testing.specification.variable import Input, Output
 from scipy.stats import uniform, rv_discrete
 from tests.test_helpers import create_temp_dir_if_non_existent, remove_temp_dir_if_existent
 from causal_testing.testing.causal_test_outcome import Positive
+from z3 import And
+from enum import Enum
+
+
+class Car(Enum):
+    isetta = "vehicle.bmw.isetta"
+    mkz2017 = "vehicle.lincoln.mkz2017"
+
+    def __gt__(self, other):
+        if self.__class__ is other.__class__:
+            return self.value > other.value
+        return NotImplemented
+
+
+class CarGen(rv_discrete):
+    cars = dict(enumerate(Car, 1))
+    inverse_cars = {v: k for k, v in cars.items()}
+
+    def ppf(self, q, *args, **kwds):
+        return np.vectorize(self.cars.get)(np.ceil(len(self.cars) * q))
+
+    def cdf(self, q, *args, **kwds):
+        return np.vectorize(self.inverse_cars.get)(q) / len(Car)
 
 
 class TestAbstractTestCase(unittest.TestCase):
@@ -28,6 +52,8 @@ def setUp(self) -> None:
         self.X2 = Input("X2", int, rv_discrete(values=([7], [1])))
         self.X3 = Input("X3", float, uniform(10, 40))
         self.X4 = Input("X4", int, rv_discrete(values=([10], [1])))
+        self.X5 = Input("X5", bool, rv_discrete(values=(range(2), [0.5, 0.5])))
+        self.Car = Input("Car", Car, CarGen())
         self.Y = Output("Y", int)
 
     def test_generate_concrete_test_cases(self):
@@ -44,6 +70,38 @@ def test_generate_concrete_test_cases(self):
         assert len(concrete_tests) == 2, "Expected 2 concrete tests"
         assert len(runs) == 2, "Expected 2 runs"
 
+    def test_generate_boolean_concrete_test_cases(self):
+        scenario = Scenario({self.X1, self.X2, self.X3, self.X5})
+        scenario.setup_treatment_variables()
+        abstract = AbstractCausalTestCase(
+            scenario=scenario,
+            intervention_constraints={
+                And(scenario.treatment_variables[self.X5.name].z3 == True, scenario.variables[self.X5.name].z3 == False)
+            },
+            treatment_variable=self.X5,
+            expected_causal_effect={self.Y: Positive()},
+            effect_modifiers=None,
+        )
+        concrete_tests, runs = abstract.generate_concrete_tests(2)
+        assert len(concrete_tests) == 1, "Expected 1 concrete test"
+        assert len(runs) == 1, "Expected 1 run"
+
+    def test_generate_enum_concrete_test_cases(self):
+        scenario = Scenario({self.Car})
+        scenario.setup_treatment_variables()
+        abstract = AbstractCausalTestCase(
+            scenario=scenario,
+            intervention_constraints={
+                scenario.treatment_variables[self.Car.name].z3 != scenario.variables[self.Car.name].z3
+            },
+            treatment_variable=self.Car,
+            expected_causal_effect={self.Y: Positive()},
+            effect_modifiers=None,
+        )
+        concrete_tests, runs = abstract.generate_concrete_tests(2)
+        assert len(concrete_tests) == 2, "Expected 2 concrete tests"
+        assert len(runs) == 2, "Expected 2 runs"
+
     def test_str(self):
         scenario = Scenario({self.X1, self.X2, self.X3, self.X4})
         scenario.setup_treatment_variables()
diff --git a/tests/specification_tests/test_variable.py b/tests/specification_tests/test_variable.py
@@ -1,6 +1,7 @@
 import unittest
 from enum import Enum
 import z3
+from scipy.stats import norm, kstest
 
 from causal_testing.specification.variable import z3_types, Variable, Input
 
@@ -35,6 +36,44 @@ class Color(Enum):
         # z3_types(Color)("color") != z3_types(Color)("color")
         self.assertEqual(list(map(str, expected_values)), list(map(str, z3_color_values)))
 
+    def test_cast_z3_bool(self):
+        bip = Input("bip", bool)
+        s = z3.Solver()
+        t = z3.Bool("t")
+        f = z3.Bool("f")
+        s.add(t)
+        s.add(z3.Not(f))
+        s.check()
+        self.assertEqual(bip.cast(s.model()[t]), True)
+        self.assertEqual(bip.cast(s.model()[f]), False)
+
+    def test_cast_z3_string(self):
+        ip = Input("bip", str)
+        s = z3.Solver()
+        t = z3.String("t")
+        s.add(t == "hello")
+        s.check()
+        self.assertEqual(ip.cast(s.model()[t]), "hello")
+
+    def test_sample_flakey(self):
+        ip = Input("ip", float, norm)
+        self.assertGreater(kstest(ip.sample(10), norm.cdf).pvalue, 0.95)
+
+    def test_cast_enum(self):
+        class Color(Enum):
+            """
+            Example enum class color.
+            """
+
+            RED = "RED"
+            GREEN = "GREEN"
+            BLUE = "BLUE"
+
+        color = Input("color", Color)
+
+        dtype, colours = z3.EnumSort("color", ("RED", "GREEN", "BLUE"))
+        self.assertEqual(color.cast(colours[0]), Color.RED)
+
     def test_z3_value_enum(self):
         class Color(Enum):
             """
@@ -89,16 +128,18 @@ class Err:
 
     def test_typestring(self):
         class Var(Variable):
-            """
-            The simplest class which will elicit the correct error.
-            """
-
-            def copy(self, name: str = None):
-                pass
+            pass
 
         var = Var("v", int)
         self.assertEqual(var.typestring(), "Var")
 
+    def test_copy(self):
+        ip = Input("ip", float, norm)
+        self.assertNotEqual(ip.copy(), ip)
+        self.assertEqual(ip.copy().name, ip.name)
+        self.assertEqual(ip.copy().datatype, ip.datatype)
+        self.assertEqual(ip.copy().distribution, ip.distribution)
+
 
 class TestZ3Methods(unittest.TestCase):
 
diff --git a/tests/testing_tests/test_estimators.py b/tests/testing_tests/test_estimators.py
@@ -209,6 +209,64 @@ def test_program_15_no_interaction(self):
         self.assertEqual(round(ate, 1), 3.5)
         self.assertEqual([round(ci_low, 1), round(ci_high, 1)], [2.6, 4.3])
 
+    def test_program_15_no_interaction_ate(self):
+        """Test whether estimate_ate on our linear regression estimator reproduces the results of program 15.1
+        (p. 163, 184) without product parameter."""
+        df = self.nhefs_df
+        covariates = {
+            "sex",
+            "race",
+            "age",
+            "edu_2",
+            "edu_3",
+            "edu_4",
+            "edu_5",
+            "exercise_1",
+            "exercise_2",
+            "active_1",
+            "active_2",
+            "wt71",
+            "smokeintensity",
+            "smokeyrs",
+        }
+        linear_regression_estimator = LinearRegressionEstimator(("qsmk",), 1, 0, covariates, ("wt82_71",), df)
+        terms_to_square = ["age", "wt71", "smokeintensity", "smokeyrs"]
+        for term_to_square in terms_to_square:
+            linear_regression_estimator.add_squared_term_to_df(term_to_square)
+        ate, [ci_low, ci_high] = linear_regression_estimator.estimate_ate()
+        self.assertEqual(round(ate, 1), 3.5)
+        self.assertEqual([round(ci_low, 1), round(ci_high, 1)], [2.6, 4.3])
+
+    def test_program_15_no_interaction_ate_calculated(self):
+        """Test whether estimate_ate_calculated reproduces the ATE of program 15.1 (p. 163, 184), without product
+        parameter, when each covariate is passed in at its mean value over the data frame."""
+        df = self.nhefs_df
+        covariates = {
+            "sex",
+            "race",
+            "age",
+            "edu_2",
+            "edu_3",
+            "edu_4",
+            "edu_5",
+            "exercise_1",
+            "exercise_2",
+            "active_1",
+            "active_2",
+            "wt71",
+            "smokeintensity",
+            "smokeyrs",
+        }
+        linear_regression_estimator = LinearRegressionEstimator(("qsmk",), 1, 0, covariates, ("wt82_71",), df)
+        terms_to_square = ["age", "wt71", "smokeintensity", "smokeyrs"]
+        for term_to_square in terms_to_square:
+            linear_regression_estimator.add_squared_term_to_df(term_to_square)
+        ate, [ci_low, ci_high] = linear_regression_estimator.estimate_ate_calculated(
+            {k: self.nhefs_df.mean()[k] for k in covariates}
+        )
+        self.assertEqual(round(ate, 1), 3.5)
+        self.assertEqual([round(ci_low, 1), round(ci_high, 1)], [1.9, 5])
+
 
 class TestCausalForestEstimator(unittest.TestCase):
     """Test the linear regression estimator against the programming exercises in Section 2 of Hernán and Robins [1].