pytest and linting

jmafoster1 · jmafoster1 · commit 392387cebc05 · 2024-07-25T15:50:15.000+01:00
diff --git a/causal_testing/json_front/json_class.py b/causal_testing/json_front/json_class.py
@@ -276,7 +276,7 @@ def _execute_test_case(
         test_passes = causal_test_case.expected_causal_effect.apply(causal_test_result)
 
         if "coverage" in test and test["coverage"]:
-            adequacy_metric = DataAdequacy(causal_test_case, estimation_model, self.data_collector)
+            adequacy_metric = DataAdequacy(causal_test_case, estimation_model)
             adequacy_metric.measure_adequacy()
             causal_test_result.adequacy = adequacy_metric
 
diff --git a/causal_testing/specification/capabilities.py b/causal_testing/specification/capabilities.py
@@ -1,3 +1,7 @@
+"""
+This module contains the Capability and TreatmentSequence classes to implement
+treatment sequences that operate over time.
+"""
 from causal_testing.specification.variable import Variable
 
 
@@ -14,7 +18,7 @@ def __init__(self, variable: Variable, value: any, start_time: int, end_time: fl
 
     def __eq__(self, other):
         return (
-            type(other) == type(self)
+            isinstance(other, type(self))
             and self.variable == other.variable
             and self.value == other.value
             and self.start_time == other.start_time
@@ -44,7 +48,7 @@ def __init__(self, timesteps_per_intervention, capabilities):
             )
         ]
         # This is a bodge so that causal test adequacy works
-        self.name = tuple([c.variable for c in self.capabilities])
+        self.name = tuple(c.variable for c in self.capabilities)
 
     def set_value(self, index: int, value: float):
         """
diff --git a/causal_testing/testing/causal_test_adequacy.py b/causal_testing/testing/causal_test_adequacy.py
@@ -9,7 +9,6 @@
 from lifelines.exceptions import ConvergenceError
 
 from causal_testing.testing.causal_test_suite import CausalTestSuite
-from causal_testing.data_collection.data_collector import DataCollector
 from causal_testing.specification.causal_dag import CausalDAG
 from causal_testing.testing.estimators import Estimator
 from causal_testing.testing.causal_test_case import CausalTestCase
@@ -72,17 +71,16 @@ class DataAdequacy:
     - Zero kurtosis is optimal.
     """
 
+    # pylint: disable=too-many-instance-attributes
     def __init__(
         self,
         test_case: CausalTestCase,
         estimator: Estimator,
-        data_collector: DataCollector,
         bootstrap_size: int = 100,
         group_by=None,
     ):
         self.test_case = test_case
         self.estimator = estimator
-        self.data_collector = data_collector
         self.kurtosis = None
         self.outcomes = None
         self.successful = None
@@ -104,7 +102,7 @@ def measure_adequacy(self):
             else:
                 estimator.df = estimator.df.sample(len(estimator.df), replace=True, random_state=i)
             try:
-                results.append(self.test_case.execute_test(estimator, self.data_collector))
+                results.append(self.test_case.execute_test(estimator, None))
             except LinAlgError:
                 continue
             except ConvergenceError:
@@ -129,7 +127,7 @@ def convert_to_df(field):
         effect_estimate = pd.concat(results["effect_estimate"].tolist(), axis=1).transpose().reset_index(drop=True)
         self.kurtosis = effect_estimate.kurtosis()
         self.outcomes = sum(filter(lambda x: x is not None, outcomes))
-        self.successful = sum([x is not None for x in outcomes])
+        self.successful = sum(x is not None for x in outcomes)
 
     def to_dict(self):
         "Returns the adequacy object as a dictionary."
diff --git a/causal_testing/testing/estimators.py b/causal_testing/testing/estimators.py
@@ -608,17 +608,18 @@ class IPCWEstimator(Estimator):
     for sequences of treatments over time-varying data.
     """
 
+    # pylint: disable=too-many-arguments
+    # pylint: disable=too-many-instance-attributes
     def __init__(
         self,
         df: pd.DataFrame,
         timesteps_per_intervention: int,
         control_strategy: TreatmentSequence,
         treatment_strategy: TreatmentSequence,
         outcome: str,
-        min: float,
-        max: float,
-        fitBLswitch_formula: str,
-        fitBLTDswitch_formula: str,
+        fault_column: str,
+        fit_bl_switch_formula: str,
+        fit_bltd_switch_formula: str,
         eligibility=None,
         alpha: float = 0.05,
         query: str = "",
@@ -638,13 +639,12 @@ def __init__(
         self.control_strategy = control_strategy
         self.treatment_strategy = treatment_strategy
         self.outcome = outcome
-        self.min = min
-        self.max = max
+        self.fault_column = fault_column
         self.timesteps_per_intervention = timesteps_per_intervention
-        self.fitBLswitch_formula = fitBLswitch_formula
-        self.fitBLTDswitch_formula = fitBLTDswitch_formula
+        self.fit_bl_switch_formula = fit_bl_switch_formula
+        self.fit_bltd_switch_formula = fit_bltd_switch_formula
         self.eligibility = eligibility
-        self.df = self.preprocess_data(df)
+        self.df = df
 
     def add_modelling_assumptions(self):
         self.modelling_assumptions.append("The variables in the data vary over time.")
@@ -667,21 +667,20 @@ def setup_xo_t_do(self, strategy_assigned: list, strategy_followed: list, eligib
         ).astype("boolean")
         mask = mask | ~eligible
         mask.reset_index(inplace=True, drop=True)
-        false = mask.loc[mask == True]
+        false = mask.loc[mask]
         if false.empty:
             return np.zeros(len(mask))
-        else:
-            mask = (mask * 1).tolist()
-            cutoff = false.index[0] + 1
-            return mask[:cutoff] + ([None] * (len(mask) - cutoff))
+        mask = (mask * 1).tolist()
+        cutoff = false.index[0] + 1
+        return mask[:cutoff] + ([None] * (len(mask) - cutoff))
 
     def setup_fault_t_do(self, individual: pd.DataFrame):
         """
         Return a binary sequence with each bit representing whether the current
         index is the time point at which the event of interest (i.e. a fault)
         occurred.
         """
-        fault = individual[individual["within_safe_range"] == False]
+        fault = individual[~individual[self.fault_column]]
         fault_t_do = pd.Series(np.zeros(len(individual)), index=individual.index)
 
         if not fault.empty:
@@ -702,39 +701,43 @@ def setup_fault_time(self, individual, perturbation=-0.001):
         """
         Return the time at which the event of interest (i.e. a fault) occurred.
         """
-        fault = individual[individual["within_safe_range"] == False]
+        fault = individual[~individual[self.fault_column]]
         fault_time = (
             individual["time"].loc[fault.index[0]]
             if not fault.empty
             else (individual["time"].max() + self.timesteps_per_intervention)
         )
         return pd.DataFrame({"fault_time": np.repeat(fault_time + perturbation, len(individual))})
 
-    def preprocess_data(self, df):
+    def preprocess_data(self):
         """
         Set up the treatment-specific columns in the data that are needed to estimate the hazard ratio.
         """
-        df["trtrand"] = None  # treatment/control arm
-        df["xo_t_do"] = None  # did the individual deviate from the treatment of interest here?
-        df["eligible"] = df.eval(self.eligibility) if self.eligibility is not None else True
+        self.df["trtrand"] = None  # treatment/control arm
+        self.df["xo_t_do"] = None  # did the individual deviate from the treatment of interest here?
+        self.df["eligible"] = self.df.eval(self.eligibility) if self.eligibility is not None else True
 
         # when did a fault occur?
-        df["within_safe_range"] = df[self.outcome].between(self.min, self.max)
-        df["fault_time"] = df.groupby("id")[["within_safe_range", "time"]].apply(self.setup_fault_time).values
-        df["fault_t_do"] = df.groupby("id")[["id", "time", "within_safe_range"]].apply(self.setup_fault_t_do).values
-        assert not pd.isnull(df["fault_time"]).any()
+        self.df["fault_time"] = self.df.groupby("id")[[self.fault_column, "time"]].apply(self.setup_fault_time).values
+        self.df["fault_t_do"] = (
+            self.df.groupby("id")[["id", "time", self.fault_column]].apply(self.setup_fault_t_do).values
+        )
+        assert not pd.isnull(self.df["fault_time"]).any()
 
-        living_runs = df.query("fault_time > 0").loc[
-            (df["time"] % self.timesteps_per_intervention == 0) & (df["time"] <= self.control_strategy.total_time())
+        living_runs = self.df.query("fault_time > 0").loc[
+            (self.df["time"] % self.timesteps_per_intervention == 0)
+            & (self.df["time"] <= self.control_strategy.total_time())
         ]
 
         individuals = []
         new_id = 0
         logging.debug("  Preprocessing groups")
-        for id, individual in living_runs.groupby("id"):
-            assert (
-                sum(individual["fault_t_do"]) <= 1
-            ), f"Error initialising fault_t_do for individual\n{individual[['id', 'time', 'fault_time', 'fault_t_do']]}\nwith fault at {individual.fault_time.iloc[0]}"
+        for _, individual in living_runs.groupby("id"):
+            assert sum(individual["fault_t_do"]) <= 1, (
+                f"Error initialising fault_t_do for individual\n"
+                f"{individual[['id', 'time', 'fault_time', 'fault_t_do']]}\n"
+                f"with fault at {individual.fault_time.iloc[0]}"
+            )
 
             strategy_followed = [
                 Capability(
@@ -761,59 +764,67 @@ def preprocess_data(self, df):
         if len(individuals) == 0:
             raise ValueError("No individuals followed either strategy.")
 
-        novCEA = pd.concat(individuals)
+        return pd.concat(individuals)
 
-        if novCEA["fault_t_do"].sum() == 0:
+    def estimate_hazard_ratio(self):
+        """
+        Estimate the hazard ratio.
+        """
+
+        preprocessed_data = self.preprocess_data()
+
+        if preprocessed_data["fault_t_do"].sum() == 0:
             raise ValueError("No recorded faults")
 
         # Use logistic regression to predict switching given baseline covariates
-        fitBLswitch = smf.logit(self.fitBLswitch_formula, data=novCEA).fit()
+        fit_bl_switch = smf.logit(self.fit_bl_switch_formula, data=preprocessed_data).fit()
 
-        novCEA["pxo1"] = fitBLswitch.predict(novCEA)
+        preprocessed_data["pxo1"] = fit_bl_switch.predict(preprocessed_data)
 
         # Use logistic regression to predict switching given baseline and time-updated covariates (model S12)
-        fitBLTDswitch = smf.logit(
-            self.fitBLTDswitch_formula,
-            data=novCEA,
+        fit_bltd_switch = smf.logit(
+            self.fit_bltd_switch_formula,
+            data=preprocessed_data,
         ).fit()
 
-        novCEA["pxo2"] = fitBLTDswitch.predict(novCEA)
+        preprocessed_data["pxo2"] = fit_bltd_switch.predict(preprocessed_data)
 
         # IPCW step 3: For each individual at each time, compute the inverse probability of remaining uncensored
         # Estimate the probabilities of remaining ‘un-switched’ and hence the weights
 
-        novCEA["num"] = 1 - novCEA["pxo1"]
-        novCEA["denom"] = 1 - novCEA["pxo2"]
-        novCEA[["num", "denom"]] = novCEA.sort_values(["id", "time"]).groupby("id")[["num", "denom"]].cumprod()
+        preprocessed_data["num"] = 1 - preprocessed_data["pxo1"]
+        preprocessed_data["denom"] = 1 - preprocessed_data["pxo2"]
+        preprocessed_data[["num", "denom"]] = (
+            preprocessed_data.sort_values(["id", "time"]).groupby("id")[["num", "denom"]].cumprod()
+        )
 
-        assert not novCEA["num"].isnull().any(), f"{len(novCEA['num'].isnull())} null numerator values"
-        assert not novCEA["denom"].isnull().any(), f"{len(novCEA['denom'].isnull())} null denom values"
+        assert (
+            not preprocessed_data["num"].isnull().any()
+        ), f"{preprocessed_data['num'].isnull().sum()} null numerator values"
+        assert (
+            not preprocessed_data["denom"].isnull().any()
+        ), f"{preprocessed_data['denom'].isnull().sum()} null denom values"
 
-        novCEA["weight"] = 1 / novCEA["denom"]
-        novCEA["sweight"] = novCEA["num"] / novCEA["denom"]
+        preprocessed_data["weight"] = 1 / preprocessed_data["denom"]
+        preprocessed_data["sweight"] = preprocessed_data["num"] / preprocessed_data["denom"]
 
-        novCEA_KM = novCEA.loc[novCEA["xo_t_do"] == 0].copy()
-        novCEA_KM["tin"] = novCEA_KM["time"]
-        novCEA_KM["tout"] = pd.concat(
-            [(novCEA_KM["time"] + self.timesteps_per_intervention), novCEA_KM["fault_time"]], axis=1
+        preprocessed_data_km = preprocessed_data.loc[preprocessed_data["xo_t_do"] == 0].copy()
+        preprocessed_data_km["tin"] = preprocessed_data_km["time"]
+        preprocessed_data_km["tout"] = pd.concat(
+            [(preprocessed_data_km["time"] + self.timesteps_per_intervention), preprocessed_data_km["fault_time"]],
+            axis=1,
         ).min(axis=1)
 
-        assert (
-            novCEA_KM["tin"] <= novCEA_KM["tout"]
-        ).all(), f"Left before joining\n{novCEA_KM.loc[novCEA_KM['tin'] >= novCEA_KM['tout']]}"
-
-        return novCEA_KM
-
-    def estimate_hazard_ratio(self):
-        """
-        Estimate the hazard ratio.
-        """
+        assert (preprocessed_data_km["tin"] <= preprocessed_data_km["tout"]).all(), (
+            f"Left before joining\n"
+            f"{preprocessed_data_km.loc[preprocessed_data_km['tin'] > preprocessed_data_km['tout']]}"
+        )
 
         #  IPCW step 4: Use these weights in a weighted analysis of the outcome model
         # Estimate the KM graph and IPCW hazard ratio using Cox regression.
         cox_ph = CoxPHFitter()
         cox_ph.fit(
-            df=self.df,
+            df=preprocessed_data_km,
             duration_col="tout",
             event_col="fault_t_do",
             weights_col="weight",
diff --git a/tests/data/temporal_data.csv b/tests/data/temporal_data.csv
@@ -0,0 +1,61 @@
+t,outcome,id,time
+0,1,0,0
+0,1,0,1
+0,1,0,2
+0,0,0,3
+0,0,0,4
+0,1,1,0
+0,1,1,1
+0,1,1,2
+0,0,1,3
+0,0,1,4
+0,1,2,0
+0,1,2,1
+0,1,2,2
+0,0,2,3
+0,0,2,4
+0,1,3,0
+0,1,3,1
+0,1,3,2
+0,0,3,3
+0,0,3,4
+0,1,4,0
+0,1,4,1
+0,1,4,2
+0,0,4,3
+0,0,4,4
+0,1,5,0
+0,1,5,1
+0,1,5,2
+0,0,5,3
+0,0,5,4
+0,1,6,0
+0,1,6,1
+0,0,6,2
+0,0,6,3
+0,0,6,4
+1,1,7,0
+1,1,7,1
+1,0,7,2
+1,0,7,3
+1,0,7,4
+1,1,8,0
+1,1,8,1
+1,0,8,2
+1,0,8,3
+1,0,8,4
+1,1,9,0
+1,1,9,1
+1,0,9,2
+1,0,9,3
+1,0,9,4
+1,1,10,0
+1,1,10,1
+1,0,10,2
+1,0,10,3
+1,0,10,4
+1,1,11,0
+1,1,11,1
+1,0,11,2
+1,0,11,3
+1,0,11,4
diff --git a/tests/testing_tests/test_estimators.py b/tests/testing_tests/test_estimators.py