diff --git a/autoum/approaches/uplift_random_forest.py b/autoum/approaches/uplift_random_forest.py index da3c319..30fd0fa 100644 --- a/autoum/approaches/uplift_random_forest.py +++ b/autoum/approaches/uplift_random_forest.py @@ -4,6 +4,7 @@ from datetime import datetime import numpy as np +import pandas as pd from causalml.inference.tree import UpliftRandomForestClassifier from autoum.approaches.utils import ApproachParameters, DataSetsHelper, Helper @@ -57,9 +58,13 @@ def __init__(self, parameters: dict, approach_parameters: ApproachParameters, ev self.feature_importance = approach_parameters.feature_importance self.save = approach_parameters.save self.path = approach_parameters.path + self.post_prune = approach_parameters.post_prune self.split_number = approach_parameters.split_number self.log = logging.getLogger(type(self).__name__) + if eval_function not in ["ED", "KL", "CHI"]: + self.post_prune = False + def analyze(self, data_set_helper: DataSetsHelper) -> dict: """ Calculate the score (ITE/Uplift/CATE) for each sample using uplift random forest @@ -78,6 +83,10 @@ def analyze(self, data_set_helper: DataSetsHelper) -> dict: urf.fit(X=data_set_helper.x_train, treatment=experiment_groups_col, y=data_set_helper.y_train) + if self.post_prune: + for list_id, tree in enumerate(urf.uplift_forest): + tree.prune(data_set_helper.x_train, experiment_groups_col, data_set_helper.y_train) + self.log.debug(urf) if self.save: diff --git a/autoum/approaches/utils.py b/autoum/approaches/utils.py index 7ad340b..5760129 100644 --- a/autoum/approaches/utils.py +++ b/autoum/approaches/utils.py @@ -35,7 +35,7 @@ def add_treatment_group_key(df: pd.DataFrame) -> np.array: """ experiment_groups_col = ["c" if x == 0 else "t" for x in df.treatment] - experiment_groups_col = np.array(experiment_groups_col) + experiment_groups_col = np.array(experiment_groups_col, dtype=object) return experiment_groups_col @@ -108,7 +108,7 @@ class ApproachParameters: Utility class that encompassees all parameters needed to create an approach instance. """ - def __init__(self, cost_sensitive: bool, feature_importance: bool, path: str, save: bool, split_number: int): + def __init__(self, cost_sensitive: bool, feature_importance: bool, path: str, post_prune: bool, save: bool, split_number: int): """ Utility class that encompassees all parameters needed to create an approach instance. @@ -122,5 +122,6 @@ def __init__(self, cost_sensitive: bool, feature_importance: bool, path: str, sa self.cost_sensitive = cost_sensitive self.feature_importance = feature_importance self.path = path + self.post_prune = post_prune self.save = save self.split_number = split_number diff --git a/autoum/pipelines/pipeline_rw.py b/autoum/pipelines/pipeline_rw.py index 34c8c53..1b0572b 100644 --- a/autoum/pipelines/pipeline_rw.py +++ b/autoum/pipelines/pipeline_rw.py @@ -53,6 +53,7 @@ def __init__(self, plot_uqc: bool = True, plot_save_figures: bool = False, pool_capacity: int = 40, + post_prune: bool = False, rlearner: bool = False, run_name: str = "RUN", run_id: int = 1, @@ -109,6 +110,7 @@ def __init__(self, :param plot_uqc: True if the UQC value for a curve should be included in the plot legend. False otherwise. Default: True :param plot_save_figures: True if the resulting qini figures shall be saved. False otherwise. Default: False :param pool_capacity: Set this to the maximum number of free kernels for the calculation. Default 40 + :param post_prune: Prune the uplift models after training, applies to URF_CHI, URF_ED and URF_KL :param rlearner: True, if R-Learner should be applied. False otherwise. Default: False :param run_id: Id of the run (For logging and saving purposes). Default: 1 :param run_name: Name of the run (For logging and saving purposes). Default: "RUN" @@ -152,6 +154,7 @@ def __init__(self, self.plot_uqc = plot_uqc self.plot_save_figures = plot_save_figures self.pool_capacity = pool_capacity + self.post_prune = post_prune self.rlearner = rlearner self.random_seed = random_seed self.save_models = save_models @@ -216,26 +219,31 @@ def sanity_checks(self): assert 0.1 <= self.validation_size <= 0.9, "Please select 0.1 <= validation_size <= 0.9" assert self.n_estimators % 4 == 0, "Please select a multiple of 4 as n_estimators" - def analyze_dataset(self, data: pd.DataFrame): + def analyze_dataset(self, data: pd.DataFrame, test_data: pd.DataFrame = None): """ Apply, compare, and evaluate various uplift modeling approaches on the given data set. :param data: Dataset to be analyzed + :param test_data: (optional) Test Dataset, which the pipeline will use for the test metrics """ if not isinstance(data, pd.DataFrame): return - + start = time.time() logging.info("Starting analyzing dataset ... ") - try: - df_train, df_test = train_test_split(data, test_size=self.test_size, shuffle=True, stratify=data[['response', 'treatment']], random_state=self.random_seed) - df_train.reset_index(inplace=True, drop=True) - df_test.reset_index(inplace=True, drop=True) - except ValueError: - logging.error("Stratification not possible" + data.groupby(["response", "treatment"]).size().reset_index(name="Counter").to_string()) - raise ValueError("Stratification not possible" + data.groupby(["response", "treatment"]).size().reset_index(name="Counter").to_string()) + if test_data is not None: + assert data.columns.equals(test_data.columns), "The train and test dataset columns are not identical" + df_train, df_test = data.sample(frac=1.0, random_state=self.random_seed), test_data + else: + try: + df_train, df_test = train_test_split(data, test_size=self.test_size, shuffle=True, stratify=data[['response', 'treatment']], random_state=self.random_seed) + df_train.reset_index(inplace=True, drop=True) + df_test.reset_index(inplace=True, drop=True) + except ValueError: + logging.error("Stratification not possible" + data.groupby(["response", "treatment"]).size().reset_index(name="Counter").to_string()) + raise ValueError("Stratification not possible" + data.groupby(["response", "treatment"]).size().reset_index(name="Counter").to_string()) # Get feature names feature_names = list(df_train.drop(['response', 'treatment'], axis=1).columns.values) @@ -436,7 +444,7 @@ def train_eval_splits(self, args): scores_dict = HelperPipeline.apply_uplift_approaches(df_train=df_train, df_valid=df_valid, df_test=df_test, parameters=self.parameters, approach=[approach_name], split_number=i, cost_sensitive=self.cost_sensitive, feature_importance=self.feature_importance, - save_models=self.save_models) + save_models=self.save_models, post_prune=self.post_prune) logging.info("Start Evaluation. Split number {}".format(i)) diff --git a/autoum/pipelines/utils.py b/autoum/pipelines/utils.py index bdfaf9f..cd0eac1 100644 --- a/autoum/pipelines/utils.py +++ b/autoum/pipelines/utils.py @@ -81,7 +81,8 @@ def apply_uplift_approaches(df_train: pd.DataFrame, split_number: int, cost_sensitive: bool = False, feature_importance: bool = False, - save_models: bool = False) -> dict: + save_models: bool = False, + post_prune: bool = False) -> dict: """ Apply given uplift modeling approaches on the given dataframes and return the scores @@ -95,6 +96,7 @@ def apply_uplift_approaches(df_train: pd.DataFrame, :param cost_sensitive: Set this to true for cost sensitive learning. :param feature_importance: Set this to True to return the feature importances of the classifiers :param save_models: True if the models generated during training shall be saved. False otherwise. + :param post_prune: Set this to true to prune the trees of the URF approaches after training :return: Dictionary with the following keys: df_scores_train, df_scores_test, df_train, df_test, feature_importances (empty dictionary if not used) """ @@ -110,7 +112,7 @@ def apply_uplift_approaches(df_train: pd.DataFrame, ds_helper = DataSetsHelper(df_train=df_train, df_valid=df_valid, df_test=df_test) # ApproachParameters contains alll parameters necessary to initialize an approach classifier root = f"{get_data_home()}/models/" - approach_params = ApproachParameters(cost_sensitive=cost_sensitive, feature_importance=feature_importance, path=root, save=save_models, split_number=split_number) + approach_params = ApproachParameters(cost_sensitive=cost_sensitive, feature_importance=feature_importance, path=root, post_prune=post_prune, save=save_models, split_number=split_number) # This dictionary is used as wrapper for passing all parameters at once for apply_approach apply_params = { diff --git a/tests/test_bayesian_causal_forest.py b/tests/test_bayesian_causal_forest.py index 7ae12cf..b6036f8 100644 --- a/tests/test_bayesian_causal_forest.py +++ b/tests/test_bayesian_causal_forest.py @@ -23,7 +23,7 @@ def setUp(self): ds_helper = DataSetsHelper(df_train=df_train, df_valid=df_valid, df_test=df_test) root = f"{get_data_home()}/testing/models/" - approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, save=False, split_number=0) + approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, post_prune=False, save=False, split_number=0) self.ds_helper = ds_helper self.approach_params = approach_params diff --git a/tests/test_class_variable_transformation.py b/tests/test_class_variable_transformation.py index 56d394c..c3eb5f1 100644 --- a/tests/test_class_variable_transformation.py +++ b/tests/test_class_variable_transformation.py @@ -23,7 +23,7 @@ def setUp(self): ds_helper = DataSetsHelper(df_train=df_train, df_valid=df_valid, df_test=df_test) root = f"{get_data_home()}/testing/models/" - approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, save=False, split_number=0) + approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, post_prune=False, save=False, split_number=0) self.ds_helper = ds_helper self.approach_params = approach_params diff --git a/tests/test_generalized_random_forest.py b/tests/test_generalized_random_forest.py index b69f2dd..3fb7d6d 100644 --- a/tests/test_generalized_random_forest.py +++ b/tests/test_generalized_random_forest.py @@ -23,7 +23,7 @@ def setUp(self): ds_helper = DataSetsHelper(df_train=df_train, df_valid=df_valid, df_test=df_test) root = f"{get_data_home()}/testing/models/" - approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, save=False, split_number=0) + approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, post_prune=False, save=False, split_number=0) self.ds_helper = ds_helper self.approach_params = approach_params diff --git a/tests/test_lais_generalization.py b/tests/test_lais_generalization.py index 1af297c..d02bbb2 100644 --- a/tests/test_lais_generalization.py +++ b/tests/test_lais_generalization.py @@ -24,7 +24,7 @@ def setUp(self): ds_helper = DataSetsHelper(df_train=df_train, df_valid=df_valid, df_test=df_test) root = f"{get_data_home()}/testing/models/" - approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, save=False, split_number=0) + approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, post_prune=False, save=False, split_number=0) self.ds_helper = ds_helper self.approach_params = approach_params diff --git a/tests/test_pipeline_rw.py b/tests/test_pipeline_rw.py index afde53d..9d83983 100644 --- a/tests/test_pipeline_rw.py +++ b/tests/test_pipeline_rw.py @@ -241,10 +241,10 @@ def test_plotting(self): def test_create_approach_tuples(self): cv_number_splits = 10 - pipeline = PipelineRW(cv_number_splits=cv_number_splits, urf_ddp=False, two_model=False) + pipeline = PipelineRW(cv_number_splits=cv_number_splits, slearner=True, two_model=True) dataframe_pairs = pipeline.create_k_splits(df_train=self.df_train, df_test=self.df_test) tuple_list = pipeline.create_approach_tuples(dataframe_pairs) - self.assertEqual(len(tuple_list), 15 * cv_number_splits) + self.assertEqual(len(tuple_list), 2 * cv_number_splits) for _tuple in tuple_list: self.assertEqual(len(_tuple), 5) diff --git a/tests/test_r_learner.py b/tests/test_r_learner.py index e262852..04780e6 100644 --- a/tests/test_r_learner.py +++ b/tests/test_r_learner.py @@ -23,7 +23,7 @@ def setUp(self): ds_helper = DataSetsHelper(df_train=df_train, df_valid=df_valid, df_test=df_test) root = f"{get_data_home()}/testing/models/" - approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, save=False, split_number=0) + approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, post_prune=False, save=False, split_number=0) self.ds_helper = ds_helper self.approach_params = approach_params diff --git a/tests/test_s_learner.py b/tests/test_s_learner.py index 83ce78c..29df60a 100644 --- a/tests/test_s_learner.py +++ b/tests/test_s_learner.py @@ -23,7 +23,7 @@ def setUp(self): ds_helper = DataSetsHelper(df_train=df_train, df_valid=df_valid, df_test=df_test) root = f"{get_data_home()}/testing/models/" - approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, save=False, split_number=0) + approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, post_prune=False, save=False, split_number=0) self.ds_helper = ds_helper self.approach_params = approach_params diff --git a/tests/test_traditional.py b/tests/test_traditional.py index e9e318a..47414b4 100644 --- a/tests/test_traditional.py +++ b/tests/test_traditional.py @@ -23,7 +23,7 @@ def setUp(self): ds_helper = DataSetsHelper(df_train=df_train, df_valid=df_valid, df_test=df_test) root = f"{get_data_home()}/testing/models/" - approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, save=False, split_number=0) + approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, post_prune=False, save=False, split_number=0) self.ds_helper = ds_helper self.approach_params = approach_params diff --git a/tests/test_treatment_dummy.py b/tests/test_treatment_dummy.py index 5728114..0eb77e9 100644 --- a/tests/test_treatment_dummy.py +++ b/tests/test_treatment_dummy.py @@ -23,7 +23,7 @@ def setUp(self): ds_helper = DataSetsHelper(df_train=df_train, df_valid=df_valid, df_test=df_test) root = f"{get_data_home()}/testing/models/" - approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, save=False, split_number=0) + approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, post_prune=False, save=False, split_number=0) self.ds_helper = ds_helper self.approach_params = approach_params diff --git a/tests/test_two_model.py b/tests/test_two_model.py index a24e7a6..22d938b 100644 --- a/tests/test_two_model.py +++ b/tests/test_two_model.py @@ -23,7 +23,7 @@ def setUp(self): ds_helper = DataSetsHelper(df_train=df_train, df_valid=df_valid, df_test=df_test) root = f"{get_data_home()}/testing/models/" - approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, save=False, split_number=0) + approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, post_prune=False, save=False, split_number=0) self.ds_helper = ds_helper self.approach_params = approach_params diff --git a/tests/test_uplift_random_forest.py b/tests/test_uplift_random_forest.py index 3c58372..a519eb8 100644 --- a/tests/test_uplift_random_forest.py +++ b/tests/test_uplift_random_forest.py @@ -23,7 +23,7 @@ def setUp(self): ds_helper = DataSetsHelper(df_train=df_train, df_valid=df_valid, df_test=df_test) root = f"{get_data_home()}/testing/models/" - approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, save=False, split_number=0) + approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, post_prune=False, save=False, split_number=0) self.ds_helper = ds_helper self.approach_params = approach_params diff --git a/tests/test_utils_pipelines.py b/tests/test_utils_pipelines.py index 2c07b27..c2e9b62 100644 --- a/tests/test_utils_pipelines.py +++ b/tests/test_utils_pipelines.py @@ -1,3 +1,4 @@ +import time import unittest from unittest.mock import MagicMock, patch @@ -31,7 +32,7 @@ def setUp(self): self.df_train, self.df_valid = train_test_split(data, test_size=0.2, shuffle=True, stratify=data[['response', 'treatment']], random_state=123) self.ds_helper = DataSetsHelper(df_train=self.df_train, df_valid=self.df_valid, df_test=self.df_test) - self.approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=None, save=False, split_number=0) + self.approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=None, post_prune=False, save=False, split_number=0) self.apply_params = { "data_set_helper": self.ds_helper, "feature_importance": False, @@ -56,7 +57,8 @@ def setUp(self): "n_jobs": n_jobs, "control_name": "c", "normalization": True, - "honesty": False + "honesty": False, + "post_prune": True } s_learner_parameters = { @@ -240,7 +242,7 @@ def test_apply_uplift_approaches(self, m_apply_approach): if i == "TWO_MODEL": self.assertTrue(TwoModel.__instancecheck__(m_apply_approach.call_args[0][0])) - elif "URF" in i: + elif i == "URF": self.assertTrue(UpliftRandomForest.__instancecheck__(m_apply_approach.call_args[0][0])) elif i == "TRADITIONAL": self.assertTrue(Traditional.__instancecheck__(m_apply_approach.call_args[0][0])) @@ -343,7 +345,7 @@ def test_cast_to_dataframe(self): df_uplift = helper.cast_to_dataframe(list_dict) # Check if type equals pd.DataFrame - self.assertEqual(type(df_uplift), pd.DataFrame) + self.assertEqual(df_uplift.__class__, pd.DataFrame) # Check if the DataFrame contains 55 columns (11 columns for each approach) self.assertEqual(df_uplift.shape[1], 22) diff --git a/tests/test_x_learner.py b/tests/test_x_learner.py index 961912f..21542f4 100644 --- a/tests/test_x_learner.py +++ b/tests/test_x_learner.py @@ -23,7 +23,7 @@ def setUp(self): ds_helper = DataSetsHelper(df_train=df_train, df_valid=df_valid, df_test=df_test) root = f"{get_data_home()}/testing/models/" - approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, save=False, split_number=0) + approach_params = ApproachParameters(cost_sensitive=False, feature_importance=False, path=root, post_prune=False, save=False, split_number=0) self.ds_helper = ds_helper self.approach_params = approach_params