5454)
5555from bluecast .preprocessing .train_test_split import train_test_split
5656
57+ logger = logging .getLogger (__name__ )
58+
5759
5860class BlueCast :
5961 """Run fully configured classification blueprint.
@@ -70,12 +72,12 @@ class BlueCast:
7072 BlueCast will not split the data by time or order, but do a random split instead.
7173 :param ml_model: Takes an instance of a CatboostModel class. If not provided, BlueCast will instantiate one.
7274 This is an API to pass any model class. Inherit the baseclass from ml_modelling.base_model.BaseModel.
73- :param custom_in_fold_preprocessor: Takes an instance of a CustomPreprocessing class. Allows users to eeecute
75+ :param custom_in_fold_preprocessor: Takes an instance of a CustomPreprocessing class. Allows users to execute
7476 preprocessing after the train test split within cv folds. This will be executed only if precise_cv_tuning in
7577 the conf_training is True. Custom ML models need to implement this themselves. This step is only useful when
76- the proprocessing step has a high chance of overfitting otherwise (i.e: oversampling techniques).
78+ the preprocessing step has a high chance of overfitting otherwise (i.e: oversampling techniques).
7779 :param custom_preprocessor: Takes an instance of a CustomPreprocessing class. Allows users to inject custom
78- preprocessing steps which take place right after the train test spit .
80+ preprocessing steps which take place right after the train test split .
7981 :param custom_last_mile_computation: Takes an instance of a CustomPreprocessing class. Allows users to inject custom
8082 preprocessing steps which take place right before the model training.
8183 :param experiment_tracker: Takes an instance of an ExperimentTracker class. If not provided this will be initialized
@@ -91,18 +93,18 @@ def __init__(
9193 cat_columns : Optional [List [Union [str , float , int ]]] = None ,
9294 date_columns : Optional [List [Union [str , float , int ]]] = None ,
9395 time_split_column : Optional [str ] = None ,
94- ml_model : Optional [Union [ CatboostModel , Any ] ] = None ,
96+ ml_model : Optional [Any ] = None ,
9597 custom_in_fold_preprocessor : Optional [CustomPreprocessing ] = None ,
9698 custom_last_mile_computation : Optional [CustomPreprocessing ] = None ,
9799 custom_preprocessor : Optional [CustomPreprocessing ] = None ,
98100 custom_feature_selector : Optional [
99101 Union [BoostaRootaWrapper , CustomPreprocessing ]
100102 ] = None ,
101103 conf_training : Optional [TrainingConfig ] = None ,
102- conf_xgboost : Optional [
104+ conf_tuning : Optional [
103105 Union [XgboostTuneParamsConfig , CatboostTuneParamsConfig ]
104106 ] = None ,
105- conf_params_xgboost : Optional [
107+ conf_params : Optional [
106108 Union [XgboostFinalParamConfig , CatboostFinalParamConfig ]
107109 ] = None ,
108110 experiment_tracker : Optional [ExperimentTracker ] = None ,
@@ -119,8 +121,8 @@ def __init__(
119121 self .date_columns = date_columns
120122 self .time_split_column = time_split_column
121123 self .target_column = "Undefined"
122- self .conf_xgboost = conf_xgboost
123- self .conf_params_xgboost = conf_params_xgboost
124+ self .conf_tuning = conf_tuning
125+ self .conf_params = conf_params
124126 self .feat_type_detector : Optional [FeatureTypeDetector ] = None
125127 self .infreq_cat_encoder : Optional [InFrequentCategoryEncoder ] = None
126128 self .cat_encoder : Optional [
@@ -147,23 +149,16 @@ def __init__(
147149 else :
148150 self .experiment_tracker = ExperimentTracker ()
149151
150- if not self .conf_params_xgboost :
151- self .conf_params_xgboost = CatboostFinalParamConfig ()
152+ if not self .conf_params :
153+ self .conf_params = CatboostFinalParamConfig ()
152154
153155 self .conf_training : TrainingConfig = conf_training or TrainingConfig ()
154156
155- if not self .conf_xgboost :
156- self .conf_xgboost = CatboostTuneParamsConfig ()
157+ if not self .conf_tuning :
158+ self .conf_tuning = CatboostTuneParamsConfig ()
157159 if not self .single_fold_eval_metric_func :
158160 self .single_fold_eval_metric_func = ClassificationEvalWrapper ()
159- logging .basicConfig (
160- filename = self .conf_training .logging_file_path ,
161- filemode = "w" ,
162- format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" ,
163- level = logging .INFO ,
164- force = True ,
165- )
166- logging .info ("BlueCast blueprint initialized." )
161+ logger .info ("BlueCast blueprint initialized." )
167162
168163 def initial_checks (self , df : pd .DataFrame ) -> None :
169164 if not self .conf_training :
@@ -193,7 +188,7 @@ def initial_checks(self, df: pd.DataFrame) -> None:
193188 feature selector."""
194189 warnings .warn (message , UserWarning , stacklevel = 2 )
195190
196- if not self .conf_xgboost :
191+ if not self .conf_tuning :
197192 message = """No CatboostTuneParamsConfig has been provided. Falling back to default values. Default values
198193 have been chosen to speed up the prototyping. For robust hyperparameter tuning consider providing a custom
199194 CatboostTuneParamsConfig with a deeper hyperparameter search space and a custom TrainingConfig to enable
@@ -266,11 +261,11 @@ def initial_checks(self, df: pd.DataFrame) -> None:
266261 unique target classes have been found. Did you mean 'binary' instead?"""
267262 warnings .warn (message , UserWarning , stacklevel = 2 )
268263
269- if self .conf_xgboost and isinstance (self .conf_xgboost , XgboostTuneParamsConfig ):
264+ if self .conf_tuning and isinstance (self .conf_tuning , XgboostTuneParamsConfig ):
270265 if self .conf_training .cat_encoding_via_ml_algorithm :
271- if "exact" in self .conf_xgboost .tree_method :
272- self .conf_xgboost .tree_method .remove ("exact" )
273- message = f"""Categorical encoding via ML algorithm is enabled. The tree method 'exact' is not supported with categorical encoding within Xgboost. The tree method 'exact' has been removed. Using { self .conf_xgboost .tree_method } only during hyperparameter tuning."""
266+ if "exact" in self .conf_tuning .tree_method :
267+ self .conf_tuning .tree_method .remove ("exact" )
268+ message = f"""Categorical encoding via ML algorithm is enabled. The tree method 'exact' is not supported with categorical encoding within Xgboost. The tree method 'exact' has been removed. Using { self .conf_tuning .tree_method } only during hyperparameter tuning."""
274269 warnings .warn (message , UserWarning , stacklevel = 2 )
275270
276271 def fit (self , df : pd .DataFrame , target_col : str ) -> None :
@@ -312,15 +307,15 @@ def fit(self, df: pd.DataFrame, target_col: str) -> None:
312307 self .conf_training .train_split_stratify ,
313308 )
314309
315- if not self .conf_training .autotune_model and self .conf_params_xgboost :
316- self .conf_params_xgboost .params ["num_class" ] = (
317- self . conf_params_xgboost . params . get ( "num_class" , y_test .nunique () )
310+ if not self .conf_training .autotune_model and self .conf_params :
311+ self .conf_params .params ["num_class" ] = self . conf_params . params . get (
312+ "num_class" , y_test .nunique ()
318313 )
319314
320315 if self .custom_preprocessor :
321316 x_train , y_train = self .custom_preprocessor .fit_transform (x_train , y_train )
322317 x_test , y_test = self .custom_preprocessor .transform (
323- x_test , y_test , predicton_mode = False
318+ x_test , y_test , prediction_mode = False
324319 )
325320 feat_type_detector = FeatureTypeDetector (
326321 cat_columns = [], num_columns = [], date_columns = []
@@ -416,7 +411,7 @@ def fit(self, df: pd.DataFrame, target_col: str) -> None:
416411 x_train .copy (), y_train
417412 )
418413 x_test , y_test = self .custom_last_mile_computation .transform (
419- x_test .copy (), y_test , predicton_mode = False
414+ x_test .copy (), y_test , prediction_mode = False
420415 )
421416
422417 if not self .custom_feature_selector :
@@ -430,21 +425,21 @@ def fit(self, df: pd.DataFrame, target_col: str) -> None:
430425 x_train .copy (), y_train
431426 )
432427 x_test , _ = self .custom_feature_selector .transform (
433- x_test .copy (), predicton_mode = False
428+ x_test .copy (), prediction_mode = False
434429 )
435430
436431 if not self .ml_model :
437432 self .ml_model = CatboostModel (
438433 self .class_problem ,
439434 conf_training = self .conf_training ,
440435 conf_catboost = (
441- self .conf_xgboost
442- if isinstance (self .conf_xgboost , CatboostTuneParamsConfig )
436+ self .conf_tuning
437+ if isinstance (self .conf_tuning , CatboostTuneParamsConfig )
443438 else CatboostTuneParamsConfig ()
444439 ),
445440 conf_params_catboost = (
446- self .conf_params_xgboost
447- if isinstance (self .conf_params_xgboost , CatboostFinalParamConfig )
441+ self .conf_params
442+ if isinstance (self .conf_params , CatboostFinalParamConfig )
448443 else CatboostFinalParamConfig ()
449444 ),
450445 experiment_tracker = self .experiment_tracker ,
@@ -468,16 +463,16 @@ def fit(self, df: pd.DataFrame, target_col: str) -> None:
468463 self .ml_model .conf_training = self .conf_training
469464 if isinstance (self .ml_model , CatboostModel ):
470465 # Ensure CatBoost final params config exists and is of correct type
471- if isinstance (self .conf_params_xgboost , CatboostFinalParamConfig ):
472- self .ml_model .conf_params_catboost = self .conf_params_xgboost
466+ if isinstance (self .conf_params , CatboostFinalParamConfig ):
467+ self .ml_model .conf_params_catboost = self .conf_params
473468 else :
474469 self .ml_model .conf_params_catboost = CatboostFinalParamConfig ()
475470
476471 self .ml_model .fit (x_train , x_test , y_train , y_test )
477472
478473 if self .custom_in_fold_preprocessor :
479474 x_test , _ = self .custom_in_fold_preprocessor .transform (
480- x_test .copy (), None , predicton_mode = True
475+ x_test .copy (), None , prediction_mode = True
481476 )
482477
483478 if self .conf_training and self .conf_training .calculate_shap_values :
@@ -536,9 +531,9 @@ def fit_eval(
536531 raise ValueError ("Could not find CatBoost params" )
537532 final_params_for_log = self .ml_model .conf_params_catboost .params
538533 else :
539- if not self .conf_params_xgboost :
534+ if not self .conf_params :
540535 raise ValueError ("Could not find Xgboost params" )
541- final_params_for_log = self .conf_params_xgboost .params
536+ final_params_for_log = self .conf_params .params
542537
543538 if len (self .experiment_tracker .experiment_id ) == 0 :
544539 self .experiment_tracker .experiment_id .append (0 )
@@ -578,7 +573,7 @@ def fit_eval(
578573 score_category = "oof_score" ,
579574 training_config = self .conf_training ,
580575 model_parameters = final_params_for_log , # noqa
581- eval_scores = self .eval_metrics ["accuracy" ],
576+ eval_scores = self .eval_metrics [metric ],
582577 metric_used = metric ,
583578 metric_higher_is_better = higher_is_better ,
584579 )
@@ -587,17 +582,17 @@ def fit_eval(
587582 def transform_new_data (self , df : pd .DataFrame ) -> pd .DataFrame :
588583 """Transform new data according to preprocessing pipeline."""
589584 if not self .feat_type_detector :
590- raise Exception ("Feature type converter could not be found." )
585+ raise RuntimeError ("Feature type converter could not be found." )
591586
592587 if not self .conf_training :
593- raise Exception ("Training configuration could not be found." )
588+ raise RuntimeError ("Training configuration could not be found." )
594589
595590 df = self .feat_type_detector .transform_feature_types (
596591 df , ignore_cols = [self .target_column ]
597592 )
598593
599594 if self .custom_preprocessor :
600- df , _ = self .custom_preprocessor .transform (df , predicton_mode = True )
595+ df , _ = self .custom_preprocessor .transform (df , prediction_mode = True )
601596 df = df .reset_index (drop = True )
602597
603598 df = fill_infinite_values (df )
@@ -645,12 +640,12 @@ def transform_new_data(self, df: pd.DataFrame) -> pd.DataFrame:
645640
646641 if self .custom_last_mile_computation :
647642 df , _ = self .custom_last_mile_computation .transform (
648- df .copy (), predicton_mode = True
643+ df .copy (), prediction_mode = True
649644 )
650645
651646 if self .custom_feature_selector and self .conf_training .enable_feature_selection :
652647 df , _ = self .custom_feature_selector .transform (
653- df .copy (), predicton_mode = True
648+ df .copy (), prediction_mode = True
654649 )
655650
656651 if self .conf_training .cat_encoding_via_ml_algorithm and self .cat_columns :
@@ -676,13 +671,13 @@ def predict(
676671 :param return_original_labels: If True, returns the original labels instead of the encoded ones.
677672 """
678673 if not self .ml_model :
679- raise Exception ( "Ml model could not be found" )
674+ raise RuntimeError ( "ML model could not be found. " )
680675
681676 if not self .feat_type_detector :
682- raise Exception ("Feature type converter could not be found." )
677+ raise RuntimeError ("Feature type converter could not be found." )
683678
684679 if not self .conf_training :
685- raise ValueError ( "conf_training is None" )
680+ raise RuntimeError ( "Training configuration is None. " )
686681
687682 df = self .transform_new_data (df )
688683
@@ -712,13 +707,13 @@ def predict_proba(
712707 waterfall plots for selected rows on demand.
713708 """
714709 if not self .ml_model :
715- raise Exception ( "Ml model could not be found" )
710+ raise RuntimeError ( "ML model could not be found. " )
716711
717712 if not self .feat_type_detector :
718- raise Exception ("Feature type converter could not be found." )
713+ raise RuntimeError ("Feature type converter could not be found." )
719714
720715 if not self .conf_training :
721- raise ValueError ( "conf_training is None" )
716+ raise RuntimeError ( "Training configuration is None. " )
722717
723718 df = self .transform_new_data (df )
724719
@@ -798,17 +793,21 @@ def predict_sets(self, df: pd.DataFrame, alpha: float = 0.05) -> pd.DataFrame:
798793
799794 string_pred_sets = []
800795 for numerical_set in pred_sets :
801- # Convert numerical labels to string labels
802- string_set = {reverse_mapping [label ] for label in numerical_set }
796+ string_set = {
797+ reverse_mapping [i ]
798+ for i , indicator in enumerate (numerical_set )
799+ if indicator == 1
800+ }
803801 string_pred_sets .append (string_set )
804802 return pd .DataFrame ({"prediction_set" : string_pred_sets })
805803 else :
806- string_pred_sets = []
804+ pred_sets_list : list = []
807805 for numerical_set in pred_sets :
808- # Convert numerical labels to string labels
809- string_set = {label for label in numerical_set }
810- string_pred_sets .append (string_set )
811- return pd .DataFrame ({"prediction_set" : string_pred_sets })
806+ pred_set = {
807+ i for i , indicator in enumerate (numerical_set ) if indicator == 1
808+ }
809+ pred_sets_list .append (pred_set )
810+ return pd .DataFrame ({"prediction_set" : pred_sets_list })
812811 else :
813812 raise ValueError (
814813 """This instance has not been calibrated yet. Make use of calibrate to fit the
0 commit comments