Skip to content

Commit 183041c

Browse files
Update to version 3.0
1 parent b9f0e0a commit 183041c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

52 files changed

+731
-937
lines changed

.pre-commit-config.yaml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,23 @@
11
default_stages: [pre-commit]
22
repos:
33
- repo: https://github.com/pre-commit/pre-commit-hooks
4-
rev: v4.5.0 #v4.0.1
4+
rev: v6.0.0 #v4.0.1
55
hooks:
66
- id: end-of-file-fixer
77
- id: trailing-whitespace
88
- repo: https://github.com/psf/black
9-
rev: 25.1.0
9+
rev: 26.1.0
1010
hooks:
1111
- id: black
1212
exclude: ^.github/
1313
- repo: https://github.com/pre-commit/mirrors-mypy
14-
rev: v1.17.0 #0.910
14+
rev: v1.19.1 #0.910
1515
hooks:
1616
- id: mypy
1717
additional_dependencies: [types-requests]
1818
exclude: ^.github/
1919
- repo: https://github.com/pycqa/isort
20-
rev: 5.13.2
20+
rev: 8.0.1
2121
hooks:
2222
- id: isort
2323
args: [--profile, black, --filter-files]
@@ -42,7 +42,7 @@ repos:
4242
files: ^README\.md$
4343
args: [-i, --bullets, "*"]
4444
- repo: https://github.com/DavidAnson/markdownlint-cli2
45-
rev: v0.18.1 #v0.2.0
45+
rev: v0.21.0 #v0.2.0
4646
hooks:
4747
- id: markdownlint-cli2
4848
exclude: ^.github/

bluecast/blueprints/cast.py

Lines changed: 58 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@
5454
)
5555
from bluecast.preprocessing.train_test_split import train_test_split
5656

57+
logger = logging.getLogger(__name__)
58+
5759

5860
class BlueCast:
5961
"""Run fully configured classification blueprint.
@@ -70,12 +72,12 @@ class BlueCast:
7072
BlueCast will not split the data by time or order, but do a random split instead.
7173
:param :ml_model: Takes an instance of a CatboostModel class. If not provided, BlueCast will instantiate one.
7274
This is an API to pass any model class. Inherit the baseclass from ml_modelling.base_model.BaseModel.
73-
:param custom_in_fold_preprocessor: Takes an instance of a CustomPreprocessing class. Allows users to eeecute
75+
:param custom_in_fold_preprocessor: Takes an instance of a CustomPreprocessing class. Allows users to execute
7476
preprocessing after the train test split within cv folds. This will be executed only if precise_cv_tuning in
7577
the conf_Training is True. Custom ML models need to implement this themselves. This step is only useful when
76-
the proprocessing step has a high chance of overfitting otherwise (i.e: oversampling techniques).
78+
the preprocessing step has a high chance of overfitting otherwise (i.e: oversampling techniques).
7779
:param custom_preprocessor: Takes an instance of a CustomPreprocessing class. Allows users to inject custom
78-
preprocessing steps which take place right after the train test spit.
80+
preprocessing steps which take place right after the train test split.
7981
:param custom_last_mile_computation: Takes an instance of a CustomPreprocessing class. Allows users to inject custom
8082
preprocessing steps which take place right before the model training.
8183
:param experiment_tracker: Takes an instance of an ExperimentTracker class. If not provided this will be initialized
@@ -91,18 +93,18 @@ def __init__(
9193
cat_columns: Optional[List[Union[str, float, int]]] = None,
9294
date_columns: Optional[List[Union[str, float, int]]] = None,
9395
time_split_column: Optional[str] = None,
94-
ml_model: Optional[Union[CatboostModel, Any]] = None,
96+
ml_model: Optional[Any] = None,
9597
custom_in_fold_preprocessor: Optional[CustomPreprocessing] = None,
9698
custom_last_mile_computation: Optional[CustomPreprocessing] = None,
9799
custom_preprocessor: Optional[CustomPreprocessing] = None,
98100
custom_feature_selector: Optional[
99101
Union[BoostaRootaWrapper, CustomPreprocessing]
100102
] = None,
101103
conf_training: Optional[TrainingConfig] = None,
102-
conf_xgboost: Optional[
104+
conf_tuning: Optional[
103105
Union[XgboostTuneParamsConfig, CatboostTuneParamsConfig]
104106
] = None,
105-
conf_params_xgboost: Optional[
107+
conf_params: Optional[
106108
Union[XgboostFinalParamConfig, CatboostFinalParamConfig]
107109
] = None,
108110
experiment_tracker: Optional[ExperimentTracker] = None,
@@ -119,8 +121,8 @@ def __init__(
119121
self.date_columns = date_columns
120122
self.time_split_column = time_split_column
121123
self.target_column = "Undefined"
122-
self.conf_xgboost = conf_xgboost
123-
self.conf_params_xgboost = conf_params_xgboost
124+
self.conf_tuning = conf_tuning
125+
self.conf_params = conf_params
124126
self.feat_type_detector: Optional[FeatureTypeDetector] = None
125127
self.infreq_cat_encoder: Optional[InFrequentCategoryEncoder] = None
126128
self.cat_encoder: Optional[
@@ -147,23 +149,16 @@ def __init__(
147149
else:
148150
self.experiment_tracker = ExperimentTracker()
149151

150-
if not self.conf_params_xgboost:
151-
self.conf_params_xgboost = CatboostFinalParamConfig()
152+
if not self.conf_params:
153+
self.conf_params = CatboostFinalParamConfig()
152154

153155
self.conf_training: TrainingConfig = conf_training or TrainingConfig()
154156

155-
if not self.conf_xgboost:
156-
self.conf_xgboost = CatboostTuneParamsConfig()
157+
if not self.conf_tuning:
158+
self.conf_tuning = CatboostTuneParamsConfig()
157159
if not self.single_fold_eval_metric_func:
158160
self.single_fold_eval_metric_func = ClassificationEvalWrapper()
159-
logging.basicConfig(
160-
filename=self.conf_training.logging_file_path,
161-
filemode="w",
162-
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
163-
level=logging.INFO,
164-
force=True,
165-
)
166-
logging.info("BlueCast blueprint initialized.")
161+
logger.info("BlueCast blueprint initialized.")
167162

168163
def initial_checks(self, df: pd.DataFrame) -> None:
169164
if not self.conf_training:
@@ -193,7 +188,7 @@ def initial_checks(self, df: pd.DataFrame) -> None:
193188
feature selector."""
194189
warnings.warn(message, UserWarning, stacklevel=2)
195190

196-
if not self.conf_xgboost:
191+
if not self.conf_tuning:
197192
message = """No CatboostTuneParamsConfig has been provided. Falling back to default values. Default values
198193
have been chosen to speed up the prototyping. For robust hyperparameter tuning consider providing a custom
199194
CatboostTuneParamsConfig with a deeper hyperparameter search space and a custom TrainingConfig to enable
@@ -266,11 +261,11 @@ def initial_checks(self, df: pd.DataFrame) -> None:
266261
unique target classes have been found. Did you mean 'binary' instead?"""
267262
warnings.warn(message, UserWarning, stacklevel=2)
268263

269-
if self.conf_xgboost and isinstance(self.conf_xgboost, XgboostTuneParamsConfig):
264+
if self.conf_tuning and isinstance(self.conf_tuning, XgboostTuneParamsConfig):
270265
if self.conf_training.cat_encoding_via_ml_algorithm:
271-
if "exact" in self.conf_xgboost.tree_method:
272-
self.conf_xgboost.tree_method.remove("exact")
273-
message = f"""Categorical encoding via ML algorithm is enabled. The tree method 'exact' is not supported with categorical encoding within Xgboost. The tree method 'exact' has been removed. Using {self.conf_xgboost.tree_method} only during hyperparameter tuning."""
266+
if "exact" in self.conf_tuning.tree_method:
267+
self.conf_tuning.tree_method.remove("exact")
268+
message = f"""Categorical encoding via ML algorithm is enabled. The tree method 'exact' is not supported with categorical encoding within Xgboost. The tree method 'exact' has been removed. Using {self.conf_tuning.tree_method} only during hyperparameter tuning."""
274269
warnings.warn(message, UserWarning, stacklevel=2)
275270

276271
def fit(self, df: pd.DataFrame, target_col: str) -> None:
@@ -312,15 +307,15 @@ def fit(self, df: pd.DataFrame, target_col: str) -> None:
312307
self.conf_training.train_split_stratify,
313308
)
314309

315-
if not self.conf_training.autotune_model and self.conf_params_xgboost:
316-
self.conf_params_xgboost.params["num_class"] = (
317-
self.conf_params_xgboost.params.get("num_class", y_test.nunique())
310+
if not self.conf_training.autotune_model and self.conf_params:
311+
self.conf_params.params["num_class"] = self.conf_params.params.get(
312+
"num_class", y_test.nunique()
318313
)
319314

320315
if self.custom_preprocessor:
321316
x_train, y_train = self.custom_preprocessor.fit_transform(x_train, y_train)
322317
x_test, y_test = self.custom_preprocessor.transform(
323-
x_test, y_test, predicton_mode=False
318+
x_test, y_test, prediction_mode=False
324319
)
325320
feat_type_detector = FeatureTypeDetector(
326321
cat_columns=[], num_columns=[], date_columns=[]
@@ -416,7 +411,7 @@ def fit(self, df: pd.DataFrame, target_col: str) -> None:
416411
x_train.copy(), y_train
417412
)
418413
x_test, y_test = self.custom_last_mile_computation.transform(
419-
x_test.copy(), y_test, predicton_mode=False
414+
x_test.copy(), y_test, prediction_mode=False
420415
)
421416

422417
if not self.custom_feature_selector:
@@ -430,21 +425,21 @@ def fit(self, df: pd.DataFrame, target_col: str) -> None:
430425
x_train.copy(), y_train
431426
)
432427
x_test, _ = self.custom_feature_selector.transform(
433-
x_test.copy(), predicton_mode=False
428+
x_test.copy(), prediction_mode=False
434429
)
435430

436431
if not self.ml_model:
437432
self.ml_model = CatboostModel(
438433
self.class_problem,
439434
conf_training=self.conf_training,
440435
conf_catboost=(
441-
self.conf_xgboost
442-
if isinstance(self.conf_xgboost, CatboostTuneParamsConfig)
436+
self.conf_tuning
437+
if isinstance(self.conf_tuning, CatboostTuneParamsConfig)
443438
else CatboostTuneParamsConfig()
444439
),
445440
conf_params_catboost=(
446-
self.conf_params_xgboost
447-
if isinstance(self.conf_params_xgboost, CatboostFinalParamConfig)
441+
self.conf_params
442+
if isinstance(self.conf_params, CatboostFinalParamConfig)
448443
else CatboostFinalParamConfig()
449444
),
450445
experiment_tracker=self.experiment_tracker,
@@ -468,16 +463,16 @@ def fit(self, df: pd.DataFrame, target_col: str) -> None:
468463
self.ml_model.conf_training = self.conf_training
469464
if isinstance(self.ml_model, CatboostModel):
470465
# Ensure CatBoost final params config exists and is of correct type
471-
if isinstance(self.conf_params_xgboost, CatboostFinalParamConfig):
472-
self.ml_model.conf_params_catboost = self.conf_params_xgboost
466+
if isinstance(self.conf_params, CatboostFinalParamConfig):
467+
self.ml_model.conf_params_catboost = self.conf_params
473468
else:
474469
self.ml_model.conf_params_catboost = CatboostFinalParamConfig()
475470

476471
self.ml_model.fit(x_train, x_test, y_train, y_test)
477472

478473
if self.custom_in_fold_preprocessor:
479474
x_test, _ = self.custom_in_fold_preprocessor.transform(
480-
x_test.copy(), None, predicton_mode=True
475+
x_test.copy(), None, prediction_mode=True
481476
)
482477

483478
if self.conf_training and self.conf_training.calculate_shap_values:
@@ -536,9 +531,9 @@ def fit_eval(
536531
raise ValueError("Could not find CatBoost params")
537532
final_params_for_log = self.ml_model.conf_params_catboost.params
538533
else:
539-
if not self.conf_params_xgboost:
534+
if not self.conf_params:
540535
raise ValueError("Could not find Xgboost params")
541-
final_params_for_log = self.conf_params_xgboost.params
536+
final_params_for_log = self.conf_params.params
542537

543538
if len(self.experiment_tracker.experiment_id) == 0:
544539
self.experiment_tracker.experiment_id.append(0)
@@ -578,7 +573,7 @@ def fit_eval(
578573
score_category="oof_score",
579574
training_config=self.conf_training,
580575
model_parameters=final_params_for_log, # noqa
581-
eval_scores=self.eval_metrics["accuracy"],
576+
eval_scores=self.eval_metrics[metric],
582577
metric_used=metric,
583578
metric_higher_is_better=higher_is_better,
584579
)
@@ -587,17 +582,17 @@ def fit_eval(
587582
def transform_new_data(self, df: pd.DataFrame) -> pd.DataFrame:
588583
"""Transform new data according to preprocessing pipeline."""
589584
if not self.feat_type_detector:
590-
raise Exception("Feature type converter could not be found.")
585+
raise RuntimeError("Feature type converter could not be found.")
591586

592587
if not self.conf_training:
593-
raise Exception("Training configuration could not be found.")
588+
raise RuntimeError("Training configuration could not be found.")
594589

595590
df = self.feat_type_detector.transform_feature_types(
596591
df, ignore_cols=[self.target_column]
597592
)
598593

599594
if self.custom_preprocessor:
600-
df, _ = self.custom_preprocessor.transform(df, predicton_mode=True)
595+
df, _ = self.custom_preprocessor.transform(df, prediction_mode=True)
601596
df = df.reset_index(drop=True)
602597

603598
df = fill_infinite_values(df)
@@ -645,12 +640,12 @@ def transform_new_data(self, df: pd.DataFrame) -> pd.DataFrame:
645640

646641
if self.custom_last_mile_computation:
647642
df, _ = self.custom_last_mile_computation.transform(
648-
df.copy(), predicton_mode=True
643+
df.copy(), prediction_mode=True
649644
)
650645

651646
if self.custom_feature_selector and self.conf_training.enable_feature_selection:
652647
df, _ = self.custom_feature_selector.transform(
653-
df.copy(), predicton_mode=True
648+
df.copy(), prediction_mode=True
654649
)
655650

656651
if self.conf_training.cat_encoding_via_ml_algorithm and self.cat_columns:
@@ -676,13 +671,13 @@ def predict(
676671
:param return_original_labels: If True, returns the original labels instead of the encoded ones.
677672
"""
678673
if not self.ml_model:
679-
raise Exception("Ml model could not be found")
674+
raise RuntimeError("ML model could not be found.")
680675

681676
if not self.feat_type_detector:
682-
raise Exception("Feature type converter could not be found.")
677+
raise RuntimeError("Feature type converter could not be found.")
683678

684679
if not self.conf_training:
685-
raise ValueError("conf_training is None")
680+
raise RuntimeError("Training configuration is None.")
686681

687682
df = self.transform_new_data(df)
688683

@@ -712,13 +707,13 @@ def predict_proba(
712707
waterfall plots for selected rows o demand.
713708
"""
714709
if not self.ml_model:
715-
raise Exception("Ml model could not be found")
710+
raise RuntimeError("ML model could not be found.")
716711

717712
if not self.feat_type_detector:
718-
raise Exception("Feature type converter could not be found.")
713+
raise RuntimeError("Feature type converter could not be found.")
719714

720715
if not self.conf_training:
721-
raise ValueError("conf_training is None")
716+
raise RuntimeError("Training configuration is None.")
722717

723718
df = self.transform_new_data(df)
724719

@@ -798,17 +793,21 @@ def predict_sets(self, df: pd.DataFrame, alpha: float = 0.05) -> pd.DataFrame:
798793

799794
string_pred_sets = []
800795
for numerical_set in pred_sets:
801-
# Convert numerical labels to string labels
802-
string_set = {reverse_mapping[label] for label in numerical_set}
796+
string_set = {
797+
reverse_mapping[i]
798+
for i, indicator in enumerate(numerical_set)
799+
if indicator == 1
800+
}
803801
string_pred_sets.append(string_set)
804802
return pd.DataFrame({"prediction_set": string_pred_sets})
805803
else:
806-
string_pred_sets = []
804+
pred_sets_list: list = []
807805
for numerical_set in pred_sets:
808-
# Convert numerical labels to string labels
809-
string_set = {label for label in numerical_set}
810-
string_pred_sets.append(string_set)
811-
return pd.DataFrame({"prediction_set": string_pred_sets})
806+
pred_set = {
807+
i for i, indicator in enumerate(numerical_set) if indicator == 1
808+
}
809+
pred_sets_list.append(pred_set)
810+
return pd.DataFrame({"prediction_set": pred_sets_list})
812811
else:
813812
raise ValueError(
814813
"""This instance has not been calibrated yet. Make use of calibrate to fit the

0 commit comments

Comments
 (0)