|
1 | 1 | from DimRed import *
|
| 2 | + |
| 3 | + |
class Evaluation:
    """Benchmark classifiers on every dimensionality-reduction pipeline variation.

    For each pipeline variation, the training data is transformed and three
    model families are trained and scored (sklearn grid-searched estimators,
    XGBoost, LightGBM), with every run logged to Weights & Biases. The best
    variation per pipeline family (by the average of ``metric`` across the
    three model families) is collected and dumped to JSON.

    NOTE(review): this class relies on names star-imported from ``DimRed``
    (``wandb``, ``xgb``, ``lgb``, ``cp``, ``GridSearchCV``,
    ``classification_report``, ``label_encoding``, ``tqdm``,
    ``average_metric``, ``add_to_dictionary``, ``PROJECT_NAME`` and the
    ``*_config`` dicts) — confirm they are all exported there.
    """

    def __init__(self, _data: Dict[str, np.ndarray],
                 all_possible_variations: Dict[str, List],
                 labels: np.ndarray,
                 metric: str = "accuracy",
                 sklearn_config: Dict[Any, Dict[str, Union[str, int]]] = sklearn_config,
                 lgb_config: Dict[str, Union[str, int]] = lgb_config,
                 xgb_config: Dict[str, Union[str, int]] = xgb_config) -> None:
        """Store data, pipeline variations, labels and model configurations.

        Args:
            _data: expects keys ``'X_train'``, ``'X_test'``, ``'y_train'``,
                ``'y_test'`` (used in :meth:`evaluate`).
            all_possible_variations: mapping of pipeline-family name to the
                list of concrete pipeline variations to try.
            labels: full label array (kept for reference; not read by
                :meth:`evaluate` itself).
            metric: key into ``classification_report``'s output dict used to
                rank models (default ``"accuracy"``).
            sklearn_config: mapping of estimator class -> grid-search params.
            lgb_config / xgb_config: keyword configs for the boosters.
        """
        self.sklearn_config = sklearn_config
        self.lgb_config = lgb_config
        self.xgb_config = xgb_config
        self._data = _data
        self.all_variations = all_possible_variations
        self.labels = labels
        self.metric = metric

    def sklearn(self, X_train: np.ndarray, X_test: np.ndarray,
                y_train: np.ndarray, y_test: np.ndarray,
                results: Optional[Dict] = None,
                dimred_technique: Optional[str] = None) -> Tuple[Dict[str, Union[str, int]], Dict[str, int]]:
        """Grid-search every configured sklearn estimator and log to W&B.

        Returns:
            ``(results, best_metrics)`` — ``results`` maps run name to its
            classification report; ``best_metrics`` is the report of the
            estimator with the highest ``self.metric`` (``{}`` if none beat 0).
        """
        # BUG FIX: a mutable default ({}) was shared across calls.
        results = {} if results is None else results
        prefix = dimred_technique or ""
        best_score = 0
        best_metrics: Dict[str, int] = {}
        for estimator_cls in self.sklearn_config:
            # Capture the estimator's own name before wrapping it in
            # GridSearchCV. The original keyed results on the *search*
            # object's class, so every entry was stored (and overwritten)
            # under "GridSearchCV".
            name = prefix + estimator_cls.__name__
            model_config = self.sklearn_config[estimator_cls]
            wandb.init(project=PROJECT_NAME, name=name, config={
                "model": name, "results": results,
                "modelLibrary": "sklearn", "config": model_config})
            search = GridSearchCV(estimator_cls(), model_config, cv=5, verbose=0)
            # Flatten (n, 1) labels to (n,) as sklearn expects.
            y_train_flat = y_train.ravel()
            search.fit(X_train, y_train_flat)
            y_preds = search.predict(X_test)
            y_probas = search.predict_proba(X_test)
            metrics = classification_report(y_test, y_preds, output_dict=True)
            # Key on the prefixed run name, consistent with xgb()/lgb().
            results[name] = metrics
            wandb.log(metrics)
            wandb.sklearn.plot_classifier(
                search,
                X_train,
                X_test,
                y_train_flat,
                y_test,
                y_preds,
                y_probas,
                range(min(y_probas.shape)),  # class indices inferred from proba width
                model_name=name,
                feature_names=None,
            )
            if metrics[self.metric] > best_score:
                best_score = metrics[self.metric]
                best_metrics = metrics
            wandb.finish()
        return results, best_metrics

    def xgb(self, X_train: np.ndarray, X_test: np.ndarray,
            y_train: np.ndarray, y_test: np.ndarray,
            results: Optional[Dict] = None,
            dimred_technique: Optional[str] = None) -> Tuple[Dict[str, Union[str, int]], Dict[str, int]]:
        """Train an XGBoost classifier (on GPU via cupy arrays) and log to W&B.

        Returns:
            ``(results, metrics)`` with the run's classification report added
            to ``results`` under its prefixed name.
        """
        # BUG FIX: a mutable default ({}) was shared across calls.
        results = {} if results is None else results
        model = xgb.XGBClassifier(**self.xgb_config)
        name = (dimred_technique or "") + model.__class__.__name__
        wandb.init(project=PROJECT_NAME, name=name, config={
            "config": self.xgb_config, "model": name,
            "results": results, "modelLibrary": "XGB"})
        # Labels must be integer-encoded for XGBoost.
        y_train, y_test = label_encoding(y_train, y_test)
        model.fit(cp.asarray(X_train), cp.asarray(y_train),
                  eval_set=[(cp.asarray(X_test), cp.asarray(y_test))],
                  callbacks=[WandbCallback(log_model=True)])
        y_preds = model.predict(X_test)
        metrics = classification_report(y_test, y_preds, output_dict=True)
        results[name] = metrics
        wandb.log(metrics)
        wandb.finish()
        return results, metrics

    def lgb(self, X_train: np.ndarray, X_test: np.ndarray,
            y_train: np.ndarray, y_test: np.ndarray,
            results: Optional[Dict] = None,
            dimred_technique: Optional[str] = None) -> Tuple[Dict[str, Union[str, int]], Dict[str, int]]:
        """Train a LightGBM classifier and log to W&B.

        Returns:
            ``(results, metrics)`` with the run's classification report added
            to ``results`` under its prefixed name.
        """
        # BUG FIX: a mutable default ({}) was shared across calls.
        results = {} if results is None else results
        name = (dimred_technique or "") + "LGBClf"
        wandb.init(project=PROJECT_NAME, name=name, config={
            "config": self.lgb_config, "results": results,
            "modelLibrary": "LGB"})
        y_train, y_test = label_encoding(y_train, y_test)
        train_data = lgb.Dataset(X_train, label=y_train)
        test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
        model = lgb.train(self.lgb_config, train_data,
                          valid_sets=[test_data], callbacks=[wandb_callback()])
        # lgb.train returns per-class probabilities; argmax -> class ids.
        y_preds = model.predict(X_test)
        metrics = classification_report(
            y_test, np.argmax(y_preds, axis=1), output_dict=True)
        results[name] = metrics
        log_summary(model, save_model_checkpoint=True)
        wandb.log(metrics)
        wandb.finish()
        return results, metrics

    def evaluate(self) -> Dict[str, Dict[str, Dict[str, Union[str, int]]]]:
        """Run all three model families over every pipeline variation.

        Returns:
            ``(all_pipeline_performance, best_performances)`` — also written
            to ``all_performance_data.json`` / ``best_performance_dimred.json``.
        """
        all_pipeline_performance = {}
        outer_iterator = tqdm(self.all_variations)
        best_performances = {
            self.metric: [],
            "pipeline_variation": [],
            "pipeline_name": [],
        }
        for pipeline_variation_name in outer_iterator:
            # [best avg score, str(best variation), family name]
            best_performing_pipeline = [0, None, pipeline_variation_name]
            specific_pipeline_variations = self.all_variations[pipeline_variation_name]
            inner_iterator = tqdm(specific_pipeline_variations, leave=False)
            for pipeline_variation in inner_iterator:
                # NOTE(review): keys on the variation's *class* name — if two
                # variations share a class, later ones overwrite earlier ones
                # in all_pipeline_performance. Confirm this is intended.
                name_of_pipeline = pipeline_variation.__class__.__name__
                pipeline_performance = {}
                X_train = pipeline_variation.fit_transform(
                    self._data['X_train'])
                X_test = pipeline_variation.transform(self._data['X_test'])
                inner_iterator.set_description("Sklearn Model...")
                pipeline_performance, sklearn_metrics = self.sklearn(
                    X_train, X_test, self._data['y_train'], self._data['y_test'],
                    pipeline_performance, name_of_pipeline)
                inner_iterator.set_description("Sklearn Model Done :)")
                inner_iterator.set_description("XGB Model...")
                pipeline_performance, xgb_metrics = self.xgb(
                    X_train, X_test, self._data['y_train'], self._data['y_test'],
                    pipeline_performance, name_of_pipeline)
                inner_iterator.set_description("XGB Model Done :)")
                inner_iterator.set_description("LGB Model...")
                pipeline_performance, lgb_metrics = self.lgb(
                    X_train, X_test, self._data['y_train'], self._data['y_test'],
                    pipeline_performance, name_of_pipeline)
                inner_iterator.set_description("LGB Model Done :)")
                all_pipeline_performance[name_of_pipeline] = pipeline_performance
                # Rank variations by the mean of self.metric across families.
                avg_var = average_metric(
                    self.metric, [sklearn_metrics, xgb_metrics, lgb_metrics])
                if avg_var > best_performing_pipeline[0]:
                    best_performing_pipeline[0] = avg_var
                    best_performing_pipeline[1] = str(pipeline_variation)
                inner_iterator.set_description(
                    f"{name_of_pipeline} Done :)")
            best_performances = add_to_dictionary(
                best_performances, best_performing_pipeline)
        with open('all_performance_data.json', 'w') as f:
            json.dump(all_pipeline_performance, f)
        with open('best_performance_dimred.json', 'w') as json_f:
            json.dump(best_performances, json_f)
        return all_pipeline_performance, best_performances
0 commit comments