From 5121800c66c20b3b2a4c2620773d00ae551845bf Mon Sep 17 00:00:00 2001
From: tashiro akira
Date: Tue, 24 Oct 2023 10:40:45 +0900
Subject: [PATCH 1/2] Fixed an error when there were many missing bool columns in the input data

Signed-off-by: tashiro akira
---
Reviewer notes (this region is ignored by git am):
* Purpose: cast the "almost missing" string columns to str in the combined,
  predict-only and train-only fillna templates.
* astype(str) renders a missing value as the literal text 'nan', after which
  the existing fillna('') has nothing left to fill. The added lines therefore
  blank the originally-missing cells with mask(<frame>.isna(), '') right
  after the cast, so missing values still end up as '' as the surrounding
  fillna('') lines intend.
* Hunk line counts are unchanged. The blob ids on the "index" lines are those
  of the original submission and were not recomputed.

 .../preprocessing_templates/fillna-type-string.py.jinja         | 2 ++
 .../preprocessing_templates/fillna-type-string_predict.py.jinja | 1 +
 .../preprocessing_templates/fillna-type-string_train.py.jinja   | 1 +
 3 files changed, 4 insertions(+)

diff --git a/sapientml_core/templates/preprocessing_templates/fillna-type-string.py.jinja b/sapientml_core/templates/preprocessing_templates/fillna-type-string.py.jinja
index 5a8da92..5d0787b 100644
--- a/sapientml_core/templates/preprocessing_templates/fillna-type-string.py.jinja
+++ b/sapientml_core/templates/preprocessing_templates/fillna-type-string.py.jinja
@@ -9,6 +9,8 @@ simple_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
 {% endif %}
 {% if cols_almost_missing_string %}
 STRING_ALMOST_MISSING_COLS = {{ cols_almost_missing_string }}
+{{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str).mask({{ train_dataset }}[STRING_ALMOST_MISSING_COLS].isna(), '')
+{{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str).mask({{ test_dataset }}[STRING_ALMOST_MISSING_COLS].isna(), '')
 {{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('')
 {{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('')
 {% endif %}
\ No newline at end of file
diff --git a/sapientml_core/templates/preprocessing_templates/fillna-type-string_predict.py.jinja b/sapientml_core/templates/preprocessing_templates/fillna-type-string_predict.py.jinja
index ef58ba7..472ec11 100644
--- a/sapientml_core/templates/preprocessing_templates/fillna-type-string_predict.py.jinja
+++ b/sapientml_core/templates/preprocessing_templates/fillna-type-string_predict.py.jinja
@@ -7,5 +7,6 @@ STRING_COLS_WITH_MISSING_VALUES = {{ columns }}
 {% endif %}
 {% if cols_almost_missing_string %}
 STRING_ALMOST_MISSING_COLS = {{ cols_almost_missing_string }}
+{{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str).mask({{ test_dataset }}[STRING_ALMOST_MISSING_COLS].isna(), '')
 {{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('')
 {% endif %}
\ No newline at end of file
diff --git a/sapientml_core/templates/preprocessing_templates/fillna-type-string_train.py.jinja b/sapientml_core/templates/preprocessing_templates/fillna-type-string_train.py.jinja
index 404804b..81d5621 100644
--- a/sapientml_core/templates/preprocessing_templates/fillna-type-string_train.py.jinja
+++ b/sapientml_core/templates/preprocessing_templates/fillna-type-string_train.py.jinja
@@ -11,5 +11,6 @@ with open('simpleimputer-string.pkl', 'wb') as f:
 {% endif %}
 {% if cols_almost_missing_string %}
 STRING_ALMOST_MISSING_COLS = {{ cols_almost_missing_string }}
+{{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str).mask({{ train_dataset }}[STRING_ALMOST_MISSING_COLS].isna(), '')
 {{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('')
 {% endif %}
\ No newline at end of file

From fc8a2d5a1486372e9f863c569ee228d6ded99312 Mon Sep 17 00:00:00 2001
From: tashiro-akira
Date: Fri, 28 Jun 2024 15:46:36 +0900
Subject: [PATCH 2/2] fix:Fixed processing when setting auc, logloss, and gini for calculation

---
Reviewer notes (this region is ignored by git am):
* Purpose: stop rewriting "predict" into "predict_proba" in the generated
  snippet; instead the post-process template computes a separate y_prob, and
  the AUC / Gini / LogLoss evaluation branches read y_prob. Multi-output
  variants of those three metrics average a per-target-column score.
* The added multi-output AUC and Gini loops bound the loop variable as `col`
  but indexed target_test[column], which raises NameError in the generated
  script. The loop variable is now `column`, matching the LogLoss loop.
* NOTE(review): leading whitespace of the Python context lines in
  pipeline_template.py and the position of blank context lines were
  reconstructed from the hunk line counts - confirm against the target files
  (or apply with --ignore-whitespace).
* NOTE(review): `log_loss = log_loss(...)` still rebinds the imported
  function name; left as submitted because the following print line is
  unchanged context.

 .../generation/pipeline_template.py           |  1 -
 .../classification_post_process.jinja         | 10 ++--
 .../other_templates/evaluation.py.jinja       | 51 ++++++++++++++++---
 3 files changed, 49 insertions(+), 13 deletions(-)

diff --git a/sapientml_core/adaptation/generation/pipeline_template.py b/sapientml_core/adaptation/generation/pipeline_template.py
index 84926fc..30a5f05 100644
--- a/sapientml_core/adaptation/generation/pipeline_template.py
+++ b/sapientml_core/adaptation/generation/pipeline_template.py
@@ -547,7 +547,6 @@ def populate_model(self):
                 or pipeline.adaptation_metric.startswith(macros.Metric.MAP_K.value)
                 or pipeline.config.predict_option == macros.PRED_PROBABILITY
             ):
-                snippet = snippet.replace("predict", "predict_proba")
                 tpl = env.get_template("model_templates/classification_post_process.jinja")
                 snippet += "\n" + self._render(tpl, pipeline=pipeline)
 
diff --git a/sapientml_core/templates/model_templates/classification_post_process.jinja b/sapientml_core/templates/model_templates/classification_post_process.jinja
index 7e25fe3..4e72780 100644
--- a/sapientml_core/templates/model_templates/classification_post_process.jinja
+++ b/sapientml_core/templates/model_templates/classification_post_process.jinja
@@ -1,8 +1,10 @@
+y_prob = model.predict_proba(feature_test)
+
 # POST PROCESSING
 {% if pipeline.adaptation_metric.startswith("MAP_") %}
-y_pred_sorted_index = pd.DataFrame(np.argsort(-y_pred))
-y_pred = y_pred_sorted_index.apply(lambda x: model.classes_[x]).to_numpy()
+y_prob_sorted_index = pd.DataFrame(np.argsort(-y_prob))
+y_prob = y_prob_sorted_index.apply(lambda x: model.classes_[x]).to_numpy()
 {% else %}
-if np.shape(y_pred)[1] == 2:
-    y_pred = y_pred[:, 1]
+if np.shape(y_prob)[1] == 2:
+    y_prob = y_prob[:, 1]
 {% endif %}
\ No newline at end of file
diff --git a/sapientml_core/templates/other_templates/evaluation.py.jinja b/sapientml_core/templates/other_templates/evaluation.py.jinja
index 151ce72..5091078 100644
--- a/sapientml_core/templates/other_templates/evaluation.py.jinja
+++ b/sapientml_core/templates/other_templates/evaluation.py.jinja
@@ -1,11 +1,23 @@
-{% if pipeline.adaptation_metric == macros.Metric.AUC.value %}
+{% if pipeline.adaptation_metric == macros.Metric.AUC.value and not is_multioutput_classification%}
 from sklearn.metrics import roc_auc_score
 {% if pipeline.task.is_multiclass == True %}
-auc = roc_auc_score(target_test.values.ravel(), y_pred, multi_class="ovr")
+auc = roc_auc_score(target_test.values.ravel(), y_prob, multi_class="ovr")
 {% else %}
-auc = roc_auc_score(target_test, y_pred)
+auc = roc_auc_score(target_test, y_prob)
 {% endif %}
 print('RESULT: AUC Score: ' + str(auc))
+{% elif pipeline.adaptation_metric == macros.Metric.AUC.value and is_multioutput_classification%}
+from sklearn.metrics import roc_auc_score
+auc_scores = []
+for i, column in enumerate(target_test.columns):
+{% if pipeline.task.is_multiclass == True %}
+    one_auc = roc_auc_score(target_test[column], y_prob[i], multi_class="ovr")
+{% else %}
+    one_auc = roc_auc_score(target_test[column], y_prob[i][:, 1])
+{% endif %}
+    auc_scores.append(one_auc)
+auc = np.mean(auc_scores)
+print('RESULT: Average AUC Score:', str(auc))
 {% elif (pipeline.adaptation_metric == macros.Metric.Accuracy.value) and (not pipeline.is_multi_class_multi_targets) %}
 from sklearn.metrics import accuracy_score
 
@@ -50,24 +62,47 @@ target_test = np.clip(target_test, 0, None)
 y_pred = np.clip(y_pred, 0, None)
 rmsle = np.sqrt(mean_squared_log_error(target_test, y_pred))
 print('RESULT: RMSLE:', str(rmsle))
-{% elif pipeline.adaptation_metric == macros.Metric.Gini.value %}
+{% elif pipeline.adaptation_metric == macros.Metric.Gini.value and not is_multioutput_classification%}
 from sklearn.metrics import roc_auc_score
 {% if pipeline.task.is_multiclass == True %}
-gini = 2 * roc_auc_score(target_test.values.ravel(), y_pred, multi_class="ovr") - 1
+gini = 2 * roc_auc_score(target_test.values.ravel(), y_prob, multi_class="ovr") - 1
 {% else %}
-gini = 2 * roc_auc_score(target_test, y_pred) - 1
+gini = 2 * roc_auc_score(target_test, y_prob) - 1
 {% endif %}
 print('RESULT: Gini: ' + str(gini))
+{% elif pipeline.adaptation_metric == macros.Metric.Gini.value and is_multioutput_classification%}
+from sklearn.metrics import roc_auc_score
+gini_scores = []
+for i, column in enumerate(target_test.columns):
+{% if pipeline.task.is_multiclass == True %}
+    one_auc = roc_auc_score(target_test[column], y_prob[i], multi_class="ovr")
+{% else %}
+    one_auc = roc_auc_score(target_test[column], y_prob[i][:, 1])
+{% endif %}
+    gini_score = 2 * one_auc - 1
+    gini_scores.append(gini_score)
+gini = np.mean(gini_scores)
+print('RESULT: Average Gini Score:', str(gini))
 {% elif pipeline.adaptation_metric == macros.Metric.MAE.value %}
 from sklearn.metrics import mean_absolute_error
 
 mae = mean_absolute_error(target_test, y_pred)
 print('RESULT: MAE:', str(mae))
-{% elif pipeline.adaptation_metric == macros.Metric.LogLoss.value %}
+{% elif pipeline.adaptation_metric == macros.Metric.LogLoss.value and not is_multioutput_classification%}
 from sklearn.metrics import log_loss
 
-log_loss = log_loss(target_test, y_pred)
+log_loss = log_loss(target_test, y_prob)
 print('RESULT: Log Loss:', str(log_loss))
+
+{% elif pipeline.adaptation_metric == macros.Metric.LogLoss.value and is_multioutput_classification%}
+from sklearn.metrics import log_loss
+
+log_loss_scores = []
+for i, column in enumerate(target_test.columns):
+    loss = log_loss(target_test[column], y_prob[i])
+    log_loss_scores.append(loss)
+avg_log_loss = np.mean(log_loss_scores)
+print('RESULT: Average Log Loss:', str(avg_log_loss))
 {% elif pipeline.adaptation_metric == macros.Metric.ROC_AUC.value %}
 from sklearn.metrics import roc_auc_score
 {% if pipeline.task.is_multiclass == True %}