From 5121800c66c20b3b2a4c2620773d00ae551845bf Mon Sep 17 00:00:00 2001
From: tashiro akira <fj1755jk@fujitsu.com>
Date: Tue, 24 Oct 2023 10:40:45 +0900
Subject: [PATCH 1/2] Fixed an error when there were many missing bool columns
 in the input data

Signed-off-by: tashiro akira <fj1755jk@fujitsu.com>
---
 .../preprocessing_templates/fillna-type-string.py.jinja         | 2 ++
 .../preprocessing_templates/fillna-type-string_predict.py.jinja | 1 +
 .../preprocessing_templates/fillna-type-string_train.py.jinja   | 1 +
 3 files changed, 4 insertions(+)

diff --git a/sapientml_core/templates/preprocessing_templates/fillna-type-string.py.jinja b/sapientml_core/templates/preprocessing_templates/fillna-type-string.py.jinja
index 5a8da92..5d0787b 100644
--- a/sapientml_core/templates/preprocessing_templates/fillna-type-string.py.jinja
+++ b/sapientml_core/templates/preprocessing_templates/fillna-type-string.py.jinja
@@ -9,6 +9,8 @@ simple_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
 {% endif %}
 {% if cols_almost_missing_string %}
 STRING_ALMOST_MISSING_COLS = {{ cols_almost_missing_string }}
+{{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str).where({{ train_dataset }}[STRING_ALMOST_MISSING_COLS].notna(), '')  # astype(str) alone would turn NaN into the literal 'nan'
+{{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str).where({{ test_dataset }}[STRING_ALMOST_MISSING_COLS].notna(), '')  # keep missing cells as '' so they match the fillna('') contract
 {{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('')
 {{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('')
 {% endif %}
\ No newline at end of file
diff --git a/sapientml_core/templates/preprocessing_templates/fillna-type-string_predict.py.jinja b/sapientml_core/templates/preprocessing_templates/fillna-type-string_predict.py.jinja
index ef58ba7..472ec11 100644
--- a/sapientml_core/templates/preprocessing_templates/fillna-type-string_predict.py.jinja
+++ b/sapientml_core/templates/preprocessing_templates/fillna-type-string_predict.py.jinja
@@ -7,5 +7,6 @@ STRING_COLS_WITH_MISSING_VALUES = {{ columns }}
 {% endif %}
 {% if cols_almost_missing_string %}
 STRING_ALMOST_MISSING_COLS = {{ cols_almost_missing_string }}
+{{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str).where({{ test_dataset }}[STRING_ALMOST_MISSING_COLS].notna(), '')  # astype(str) alone would turn NaN into the literal 'nan'
 {{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('')
 {% endif %}
\ No newline at end of file
diff --git a/sapientml_core/templates/preprocessing_templates/fillna-type-string_train.py.jinja b/sapientml_core/templates/preprocessing_templates/fillna-type-string_train.py.jinja
index 404804b..81d5621 100644
--- a/sapientml_core/templates/preprocessing_templates/fillna-type-string_train.py.jinja
+++ b/sapientml_core/templates/preprocessing_templates/fillna-type-string_train.py.jinja
@@ -11,5 +11,6 @@ with open('simpleimputer-string.pkl', 'wb') as f:
 {% endif %}
 {% if cols_almost_missing_string %}
 STRING_ALMOST_MISSING_COLS = {{ cols_almost_missing_string }}
+{{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str).where({{ train_dataset }}[STRING_ALMOST_MISSING_COLS].notna(), '')  # astype(str) alone would turn NaN into the literal 'nan'
 {{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('')
 {% endif %}
\ No newline at end of file

From fc8a2d5a1486372e9f863c569ee228d6ded99312 Mon Sep 17 00:00:00 2001
From: tashiro-akira <fj0822cr@fujitsu.com>
Date: Fri, 28 Jun 2024 15:46:36 +0900
Subject: [PATCH 2/2] fix: Fixed processing when setting auc, logloss, and gini
 for calculation

---
 .../generation/pipeline_template.py           |  1 -
 .../classification_post_process.jinja         | 10 ++--
 .../other_templates/evaluation.py.jinja       | 51 ++++++++++++++++---
 3 files changed, 49 insertions(+), 13 deletions(-)

diff --git a/sapientml_core/adaptation/generation/pipeline_template.py b/sapientml_core/adaptation/generation/pipeline_template.py
index 84926fc..30a5f05 100644
--- a/sapientml_core/adaptation/generation/pipeline_template.py
+++ b/sapientml_core/adaptation/generation/pipeline_template.py
@@ -547,7 +547,6 @@ def populate_model(self):
             or pipeline.adaptation_metric.startswith(macros.Metric.MAP_K.value)
             or pipeline.config.predict_option == macros.PRED_PROBABILITY
         ):
-            snippet = snippet.replace("predict", "predict_proba")
             tpl = env.get_template("model_templates/classification_post_process.jinja")
             snippet += "\n" + self._render(tpl, pipeline=pipeline)
 
diff --git a/sapientml_core/templates/model_templates/classification_post_process.jinja b/sapientml_core/templates/model_templates/classification_post_process.jinja
index 7e25fe3..4e72780 100644
--- a/sapientml_core/templates/model_templates/classification_post_process.jinja
+++ b/sapientml_core/templates/model_templates/classification_post_process.jinja
@@ -1,8 +1,10 @@
+y_prob = model.predict_proba(feature_test)
+
 # POST PROCESSING
 {% if pipeline.adaptation_metric.startswith("MAP_") %}
-y_pred_sorted_index = pd.DataFrame(np.argsort(-y_pred))
-y_pred = y_pred_sorted_index.apply(lambda x: model.classes_[x]).to_numpy()
+y_prob_sorted_index = pd.DataFrame(np.argsort(-y_prob))
+y_prob = y_prob_sorted_index.apply(lambda x: model.classes_[x]).to_numpy()
 {% else %}
-if np.shape(y_pred)[1] == 2:
-    y_pred = y_pred[:, 1]
+if not isinstance(y_prob, list) and np.shape(y_prob)[1] == 2:  # a list is multi-output predict_proba (one array per target); leave it whole for per-target evaluation
+    y_prob = y_prob[:, 1]
 {% endif %}
\ No newline at end of file
diff --git a/sapientml_core/templates/other_templates/evaluation.py.jinja b/sapientml_core/templates/other_templates/evaluation.py.jinja
index 151ce72..5091078 100644
--- a/sapientml_core/templates/other_templates/evaluation.py.jinja
+++ b/sapientml_core/templates/other_templates/evaluation.py.jinja
@@ -1,11 +1,23 @@
-{% if pipeline.adaptation_metric == macros.Metric.AUC.value %}
+{% if pipeline.adaptation_metric == macros.Metric.AUC.value and not is_multioutput_classification%}
 from sklearn.metrics import roc_auc_score
 {% if pipeline.task.is_multiclass == True %}
-auc = roc_auc_score(target_test.values.ravel(), y_pred, multi_class="ovr")
+auc = roc_auc_score(target_test.values.ravel(), y_prob, multi_class="ovr")
 {% else %}
-auc = roc_auc_score(target_test, y_pred)
+auc = roc_auc_score(target_test, y_prob)
 {% endif %}
 print('RESULT: AUC Score: ' + str(auc))
+{% elif pipeline.adaptation_metric == macros.Metric.AUC.value and is_multioutput_classification %}
+from sklearn.metrics import roc_auc_score
+auc_scores = []
+for i, column in enumerate(target_test.columns):  # y_prob[i] is scored against the i-th target column
+{% if pipeline.task.is_multiclass == True %}
+    one_auc = roc_auc_score(target_test[column], y_prob[i], multi_class="ovr")
+{% else %}
+    one_auc = roc_auc_score(target_test[column], y_prob[i][:, 1])
+{% endif %}
+    auc_scores.append(one_auc)
+auc = np.mean(auc_scores)  # unweighted mean of the per-column AUCs
+print('RESULT: Average AUC Score:', str(auc))
 {% elif (pipeline.adaptation_metric == macros.Metric.Accuracy.value) and (not pipeline.is_multi_class_multi_targets) %}
 from sklearn.metrics import accuracy_score
 
@@ -50,24 +62,47 @@ target_test = np.clip(target_test, 0, None)
 y_pred = np.clip(y_pred, 0, None)
 rmsle = np.sqrt(mean_squared_log_error(target_test, y_pred))
 print('RESULT: RMSLE:', str(rmsle))
-{% elif pipeline.adaptation_metric == macros.Metric.Gini.value %}
+{% elif pipeline.adaptation_metric == macros.Metric.Gini.value and not is_multioutput_classification%}
 from sklearn.metrics import roc_auc_score
 {% if pipeline.task.is_multiclass == True %}
-gini = 2 * roc_auc_score(target_test.values.ravel(), y_pred, multi_class="ovr") - 1
+gini = 2 * roc_auc_score(target_test.values.ravel(), y_prob, multi_class="ovr") - 1
 {% else %}
-gini = 2 * roc_auc_score(target_test, y_pred) - 1
+gini = 2 * roc_auc_score(target_test, y_prob) - 1
 {% endif %}
 print('RESULT: Gini: ' + str(gini))
+{% elif pipeline.adaptation_metric == macros.Metric.Gini.value and is_multioutput_classification %}
+from sklearn.metrics import roc_auc_score
+gini_scores = []
+for i, column in enumerate(target_test.columns):  # y_prob[i] is scored against the i-th target column
+{% if pipeline.task.is_multiclass == True %}
+    one_auc = roc_auc_score(target_test[column], y_prob[i], multi_class="ovr")
+{% else %}
+    one_auc = roc_auc_score(target_test[column], y_prob[i][:, 1])
+{% endif %}
+    gini_score = 2 * one_auc - 1  # Gini derived from the per-column AUC
+    gini_scores.append(gini_score)
+gini = np.mean(gini_scores)  # unweighted mean of the per-column Gini values
+print('RESULT: Average Gini Score:', str(gini))
 {% elif pipeline.adaptation_metric == macros.Metric.MAE.value %}
 from sklearn.metrics import mean_absolute_error
 
 mae = mean_absolute_error(target_test, y_pred)
 print('RESULT: MAE:', str(mae))
-{% elif pipeline.adaptation_metric == macros.Metric.LogLoss.value %}
+{% elif pipeline.adaptation_metric == macros.Metric.LogLoss.value and not is_multioutput_classification%}
 from sklearn.metrics import log_loss
 
-log_loss = log_loss(target_test, y_pred)
+log_loss = log_loss(target_test, y_prob)
 print('RESULT: Log Loss:', str(log_loss))
+
+{% elif pipeline.adaptation_metric == macros.Metric.LogLoss.value and is_multioutput_classification%}
+from sklearn.metrics import log_loss
+
+log_loss_scores = []  # one log loss per target column
+for i, column in enumerate(target_test.columns):  # y_prob[i] is scored against the i-th target column
+    loss = log_loss(target_test[column], y_prob[i])
+    log_loss_scores.append(loss)
+avg_log_loss = np.mean(log_loss_scores)  # unweighted mean over the target columns
+print('RESULT: Average Log Loss:', str(avg_log_loss))
 {% elif pipeline.adaptation_metric == macros.Metric.ROC_AUC.value %}
 from sklearn.metrics import roc_auc_score
 {% if pipeline.task.is_multiclass == True %}