Skip to content

Commit 862a53f

Browse files
Author: Agathe Guillemot (committed)
Commit message: Update tests + fixes found with the tests
1 parent 21217ba commit 862a53f

File tree

3 files changed

+57
-28
lines changed

3 files changed

+57
-28
lines changed

python-lib/dku_idtb_decision_tree/node.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ def get_decision_rule(self):
105105
rule += "{} ≤ ".format(self.beginning)
106106
rule += self.feature
107107
if self.end:
108-
rule += "< {}".format(self.end)
108+
rule += " < {}".format(self.end)
109109
return rule
110110

111111
def apply_filter(self, df, mean):

python-lib/dku_idtb_scoring/score.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def get_scored_df_schema(tree, schema, columns, output_probabilities, is_evaluat
3434
if check_prediction:
3535
_add_column('prediction_correct', 'boolean', schema, columns)
3636
_add_column('decision_rule', 'array', schema, columns)
37-
_add_column('node_id', 'int', schema, columns)
37+
_add_column('leaf_id', 'int', schema, columns)
3838
_add_column('label', 'string', schema, columns)
3939
return schema
4040

@@ -75,9 +75,8 @@ def add_scoring_columns(tree, df, output_probabilities, is_evaluation=False, che
7575
df.loc[filtered_df_indices, "prediction"] = leaf.prediction
7676
if check_prediction:
7777
df.loc[filtered_df_indices, "prediction_correct"] = filtered_df[tree.target] == leaf.prediction
78-
df.loc[label_indices, "label"] = leaf.label
7978

8079
filtered_df = tree.get_filtered_df(leaf, df)
8180
df.loc[filtered_df.index, "decision_rule"] = safe_str(tree.get_decision_rule(leaf_id))
82-
df.loc[filtered_df.index, "node_id"] = leaf_id
81+
df.loc[filtered_df.index, "leaf_id"] = leaf_id
8382
df.loc[filtered_df.index, "label"] = leaf.label

python-tests/test_score.py

Lines changed: 54 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -79,47 +79,77 @@ def get_input_df():
7979
def test_score():
8080
df = get_input_df()
8181
add_scoring_columns(tree, df, True)
82-
expected_df = pd.DataFrame([[.2, "u", "A", .8, .2, "A", "hello there"],
83-
[7, pd.np.nan, "B", pd.np.nan, pd.np.nan, pd.np.nan, "general Kenobi"],
84-
[4, "u", "A", .25, .75, "B", None],
85-
[3, "v", "A", .8, .2, "A", "hello there"],
86-
[pd.np.nan, "u", "C", .8, .2, "A", "hello there"]], columns=("num", "cat", "target", "proba_A", "proba_B", "prediction", "label"))
87-
assert df.equals(expected_df)
82+
expected_df = pd.DataFrame([
83+
[.2, "u", "A", .8, .2, "A", str(["num < 4"]), 1.0, "hello there"],
84+
[7, pd.np.nan, "B", pd.np.nan, pd.np.nan, pd.np.nan, str(["4 ≤ num", "cat not in {}".format(["u", "v"])]), 4.0, "general Kenobi"],
85+
[4, "u", "A", .25, .75, "B", str(["4 ≤ num", "cat in {}".format(["u", "v"])]), 3.0, None],
86+
[3, "v", "A", .8, .2, "A", str(["num < 4"]), 1.0, "hello there"],
87+
[pd.np.nan, "u", "C", .8, .2, "A", str(["num < 4"]), 1.0, "hello there"]
88+
], columns=("num", "cat", "target", "proba_A", "proba_B", "prediction", "decision_rule", "leaf_id", "label"))
89+
pd.testing.assert_frame_equal(df, expected_df)
8890

8991
df = get_input_df()
9092
add_scoring_columns(tree, df, False, True, False)
91-
expected_df = pd.DataFrame([[.2, "u", "A", "A", "hello there"],
92-
[7, pd.np.nan, "B", pd.np.nan, "general Kenobi"],
93-
[4, "u", "A", "B", None],
94-
[3, "v", "A", "A", "hello there"],
95-
[pd.np.nan, "u", "C", pd.np.nan, "hello there"]], columns=("num", "cat", "target", "prediction", "label"))
96-
assert df.equals(expected_df)
93+
expected_df = pd.DataFrame([
94+
[.2, "u", "A", "A", str(["num < 4"]), 1.0, "hello there"],
95+
[7, pd.np.nan, "B", pd.np.nan, str(["4 ≤ num", "cat not in {}".format(["u", "v"])]), 4.0, "general Kenobi"],
96+
[4, "u", "A", "B", str(["4 ≤ num", "cat in {}".format(["u", "v"])]), 3.0, None],
97+
[3, "v", "A", "A", str(["num < 4"]), 1.0, "hello there"],
98+
[pd.np.nan, "u", "C", pd.np.nan, str(["num < 4"]), 1.0, "hello there"]
99+
], columns=("num", "cat", "target", "prediction", "decision_rule", "leaf_id", "label"))
100+
pd.testing.assert_frame_equal(df, expected_df)
97101

98102
df = get_input_df()
99103
add_scoring_columns(tree, df, False, True, True)
100-
expected_df = pd.DataFrame([[.2, "u", "A", "A", True, "hello there"],
101-
[7, pd.np.nan, "B", pd.np.nan, pd.np.nan, "general Kenobi"],
102-
[4, "u", "A", "B", False, None],
103-
[3, "v", "A", "A", True, "hello there"],
104-
[pd.np.nan, "u", "C", pd.np.nan, pd.np.nan, "hello there"]], columns=("num", "cat", "target", "prediction", "prediction_correct", "label"))
105-
assert df.equals(expected_df)
104+
expected_df = pd.DataFrame([
105+
[.2, "u", "A", "A", True, str(["num < 4"]), 1.0, "hello there"],
106+
[7, pd.np.nan, "B", pd.np.nan, pd.np.nan, str(["4 ≤ num", "cat not in {}".format(["u", "v"])]), 4.0, "general Kenobi"],
107+
[4, "u", "A", "B", False, str(["4 ≤ num", "cat in {}".format(["u", "v"])]), 3.0, None],
108+
[3, "v", "A", "A", True, str(["num < 4"]), 1.0, "hello there"],
109+
[pd.np.nan, "u", "C", pd.np.nan, pd.np.nan, str(["num < 4"]), 1.0, "hello there"]
110+
], columns=("num", "cat", "target", "prediction", "prediction_correct", "decision_rule", "leaf_id", "label"))
111+
pd.testing.assert_frame_equal(df, expected_df)
106112

107113
def get_input_schema():
108114
return [{"type": "double", "name": "num"}, {"type": "string", "name": "cat"}, {"type": "string", "name": "target"}]
109115

110116
def test_scored_df_schema():
111117
schema = get_scored_df_schema(tree, get_input_schema(), None, True)
112-
assert schema == [{"type": "double", "name": "num"}, {"type": "string", "name": "cat"}, {"type": "string", "name": "target"},
113-
{"type": "double", "name": "proba_A"}, {"type": "double", "name": "proba_B"}, {"type": "string", "name": "prediction"}, {"type": "string", "name": "label"}]
118+
expected_schema = [
119+
{"type": "double", "name": "num"},
120+
{"type": "string", "name": "cat"},
121+
{"type": "string", "name": "target"},
122+
{"type": "double", "name": "proba_A"},
123+
{"type": "double", "name": "proba_B"},
124+
{"type": "string", "name": "prediction"},
125+
{"type": "array", "name": "decision_rule"},
126+
{"type": "int", "name": "leaf_id"},
127+
{"type": "string", "name": "label"}
128+
]
129+
assert schema == expected_schema
114130
columns = []
115131
schema = get_scored_df_schema(tree, get_input_schema(), columns, False, True, False)
116-
assert schema == [{"type": "string", "name": "prediction"}, {"type": "string", "name": "label"}]
117-
assert columns == ["prediction", "label"]
132+
expected_schema = [
133+
{"type": "string", "name": "prediction"},
134+
{"type": "array", "name": "decision_rule"},
135+
{"type": "int", "name": "leaf_id"},
136+
{"type": "string", "name": "label"}
137+
]
138+
assert schema == expected_schema
139+
assert columns == ["prediction", "decision_rule", "leaf_id", "label"]
118140

119141
columns = ["num"]
120142
schema = get_scored_df_schema(tree, get_input_schema(), columns, False, True, True)
121-
assert schema == [{"type": "double", "name": "num"}, {"type": "string", "name": "prediction"}, {"type": "boolean", "name": "prediction_correct"}, {"type": "string", "name": "label"}]
122-
assert columns == ["num", "prediction", "prediction_correct", "label"]
143+
expected_schema = [
144+
{"type": "double", "name": "num"},
145+
{"type": "string", "name": "prediction"},
146+
{"type": "boolean", "name": "prediction_correct"},
147+
{"type": "array", "name": "decision_rule"},
148+
{"type": "int", "name": "leaf_id"},
149+
{"type": "string", "name": "label"}
150+
]
151+
assert schema == expected_schema
152+
assert columns == ["num", "prediction", "prediction_correct", "decision_rule", "leaf_id", "label"]
123153

124154
schema_missing_feature = [{"type": "double", "name": "num"}, {"type": "string", "name": "target"}]
125155
schema_missing_target = [{"type": "double", "name": "num"}, {"type": "string", "name": "cat"}]

0 commit comments

Comments (0)