Merge pull request #11 from dataiku/release/1.0.7

AgatheG · web-flow · commit bd90cedc9cf8 · 2022-11-10T11:02:48.000+01:00
Release/1.0.7
diff --git a/python-lib/dku_idtb_decision_tree/node.py b/python-lib/dku_idtb_decision_tree/node.py
@@ -45,6 +45,9 @@ def set_node_info(self, samples, total_samples, probabilities, prediction):
     def get_type(self):
         raise NotImplementedError
 
+    def get_decision_rule(self):  # abstract hook; the CAT and NUM node classes in this file override it
+        raise NotImplementedError
+
     def rebuild(self, prediction, samples, probabilities):
         self.prediction = prediction
         self.samples = samples
@@ -67,6 +70,11 @@ def __init__(self, node_id, parent_id, treated_as_numerical, feature, values, ot
     def get_type(self):
         return Node.TYPES.CAT
 
+    def get_decision_rule(self):  # "<feature> in <values>", or "<feature> not in <values>" when self.others is truthy
+        return "{feature} {negation}in {values}".format(
+            feature=self.feature, negation="not " if self.others else "", values=self.values
+        )
+
     def apply_filter(self, df):
         if self.others:
             return df[~df[self.feature].isin(self.values)]
@@ -91,6 +99,15 @@ def __init__(self, node_id, parent_id, treated_as_numerical, feature, beginning=
     def get_type(self):
         return Node.TYPES.NUM
 
+    def get_decision_rule(self):  # renders "<beginning> ≤ <feature> < <end>", omitting any bound that is None
+        rule = ""
+        if self.beginning is not None:  # a lower bound of 0 is valid; same None test as apply_filter
+            rule += "{} ≤ ".format(self.beginning)
+        rule += self.feature
+        if self.end is not None:  # likewise, an upper bound of 0 must still be printed
+            rule += " < {}".format(self.end)
+        return rule
+
     def apply_filter(self, df, mean):
         if self.beginning is not None:
             df = df[df[self.feature].ge(self.beginning, fill_value=mean)]
diff --git a/python-lib/dku_idtb_decision_tree/tree.py b/python-lib/dku_idtb_decision_tree/tree.py
@@ -83,6 +83,14 @@ def add_node(self, node):
             parent_node.children_ids.append(node.id)
         super(ScoringTree, self).add_node(node)
 
+    def get_decision_rule(self, node_id):  # returns the rules on the path to node_id as a list, topmost ancestor first
+        rule = deque()
+        while node_id > 0:  # node 0 contributes no rule (presumably the root — confirm)
+            node = self.get_node(node_id)
+            rule.appendleft(node.get_decision_rule())  # prepend: the walk goes from the node up to its ancestors
+            node_id = node.parent_id
+        return list(rule)
+
 #Used by the webapp
 class InteractiveTree(Tree):
     """
diff --git a/python-lib/dku_idtb_scoring/score.py b/python-lib/dku_idtb_scoring/score.py
@@ -18,25 +18,24 @@ def update_input_schema(input_schema, columns):
             new_input_schema.append(column)
     return new_input_schema
 
+def _add_column(name, type, schema, columns=None):  # NOTE: `type` shadows the builtin inside this helper
+    schema.append({'type': type, 'name': name})  # mutates the caller's schema list in place
+    if columns is not None:  # columns is optional; when given, it also receives the new column name
+        columns.append(name)
+
 def get_scored_df_schema(tree, schema, columns, output_probabilities, is_evaluation=False, check_prediction=False):
     check_input_schema(tree, set(column["name"] for column in schema), is_evaluation)
     if columns is not None:
         schema = update_input_schema(schema, columns)
     if output_probabilities:
         for value in tree.target_values:
-            schema.append({'type': 'double', 'name': "proba_" + safe_str(value)})
-            if columns is not None:
-                columns.append("proba_"+safe_str(value))
-    schema.append({'type': 'string', 'name': 'prediction'})
-    if columns is not None:
-        columns.append("prediction")
+            _add_column('proba_' + safe_str(value), 'double', schema, columns)  # one probability column per target value
+    _add_column('prediction', 'string', schema, columns)
     if check_prediction:
-        schema.append({'type': 'boolean', 'name': 'prediction_correct'})
-        if columns is not None:
-            columns.append("prediction_correct")
-    schema.append({'type': 'string', 'name': 'label'})
-    if columns is not None:
-        columns.append("label")
+        _add_column('prediction_correct', 'boolean', schema, columns)
+    _add_column('decision_rule', 'array', schema, columns)  # NOTE(review): add_scoring_columns writes safe_str(list) here — confirm that text is valid for an 'array' column
+    _add_column('leaf_id', 'int', schema, columns)  # NOTE(review): tests expect float leaf ids (1.0, 4.0) in the dataframe — confirm 'int' is intended
+    _add_column('label', 'string', schema, columns)  # decision_rule, leaf_id and label are appended unconditionally
     return schema
 
 def get_metric_df_schema(metrics_dict, metrics, recipe_config):
@@ -76,8 +75,8 @@ def add_scoring_columns(tree, df, output_probabilities, is_evaluation=False, che
             df.loc[filtered_df_indices, "prediction"] = leaf.prediction
             if check_prediction:
                 df.loc[filtered_df_indices, "prediction_correct"] = filtered_df[tree.target] == leaf.prediction
-            df.loc[label_indices, "label"] = leaf.label
 
-        elif leaf.label is not None:
-            filtered_df = tree.get_filtered_df(leaf, df)
-            df.loc[filtered_df.index, "label"] = leaf.label
+        filtered_df = tree.get_filtered_df(leaf, df)
+        df.loc[filtered_df.index, "decision_rule"] = safe_str(tree.get_decision_rule(leaf_id))
+        df.loc[filtered_df.index, "leaf_id"] = leaf_id
+        df.loc[filtered_df.index, "label"] = leaf.label
diff --git a/python-tests/test_score.py b/python-tests/test_score.py
@@ -79,47 +79,77 @@ def get_input_df():
 def test_score():
 	df = get_input_df()
 	add_scoring_columns(tree, df, True)
-	expected_df = pd.DataFrame([[.2, "u", "A", .8, .2, "A", "hello there"],
-								[7, pd.np.nan, "B", pd.np.nan, pd.np.nan, pd.np.nan, "general Kenobi"],
-								[4, "u", "A", .25, .75, "B", None],
-								[3, "v", "A", .8, .2, "A", "hello there"],
-								[pd.np.nan, "u", "C", .8, .2, "A", "hello there"]], columns=("num", "cat", "target", "proba_A", "proba_B", "prediction", "label"))
-	assert df.equals(expected_df)
+	expected_df = pd.DataFrame([
+		[.2, "u", "A", .8, .2, "A", str(["num < 4"]), 1.0, "hello there"],
+		[7, pd.np.nan, "B", pd.np.nan, pd.np.nan, pd.np.nan, str(["4 ≤ num", "cat not in {}".format(["u", "v"])]), 4.0, "general Kenobi"],
+		[4, "u", "A", .25, .75, "B", str(["4 ≤ num", "cat in {}".format(["u", "v"])]), 3.0, None],
+		[3, "v", "A", .8, .2, "A", str(["num < 4"]), 1.0, "hello there"],
+		[pd.np.nan, "u", "C", .8, .2, "A", str(["num < 4"]), 1.0, "hello there"]
+	], columns=("num", "cat", "target", "proba_A", "proba_B", "prediction", "decision_rule", "leaf_id", "label"))
+	pd.testing.assert_frame_equal(df, expected_df)
 
 	df = get_input_df()
 	add_scoring_columns(tree, df, False, True, False)
-	expected_df = pd.DataFrame([[.2, "u", "A", "A", "hello there"],
-								[7, pd.np.nan, "B", pd.np.nan, "general Kenobi"],
-								[4, "u", "A", "B", None],
-								[3, "v", "A", "A", "hello there"],
-								[pd.np.nan, "u", "C", pd.np.nan, "hello there"]], columns=("num", "cat", "target", "prediction", "label"))
-	assert df.equals(expected_df)
+	expected_df = pd.DataFrame([
+		[.2, "u", "A", "A", str(["num < 4"]), 1.0, "hello there"],
+		[7, pd.np.nan, "B", pd.np.nan, str(["4 ≤ num", "cat not in {}".format(["u", "v"])]), 4.0, "general Kenobi"],
+		[4, "u", "A", "B", str(["4 ≤ num", "cat in {}".format(["u", "v"])]), 3.0, None],
+		[3, "v", "A", "A", str(["num < 4"]), 1.0, "hello there"],
+		[pd.np.nan, "u", "C", pd.np.nan, str(["num < 4"]), 1.0, "hello there"]
+	], columns=("num", "cat", "target", "prediction", "decision_rule", "leaf_id", "label"))
+	pd.testing.assert_frame_equal(df, expected_df)
 
 	df = get_input_df()
 	add_scoring_columns(tree, df, False, True, True)
-	expected_df = pd.DataFrame([[.2, "u", "A", "A", True, "hello there"],
-								[7, pd.np.nan, "B", pd.np.nan, pd.np.nan, "general Kenobi"],
-								[4, "u", "A", "B", False, None],
-								[3, "v", "A", "A", True, "hello there"],
-								[pd.np.nan, "u", "C", pd.np.nan, pd.np.nan, "hello there"]], columns=("num", "cat", "target", "prediction", "prediction_correct", "label"))
-	assert df.equals(expected_df)
+	expected_df = pd.DataFrame([
+		[.2, "u", "A", "A", True, str(["num < 4"]), 1.0, "hello there"],
+		[7, pd.np.nan, "B", pd.np.nan, pd.np.nan, str(["4 ≤ num", "cat not in {}".format(["u", "v"])]), 4.0, "general Kenobi"],
+		[4, "u", "A", "B", False, str(["4 ≤ num", "cat in {}".format(["u", "v"])]), 3.0, None],
+		[3, "v", "A", "A", True, str(["num < 4"]), 1.0, "hello there"],
+		[pd.np.nan, "u", "C", pd.np.nan, pd.np.nan, str(["num < 4"]), 1.0, "hello there"]
+	], columns=("num", "cat", "target", "prediction", "prediction_correct", "decision_rule", "leaf_id", "label"))
+	pd.testing.assert_frame_equal(df, expected_df)
 
 def get_input_schema():
 	return [{"type": "double", "name": "num"}, {"type": "string", "name": "cat"}, {"type": "string", "name": "target"}]
 
 def test_scored_df_schema():
 	schema = get_scored_df_schema(tree, get_input_schema(), None, True)
-	assert schema == [{"type": "double", "name": "num"}, {"type": "string", "name": "cat"}, {"type": "string", "name": "target"},
-					{"type": "double", "name": "proba_A"}, {"type": "double", "name": "proba_B"}, {"type": "string", "name": "prediction"}, {"type": "string", "name": "label"}]
+	expected_schema = [
+		{"type": "double", "name": "num"},
+		{"type": "string", "name": "cat"},
+		{"type": "string", "name": "target"},
+		{"type": "double", "name": "proba_A"},
+		{"type": "double", "name": "proba_B"},
+		{"type": "string", "name": "prediction"},
+		{"type": "array", "name": "decision_rule"},
+		{"type": "int", "name": "leaf_id"},
+		{"type": "string", "name": "label"}
+	]
+	assert schema == expected_schema
 	columns = []
 	schema = get_scored_df_schema(tree, get_input_schema(), columns, False, True, False)
-	assert schema == [{"type": "string", "name": "prediction"}, {"type": "string", "name": "label"}]
-	assert columns == ["prediction", "label"]
+	expected_schema = [
+		{"type": "string", "name": "prediction"},
+		{"type": "array", "name": "decision_rule"},
+		{"type": "int", "name": "leaf_id"},
+		{"type": "string", "name": "label"}
+	]
+	assert schema == expected_schema
+	assert columns == ["prediction", "decision_rule", "leaf_id", "label"]
 
 	columns = ["num"]
 	schema = get_scored_df_schema(tree, get_input_schema(), columns, False, True, True)
-	assert schema == [{"type": "double", "name": "num"}, {"type": "string", "name": "prediction"}, {"type": "boolean", "name": "prediction_correct"}, {"type": "string", "name": "label"}]
-	assert columns == ["num", "prediction", "prediction_correct", "label"]
+	expected_schema = [
+		{"type": "double", "name": "num"},
+		{"type": "string", "name": "prediction"},
+		{"type": "boolean", "name": "prediction_correct"},
+		{"type": "array", "name": "decision_rule"},
+		{"type": "int", "name": "leaf_id"},
+		{"type": "string", "name": "label"}
+	]
+	assert schema == expected_schema
+	assert columns == ["num", "prediction", "prediction_correct", "decision_rule", "leaf_id", "label"]
 
 	schema_missing_feature = [{"type": "double", "name": "num"}, {"type": "string", "name": "target"}]
 	schema_missing_target = [{"type": "double", "name": "num"}, {"type": "string", "name": "cat"}]
diff --git a/resource/templates/edit.html b/resource/templates/edit.html
@@ -68,7 +68,7 @@
         <div class="selected-node-panel" ng-if="!loadingTree">
             <div class="node-info">
                 <div class="header">
-                    <div class="dku-title">Selected node</div>
+                    <div class="dku-title">Node {{ selectedNode.id }}</div>
                 </div>
                 <div class="node-info-sections">
                     <div class="section-tall">