Skip to content

Commit bd90ced

Browse files
author
AgatheG
authored
Merge pull request #11 from dataiku/release/1.0.7
Release/1.0.7
2 parents 43c2318 + e6a3572 commit bd90ced

File tree

5 files changed

+95
-41
lines changed

5 files changed

+95
-41
lines changed

python-lib/dku_idtb_decision_tree/node.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,9 @@ def set_node_info(self, samples, total_samples, probabilities, prediction):
4545
def get_type(self):
    """Placeholder that always raises NotImplementedError.

    NOTE(review): presumably overridden by the concrete node classes, which
    return a ``Node.TYPES`` member -- confirm against the full class.
    """
    raise NotImplementedError()
4747

48+
def get_decision_rule(self):
    """Placeholder that always raises NotImplementedError.

    NOTE(review): presumably overridden by the concrete node classes, which
    return a human-readable rule string -- confirm against the full class.
    """
    raise NotImplementedError()
50+
4851
def rebuild(self, prediction, samples, probabilities):
4952
self.prediction = prediction
5053
self.samples = samples
@@ -67,6 +70,11 @@ def __init__(self, node_id, parent_id, treated_as_numerical, feature, values, ot
6770
def get_type(self):
    """Return the type tag of this node, ``Node.TYPES.CAT``."""
    node_types = Node.TYPES
    return node_types.CAT
6972

73+
def get_decision_rule(self):
    """Describe this split as "<feature> in <values>".

    When ``self.others`` is truthy the rule is negated and reads
    "<feature> not in <values>". ``self.values`` is rendered with its
    default string formatting.
    """
    if self.others:
        negation = "not "
    else:
        negation = ""
    return "{feature} {negation}in {values}".format(
        feature=self.feature,
        negation=negation,
        values=self.values,
    )
77+
7078
def apply_filter(self, df):
7179
if self.others:
7280
return df[~df[self.feature].isin(self.values)]
@@ -91,6 +99,15 @@ def __init__(self, node_id, parent_id, treated_as_numerical, feature, beginning=
9199
def get_type(self):
    """Return the type tag of this node, ``Node.TYPES.NUM``."""
    node_types = Node.TYPES
    return node_types.NUM
93101

102+
def get_decision_rule(self):
    """Return this numerical split as a readable interval on the feature.

    The rule reads "<beginning> ≤ <feature> < <end>". A side is omitted when
    the matching bound is ``None``, so the possible shapes are
    "<beginning> ≤ <feature>", "<feature> < <end>", both combined, or just
    the feature name when neither bound is set.

    Returns:
        str: the formatted rule.
    """
    rule = ""
    # Test against None instead of truthiness: 0 (or 0.0) is a legitimate
    # bound and must still appear in the rule, matching how apply_filter
    # decides whether a bound is set.
    if self.beginning is not None:
        rule += "{} ≤ ".format(self.beginning)
    rule += self.feature
    if self.end is not None:
        rule += " < {}".format(self.end)
    return rule
110+
94111
def apply_filter(self, df, mean):
95112
if self.beginning is not None:
96113
df = df[df[self.feature].ge(self.beginning, fill_value=mean)]

python-lib/dku_idtb_decision_tree/tree.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,14 @@ def add_node(self, node):
8383
parent_node.children_ids.append(node.id)
8484
super(ScoringTree, self).add_node(node)
8585

86+
def get_decision_rule(self, node_id):
    """Return the decision rules on the path that leads to ``node_id``.

    Starting from ``node_id``, follows ``parent_id`` links for as long as the
    current id is strictly positive and collects each visited node's own
    rule. The result is ordered from the topmost visited node down to
    ``node_id``; an id that is not positive yields an empty list.
    """
    bottom_up = []
    current_id = node_id
    while current_id > 0:
        current = self.get_node(current_id)
        bottom_up.append(current.get_decision_rule())
        current_id = current.parent_id
    # Rules were gathered while climbing, so flip them into top-down order.
    bottom_up.reverse()
    return bottom_up
93+
8694
#Used by the webapp
8795
class InteractiveTree(Tree):
8896
"""

python-lib/dku_idtb_scoring/score.py

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -18,25 +18,24 @@ def update_input_schema(input_schema, columns):
1818
new_input_schema.append(column)
1919
return new_input_schema
2020

21+
def _add_column(name, type, schema, columns=None):
22+
schema.append({'type': type, 'name': name})
23+
if columns is not None:
24+
columns.append(name)
25+
2126
def get_scored_df_schema(tree, schema, columns, output_probabilities, is_evaluation=False, check_prediction=False):
2227
check_input_schema(tree, set(column["name"] for column in schema), is_evaluation)
2328
if columns is not None:
2429
schema = update_input_schema(schema, columns)
2530
if output_probabilities:
2631
for value in tree.target_values:
27-
schema.append({'type': 'double', 'name': "proba_" + safe_str(value)})
28-
if columns is not None:
29-
columns.append("proba_"+safe_str(value))
30-
schema.append({'type': 'string', 'name': 'prediction'})
31-
if columns is not None:
32-
columns.append("prediction")
32+
_add_column('proba_' + safe_str(value), 'double', schema, columns)
33+
_add_column('prediction', 'string', schema, columns)
3334
if check_prediction:
34-
schema.append({'type': 'boolean', 'name': 'prediction_correct'})
35-
if columns is not None:
36-
columns.append("prediction_correct")
37-
schema.append({'type': 'string', 'name': 'label'})
38-
if columns is not None:
39-
columns.append("label")
35+
_add_column('prediction_correct', 'boolean', schema, columns)
36+
_add_column('decision_rule', 'array', schema, columns)
37+
_add_column('leaf_id', 'int', schema, columns)
38+
_add_column('label', 'string', schema, columns)
4039
return schema
4140

4241
def get_metric_df_schema(metrics_dict, metrics, recipe_config):
@@ -76,8 +75,8 @@ def add_scoring_columns(tree, df, output_probabilities, is_evaluation=False, che
7675
df.loc[filtered_df_indices, "prediction"] = leaf.prediction
7776
if check_prediction:
7877
df.loc[filtered_df_indices, "prediction_correct"] = filtered_df[tree.target] == leaf.prediction
79-
df.loc[label_indices, "label"] = leaf.label
8078

81-
elif leaf.label is not None:
82-
filtered_df = tree.get_filtered_df(leaf, df)
83-
df.loc[filtered_df.index, "label"] = leaf.label
79+
filtered_df = tree.get_filtered_df(leaf, df)
80+
df.loc[filtered_df.index, "decision_rule"] = safe_str(tree.get_decision_rule(leaf_id))
81+
df.loc[filtered_df.index, "leaf_id"] = leaf_id
82+
df.loc[filtered_df.index, "label"] = leaf.label

python-tests/test_score.py

Lines changed: 54 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -79,47 +79,77 @@ def get_input_df():
7979
def test_score():
8080
df = get_input_df()
8181
add_scoring_columns(tree, df, True)
82-
expected_df = pd.DataFrame([[.2, "u", "A", .8, .2, "A", "hello there"],
83-
[7, pd.np.nan, "B", pd.np.nan, pd.np.nan, pd.np.nan, "general Kenobi"],
84-
[4, "u", "A", .25, .75, "B", None],
85-
[3, "v", "A", .8, .2, "A", "hello there"],
86-
[pd.np.nan, "u", "C", .8, .2, "A", "hello there"]], columns=("num", "cat", "target", "proba_A", "proba_B", "prediction", "label"))
87-
assert df.equals(expected_df)
82+
expected_df = pd.DataFrame([
83+
[.2, "u", "A", .8, .2, "A", str(["num < 4"]), 1.0, "hello there"],
84+
[7, pd.np.nan, "B", pd.np.nan, pd.np.nan, pd.np.nan, str(["4 ≤ num", "cat not in {}".format(["u", "v"])]), 4.0, "general Kenobi"],
85+
[4, "u", "A", .25, .75, "B", str(["4 ≤ num", "cat in {}".format(["u", "v"])]), 3.0, None],
86+
[3, "v", "A", .8, .2, "A", str(["num < 4"]), 1.0, "hello there"],
87+
[pd.np.nan, "u", "C", .8, .2, "A", str(["num < 4"]), 1.0, "hello there"]
88+
], columns=("num", "cat", "target", "proba_A", "proba_B", "prediction", "decision_rule", "leaf_id", "label"))
89+
pd.testing.assert_frame_equal(df, expected_df)
8890

8991
df = get_input_df()
9092
add_scoring_columns(tree, df, False, True, False)
91-
expected_df = pd.DataFrame([[.2, "u", "A", "A", "hello there"],
92-
[7, pd.np.nan, "B", pd.np.nan, "general Kenobi"],
93-
[4, "u", "A", "B", None],
94-
[3, "v", "A", "A", "hello there"],
95-
[pd.np.nan, "u", "C", pd.np.nan, "hello there"]], columns=("num", "cat", "target", "prediction", "label"))
96-
assert df.equals(expected_df)
93+
expected_df = pd.DataFrame([
94+
[.2, "u", "A", "A", str(["num < 4"]), 1.0, "hello there"],
95+
[7, pd.np.nan, "B", pd.np.nan, str(["4 ≤ num", "cat not in {}".format(["u", "v"])]), 4.0, "general Kenobi"],
96+
[4, "u", "A", "B", str(["4 ≤ num", "cat in {}".format(["u", "v"])]), 3.0, None],
97+
[3, "v", "A", "A", str(["num < 4"]), 1.0, "hello there"],
98+
[pd.np.nan, "u", "C", pd.np.nan, str(["num < 4"]), 1.0, "hello there"]
99+
], columns=("num", "cat", "target", "prediction", "decision_rule", "leaf_id", "label"))
100+
pd.testing.assert_frame_equal(df, expected_df)
97101

98102
df = get_input_df()
99103
add_scoring_columns(tree, df, False, True, True)
100-
expected_df = pd.DataFrame([[.2, "u", "A", "A", True, "hello there"],
101-
[7, pd.np.nan, "B", pd.np.nan, pd.np.nan, "general Kenobi"],
102-
[4, "u", "A", "B", False, None],
103-
[3, "v", "A", "A", True, "hello there"],
104-
[pd.np.nan, "u", "C", pd.np.nan, pd.np.nan, "hello there"]], columns=("num", "cat", "target", "prediction", "prediction_correct", "label"))
105-
assert df.equals(expected_df)
104+
expected_df = pd.DataFrame([
105+
[.2, "u", "A", "A", True, str(["num < 4"]), 1.0, "hello there"],
106+
[7, pd.np.nan, "B", pd.np.nan, pd.np.nan, str(["4 ≤ num", "cat not in {}".format(["u", "v"])]), 4.0, "general Kenobi"],
107+
[4, "u", "A", "B", False, str(["4 ≤ num", "cat in {}".format(["u", "v"])]), 3.0, None],
108+
[3, "v", "A", "A", True, str(["num < 4"]), 1.0, "hello there"],
109+
[pd.np.nan, "u", "C", pd.np.nan, pd.np.nan, str(["num < 4"]), 1.0, "hello there"]
110+
], columns=("num", "cat", "target", "prediction", "prediction_correct", "decision_rule", "leaf_id", "label"))
111+
pd.testing.assert_frame_equal(df, expected_df)
106112

107113
def get_input_schema():
    """Build a fresh schema list for the columns num, cat and target."""
    typed_columns = (("num", "double"), ("cat", "string"), ("target", "string"))
    return [
        {"type": column_type, "name": column_name}
        for column_name, column_type in typed_columns
    ]
109115

110116
def test_scored_df_schema():
111117
schema = get_scored_df_schema(tree, get_input_schema(), None, True)
112-
assert schema == [{"type": "double", "name": "num"}, {"type": "string", "name": "cat"}, {"type": "string", "name": "target"},
113-
{"type": "double", "name": "proba_A"}, {"type": "double", "name": "proba_B"}, {"type": "string", "name": "prediction"}, {"type": "string", "name": "label"}]
118+
expected_schema = [
119+
{"type": "double", "name": "num"},
120+
{"type": "string", "name": "cat"},
121+
{"type": "string", "name": "target"},
122+
{"type": "double", "name": "proba_A"},
123+
{"type": "double", "name": "proba_B"},
124+
{"type": "string", "name": "prediction"},
125+
{"type": "array", "name": "decision_rule"},
126+
{"type": "int", "name": "leaf_id"},
127+
{"type": "string", "name": "label"}
128+
]
129+
assert schema == expected_schema
114130
columns = []
115131
schema = get_scored_df_schema(tree, get_input_schema(), columns, False, True, False)
116-
assert schema == [{"type": "string", "name": "prediction"}, {"type": "string", "name": "label"}]
117-
assert columns == ["prediction", "label"]
132+
expected_schema = [
133+
{"type": "string", "name": "prediction"},
134+
{"type": "array", "name": "decision_rule"},
135+
{"type": "int", "name": "leaf_id"},
136+
{"type": "string", "name": "label"}
137+
]
138+
assert schema == expected_schema
139+
assert columns == ["prediction", "decision_rule", "leaf_id", "label"]
118140

119141
columns = ["num"]
120142
schema = get_scored_df_schema(tree, get_input_schema(), columns, False, True, True)
121-
assert schema == [{"type": "double", "name": "num"}, {"type": "string", "name": "prediction"}, {"type": "boolean", "name": "prediction_correct"}, {"type": "string", "name": "label"}]
122-
assert columns == ["num", "prediction", "prediction_correct", "label"]
143+
expected_schema = [
144+
{"type": "double", "name": "num"},
145+
{"type": "string", "name": "prediction"},
146+
{"type": "boolean", "name": "prediction_correct"},
147+
{"type": "array", "name": "decision_rule"},
148+
{"type": "int", "name": "leaf_id"},
149+
{"type": "string", "name": "label"}
150+
]
151+
assert schema == expected_schema
152+
assert columns == ["num", "prediction", "prediction_correct", "decision_rule", "leaf_id", "label"]
123153

124154
schema_missing_feature = [{"type": "double", "name": "num"}, {"type": "string", "name": "target"}]
125155
schema_missing_target = [{"type": "double", "name": "num"}, {"type": "string", "name": "cat"}]

resource/templates/edit.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@
6868
<div class="selected-node-panel" ng-if="!loadingTree">
6969
<div class="node-info">
7070
<div class="header">
71-
<div class="dku-title">Selected node</div>
71+
<div class="dku-title">Node {{ selectedNode.id }}</div>
7272
</div>
7373
<div class="node-info-sections">
7474
<div class="section-tall">

0 commit comments

Comments
 (0)