Commit 4ee939d

Merge pull request #29 from floriangardin/master
[MRG+2] Add new deduplication algorithm
2 parents aa8da03 + 94c0473, commit 4ee939d

File tree: 3 files changed (+203, -95 lines)


examples/plot_credit_default.py

Lines changed: 1 addition & 1 deletion
@@ -149,7 +149,7 @@
 # fit the model
 
 clf = SkopeRules(
-    similarity_thres=.8, max_depth=3, max_features=0.5,
+    max_depth_duplication=3, max_depth=3, max_features=0.5,
     max_samples_features=0.5, random_state=rng, n_estimators=20,
     feature_names=feature_names, recall_min=0.04, precision_min=0.6)
 clf.fit(X_train, y_train)
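For readers updating code that used the old parameter, a minimal sketch of the rename (the import path and the X_train/y_train variables are assumed from the example script, not shown here):

# similarity_thres is gone; max_depth_duplication now controls how deep
# the deduplication tree may grow (None disables deduplication).
from skrules import SkopeRules  # import path assumed

clf = SkopeRules(max_depth_duplication=3,  # was: similarity_thres=.8
                 max_depth=3, max_features=0.5,
                 max_samples_features=0.5, n_estimators=20,
                 recall_min=0.04, precision_min=0.6)
# clf.fit(X_train, y_train)  # as in the example script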

skrules/skope_rules.py

Lines changed: 138 additions & 87 deletions
@@ -1,15 +1,14 @@
 import numpy as np
+from collections import Counter, Iterable
 import pandas
 import numbers
 from warnings import warn
+
 from sklearn.base import BaseEstimator
 from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
 from sklearn.utils.multiclass import check_classification_targets
-
 from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
-
 from sklearn.ensemble import BaggingClassifier, BaggingRegressor
-
 from sklearn.externals import six
 from sklearn.tree import _tree
 
@@ -36,11 +35,6 @@ class SkopeRules(BaseEstimator):
         The number of base estimators (rules) to use for prediction. More are
         built before selection. All are available in the estimators_ attribute.
 
-    similarity_thres : float, optional (default=0.99)
-        Similarity threshold between rules. Rules too similar
-        (> similarity_thres) are fused. The similarity between two rules is
-        computed according to the formula `# {intersection} / # {union}`.
-
     max_samples : int or float, optional (default=.8)
         The number of samples to draw from X to train each decision tree, from
         which rules are generated and selected.
@@ -61,10 +55,17 @@
     bootstrap_features : boolean, optional (default=False)
         Whether features are drawn with replacement.
 
-    max_depth : integer or None, optional (default=3)
+    max_depth : integer or List or None, optional (default=3)
         The maximum depth of the decision trees. If None, then nodes are
         expanded until all leaves are pure or until all leaves contain less
         than min_samples_split samples.
+        If an iterable is passed, n_estimators trees are trained for each
+        tree depth, which allows creating and comparing rules of
+        different lengths.
+
+    max_depth_duplication : integer, optional (default=None)
+        The maximum depth of the decision tree used for rule deduplication;
+        if None, no deduplication occurs.
 
     max_features : int, float, string or None, optional (default="auto")
         The number of features considered (by each decision tree) when looking
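To make the new docstring concrete, a minimal sketch of a list-valued max_depth (parameter values are illustrative):

# Each depth in the list gets its own bagging classifier and bagging
# regressor of n_estimators trees each, so the total tree count is
# 2 * n_estimators * len(max_depth).
clf = SkopeRules(max_depth=[2, 3, 4],      # compare short and long rules
                 max_depth_duplication=2,  # deduplication tree of depth 2
                 n_estimators=10)
# after clf.fit(X, y): len(clf.estimators_) == 2 * 10 * 3 == 60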
@@ -138,12 +139,12 @@ def __init__(self,
                  precision_min=0.5,
                  recall_min=0.01,
                  n_estimators=10,
-                 similarity_thres=0.95,
                  max_samples=.8,
                  max_samples_features=1.,
                  bootstrap=False,
                  bootstrap_features=False,
                  max_depth=3,
+                 max_depth_duplication=None,
                  max_features=1.,
                  min_samples_split=2,
                  n_jobs=1,
@@ -153,12 +154,14 @@
         self.recall_min = recall_min
         self.feature_names = feature_names
         self.n_estimators = n_estimators
-        self.similarity_thres = similarity_thres
         self.max_samples = max_samples
         self.max_samples_features = max_samples_features
         self.bootstrap = bootstrap
         self.bootstrap_features = bootstrap_features
         self.max_depth = max_depth
+        self.max_depths = max_depth \
+            if isinstance(max_depth, Iterable) else [max_depth]
+        self.max_depth_duplication = max_depth_duplication
         self.max_features = max_features
         self.min_samples_split = min_samples_split
         self.n_jobs = n_jobs
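The scalar-or-iterable normalization above is a common idiom; a standalone sketch (note that on Python 3.3+ Iterable lives in collections.abc, and the plain collections import used by this commit was removed in Python 3.10):

from collections.abc import Iterable

def as_depth_list(max_depth):
    # wrap a scalar depth in a one-element list; pass iterables through
    return max_depth if isinstance(max_depth, Iterable) else [max_depth]

print(as_depth_list(3))       # [3]
print(as_depth_list([2, 3]))  # [2, 3]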
@@ -202,6 +205,9 @@ def fit(self, X, y, sample_weight=None):
                              " in the data, but the data contains only one"
                              " class: %r" % self.classes_[0])
 
+        if (self.max_depth_duplication is not None
+                and not isinstance(self.max_depth_duplication, int)):
+            raise ValueError("max_depth_duplication should be an integer")
         if not set(self.classes_) == set([0, 1]):
             warn("Found labels %s. This method assumes target class to be"
                  " labeled as 1 and normal data to be labeled as 0. Any label"
@@ -210,11 +216,6 @@
                  % set(self.classes_))
         y = (y > 0)
 
-        # ensure similarity_thres is in (0., 1.]:
-        if not (0. < self.similarity_thres <= 1.):
-            raise ValueError("similarity_thres must be in (0, 1], got %r"
-                             % self.similarity_thres)
-
         # ensure that max_samples is in [1, n_samples]:
         n_samples = X.shape[0]
 
@@ -250,40 +251,44 @@
                           else ['c' + x for x in
                                 np.arange(X.shape[1]).astype(str)])
         self.feature_names_ = feature_names_
-
-        bagging_clf = BaggingClassifier(
-            base_estimator=DecisionTreeClassifier(
-                max_depth=self.max_depth,
-                max_features=self.max_features,
-                min_samples_split=self.min_samples_split),
-            n_estimators=self.n_estimators,
-            max_samples=self.max_samples_,
-            max_features=self.max_samples_features,
-            bootstrap=self.bootstrap,
-            bootstrap_features=self.bootstrap_features,
-            # oob_score=... XXX may be added if selection on tree perf needed.
-            # warm_start=... XXX may be added to increase computation perf.
-            n_jobs=self.n_jobs,
-            random_state=self.random_state,
-            verbose=self.verbose)
-
-        bagging_reg = BaggingRegressor(
-            base_estimator=DecisionTreeRegressor(
-                max_depth=self.max_depth,
-                max_features=self.max_features,
-                min_samples_split=self.min_samples_split),
-            n_estimators=self.n_estimators,
-            max_samples=self.max_samples_,
-            max_features=self.max_samples_features,
-            bootstrap=self.bootstrap,
-            bootstrap_features=self.bootstrap_features,
-            # oob_score=... XXX may be added if selection on tree perf needed.
-            # warm_start=... XXX may be added to increase computation perf.
-            n_jobs=self.n_jobs,
-            random_state=self.random_state,
-            verbose=self.verbose)
-
-        bagging_clf.fit(X, y)
+        clfs = []
+        regs = []
+
+        for max_depth in self.max_depths:
+            bagging_clf = BaggingClassifier(
+                base_estimator=DecisionTreeClassifier(
+                    max_depth=max_depth,
+                    max_features=self.max_features,
+                    min_samples_split=self.min_samples_split),
+                n_estimators=self.n_estimators,
+                max_samples=self.max_samples_,
+                max_features=self.max_samples_features,
+                bootstrap=self.bootstrap,
+                bootstrap_features=self.bootstrap_features,
+                # oob_score=... XXX may be added if selection on tree perf needed.
+                # warm_start=... XXX may be added to increase computation perf.
+                n_jobs=self.n_jobs,
+                random_state=self.random_state,
+                verbose=self.verbose)
+
+            bagging_reg = BaggingRegressor(
+                base_estimator=DecisionTreeRegressor(
+                    max_depth=max_depth,
+                    max_features=self.max_features,
+                    min_samples_split=self.min_samples_split),
+                n_estimators=self.n_estimators,
+                max_samples=self.max_samples_,
+                max_features=self.max_samples_features,
+                bootstrap=self.bootstrap,
+                bootstrap_features=self.bootstrap_features,
+                # oob_score=... XXX may be added if selection on tree perf needed.
+                # warm_start=... XXX may be added to increase computation perf.
+                n_jobs=self.n_jobs,
+                random_state=self.random_state,
+                verbose=self.verbose)
+
+            clfs.append(bagging_clf)
+            regs.append(bagging_reg)
 
         # define regression target:
         if sample_weight is not None:
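A runnable miniature of the per-depth loop above, outside SkopeRules (sklearn's keyword was base_estimator at the time of this commit; recent releases renamed it estimator):

from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=100, random_state=0)
estimators = []
for max_depth in [2, 3]:  # mirrors iterating over self.max_depths
    bag = BaggingClassifier(
        base_estimator=DecisionTreeClassifier(max_depth=max_depth),
        n_estimators=5, random_state=0)
    bag.fit(X, y)
    estimators += bag.estimators_  # pool the trees, as fit() does below
print(len(estimators))  # 10: five trees per depth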
@@ -298,16 +303,17 @@
         else:
             y_reg = y  # same as another classification bagging
 
-        bagging_reg.fit(X, y_reg)
-
-        self.estimators_ += bagging_clf.estimators_
-        self.estimators_ += bagging_reg.estimators_
+        for clf in clfs:
+            clf.fit(X, y)
+            self.estimators_ += clf.estimators_
+            self.estimators_samples_ += clf.estimators_samples_
+            self.estimators_features_ += clf.estimators_features_
 
-        self.estimators_samples_ += bagging_clf.estimators_samples_
-        self.estimators_samples_ += bagging_reg.estimators_samples_
-
-        self.estimators_features_ += bagging_clf.estimators_features_
-        self.estimators_features_ += bagging_reg.estimators_features_
+        for reg in regs:
+            reg.fit(X, y_reg)
+            self.estimators_ += reg.estimators_
+            self.estimators_samples_ += reg.estimators_samples_
+            self.estimators_features_ += reg.estimators_features_
 
         rules_ = []
         for estimator, samples, features in zip(self.estimators_,
@@ -357,34 +363,9 @@
         self.rules_ = sorted(self.rules_.items(),
                              key=lambda x: (x[1][0], x[1][1]), reverse=True)
 
-        # removing rules which have very similar domains
-        X_ = pandas.DataFrame(X, columns=np.array(self.feature_names_))
-        omit_these_rules_list = []
-        perimeter_index_of_all_rules = []
-        for i in range(len(self.rules_)):
-            current = self.rules_[i]
-            perimeter_index_of_all_rules.append(
-                set(list(X_.query(current[0]).index))
-            )
-            index_current = perimeter_index_of_all_rules[i]
-
-            for j in range(i):
-                if j in omit_these_rules_list:
-                    continue
-                # if a rule has already been discarded,
-                # it should not be processed again
-
-                index_rival = perimeter_index_of_all_rules[j]
-                size_union = len(index_rival.union(index_current))
-                size_intersection = len(
-                    index_rival.intersection(index_current))
-
-                if float(size_intersection)/size_union > self.similarity_thres:
-                    omit_these_rules_list.append(j)
-
-        self.rules_ = [self.rules_[i] for i in range(
-            len(self.rules_)) if i not in omit_these_rules_list]
-
+        # deduplicate similar rules, keeping the best rule of each cluster
+        if self.max_depth_duplication is not None:
+            self.rules_ = self.deduplicate(self.rules_)
         return self
 
     def predict(self, X):
@@ -613,3 +594,73 @@ def _eval_rule_perf(self, rule, X, y):
             return (0, 0)
         pos = y[y > 0].sum()
         return y_detected.mean(), float(true_pos) / pos
+
+    def deduplicate(self, rules):
+        return [max(rules_set, key=self.f1_score)
+                for rules_set in self._find_similar_rulesets(rules)]
+
+    def _find_similar_rulesets(self, rules):
+        """Create clusters of rules using a decision tree based
+        on the terms of the rules.
+
+        Parameters
+        ----------
+        rules : List, List of rules
+            The rules that should be split into subsets of similar rules.
+
+        Returns
+        -------
+        rules : List of lists of rules
+            The different sets of rules. Each set should be homogeneous.
+
+        """
+        def split_with_best_feature(rules, depth, exceptions=[]):
+            """
+            Split the rules on the most represented feature among their terms.
+            """
+            if depth == 0:
+                return rules
+
+            rulelist = [rule.split(' and ') for rule, score in rules]
+            terms = [t.split(' ')[0] for term in rulelist for t in term]
+            counter = Counter(terms)
+            # drop features already used higher up in the recursion
+            for exception in exceptions:
+                del counter[exception]
+
+            if len(counter) == 0:
+                return rules
+
+            most_represented_term = counter.most_common()[0][0]
+            # proceed to split on that feature
+            rules_splitted = [[], [], []]
+            for rule in rules:
+                if (most_represented_term + ' <=') in rule[0]:
+                    rules_splitted[0].append(rule)
+                elif (most_represented_term + ' >') in rule[0]:
+                    rules_splitted[1].append(rule)
+                else:
+                    rules_splitted[2].append(rule)
+            new_exceptions = exceptions + [most_represented_term]
+            # recurse on each subset, excluding the feature just used
+            return [split_with_best_feature(ruleset,
+                                            depth - 1,
+                                            exceptions=new_exceptions)
+                    for ruleset in rules_splitted]
+
+        def breadth_first_search(rules, leaves=None):
+            if len(rules) == 0 or not isinstance(rules[0], list):
+                if len(rules) > 0:
+                    return leaves.append(rules)
+            else:
+                for rules_child in rules:
+                    breadth_first_search(rules_child, leaves=leaves)
+            return leaves
+        leaves = []
+        res = split_with_best_feature(rules, self.max_depth_duplication)
+        breadth_first_search(res, leaves=leaves)
+        return leaves
+
+    def f1_score(self, x):
+        return 2 * x[1][0] * x[1][1] / \
+            (x[1][0] + x[1][1]) if (x[1][0] + x[1][1]) > 0 else 0
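A toy walk-through of the new deduplication path, calling the methods above directly (the skrules import path is assumed; deduplicate only needs max_depth_duplication, not a fitted model). Each rule is a (rule_string, (precision, recall)) pair, the format built earlier in fit():

from skrules import SkopeRules  # import path assumed

rules = [
    ('age <= 40 and income > 1000', (0.70, 0.30)),  # f1 = 0.42
    ('age <= 38 and age > 10', (0.60, 0.20)),       # f1 = 0.30
    ('income > 500', (0.50, 0.45)),                 # f1 ~= 0.47
]
clf = SkopeRules(max_depth_duplication=1)
# A depth-1 split on the most represented feature ('age', three
# occurrences) clusters the two 'age <= ...' rules together and
# 'income > 500' alone; the highest-f1 rule of each cluster survives:
print(clf.deduplicate(rules))
# expected: [('age <= 40 and income > 1000', (0.7, 0.3)),
#            ('income > 500', (0.5, 0.45))]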
