@@ -1,15 +1,14 @@
 import numpy as np
+from collections import Counter, Iterable
 import pandas
 import numbers
 from warnings import warn
+
 from sklearn.base import BaseEstimator
 from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
 from sklearn.utils.multiclass import check_classification_targets
-
 from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
-
 from sklearn.ensemble import BaggingClassifier, BaggingRegressor
-
 from sklearn.externals import six
 from sklearn.tree import _tree
 
@@ -36,11 +35,6 @@ class SkopeRules(BaseEstimator):
         The number of base estimators (rules) to use for prediction. More are
         built before selection. All are available in the estimators_ attribute.
 
-    similarity_thres : float, optional (default=0.99)
-        Similarity threshold between rules. Rules too similar
-        (> similarity_thres) are fused. The similarity between two rules is
-        computed according to the formula `# {intersection} / # {union}`.
-
     max_samples : int or float, optional (default=.8)
         The number of samples to draw from X to train each decision tree, from
         which rules are generated and selected.
@@ -61,10 +55,17 @@ class SkopeRules(BaseEstimator):
     bootstrap_features : boolean, optional (default=False)
         Whether features are drawn with replacement.
 
-    max_depth : integer or None, optional (default=3)
+    max_depth : integer or List or None, optional (default=3)
         The maximum depth of the decision trees. If None, then nodes are
         expanded until all leaves are pure or until all leaves contain less
         than min_samples_split samples.
+        If an iterable is passed, n_estimators trees are trained for each
+        depth, which makes it possible to generate and compare rules of
+        different lengths.
+
+    max_depth_duplication : integer, optional (default=None)
+        The maximum depth of the decision tree used for rule deduplication.
+        If None, no deduplication is performed.
 
     max_features : int, float, string or None, optional (default="auto")
         The number of features considered (by each decision tree) when looking
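
For illustration, a minimal usage sketch of the two parameters documented
above (the skrules import path and the synthetic data are assumptions made
for the example, not part of this patch):

    import numpy as np
    from skrules import SkopeRules  # assumed import path for this class

    rng = np.random.RandomState(42)
    X = rng.uniform(size=(1000, 5))
    y = (X[:, 0] > 0.8).astype(int)  # rare positives labeled 1

    # n_estimators trees are grown per depth, so depths [2, 4] yield rules
    # of two different lengths; similar rules are then merged with a
    # depth-2 deduplication tree:
    clf = SkopeRules(n_estimators=10,
                     max_depth=[2, 4],
                     max_depth_duplication=2)
    clf.fit(X, y)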
@@ -138,12 +139,12 @@ def __init__(self,
                  precision_min=0.5,
                  recall_min=0.01,
                  n_estimators=10,
-                 similarity_thres=0.95,
                  max_samples=.8,
                  max_samples_features=1.,
                  bootstrap=False,
                  bootstrap_features=False,
                  max_depth=3,
+                 max_depth_duplication=None,
                  max_features=1.,
                  min_samples_split=2,
                  n_jobs=1,
@@ -153,12 +154,14 @@ def __init__(self,
         self.recall_min = recall_min
         self.feature_names = feature_names
         self.n_estimators = n_estimators
-        self.similarity_thres = similarity_thres
         self.max_samples = max_samples
         self.max_samples_features = max_samples_features
         self.bootstrap = bootstrap
         self.bootstrap_features = bootstrap_features
         self.max_depth = max_depth
+        self.max_depths = max_depth \
+            if isinstance(max_depth, Iterable) else [max_depth]
+        self.max_depth_duplication = max_depth_duplication
         self.max_features = max_features
         self.min_samples_split = min_samples_split
         self.n_jobs = n_jobs
@@ -202,6 +205,9 @@ def fit(self, X, y, sample_weight=None):
202205 " in the data, but the data contains only one"
203206 " class: %r" % self .classes_ [0 ])
204207
208+ if not isinstance (self .max_depth_duplication , int ) and self .max_depth_duplication is not None :
209+ raise ValueError ("max_depth_duplication should be an integer"
210+ )
205211 if not set (self .classes_ ) == set ([0 , 1 ]):
206212 warn ("Found labels %s. This method assumes target class to be"
207213 " labeled as 1 and normal data to be labeled as 0. Any label"
@@ -210,11 +216,6 @@ def fit(self, X, y, sample_weight=None):
                  % set(self.classes_))
             y = (y > 0)
 
-        # ensure similarity_thres is in (0., 1.]:
-        if not (0. < self.similarity_thres <= 1.):
-            raise ValueError("similarity_thres must be in (0, 1], got %r"
-                             % self.similarity_thres)
-
         # ensure that max_samples is in [1, n_samples]:
         n_samples = X.shape[0]
 
@@ -250,40 +251,44 @@ def fit(self, X, y, sample_weight=None):
                           else ['c' + x for x in
                                 np.arange(X.shape[1]).astype(str)])
         self.feature_names_ = feature_names_
-
-        bagging_clf = BaggingClassifier(
-            base_estimator=DecisionTreeClassifier(
-                max_depth=self.max_depth,
-                max_features=self.max_features,
-                min_samples_split=self.min_samples_split),
-            n_estimators=self.n_estimators,
-            max_samples=self.max_samples_,
-            max_features=self.max_samples_features,
-            bootstrap=self.bootstrap,
-            bootstrap_features=self.bootstrap_features,
-            # oob_score=... XXX may be added if selection on tree perf needed.
-            # warm_start=... XXX may be added to increase computation perf.
-            n_jobs=self.n_jobs,
-            random_state=self.random_state,
-            verbose=self.verbose)
-
-        bagging_reg = BaggingRegressor(
-            base_estimator=DecisionTreeRegressor(
-                max_depth=self.max_depth,
-                max_features=self.max_features,
-                min_samples_split=self.min_samples_split),
-            n_estimators=self.n_estimators,
-            max_samples=self.max_samples_,
-            max_features=self.max_samples_features,
-            bootstrap=self.bootstrap,
-            bootstrap_features=self.bootstrap_features,
-            # oob_score=... XXX may be added if selection on tree perf needed.
-            # warm_start=... XXX may be added to increase computation perf.
-            n_jobs=self.n_jobs,
-            random_state=self.random_state,
-            verbose=self.verbose)
-
-        bagging_clf.fit(X, y)
+        clfs = []
+        regs = []
+
+        for max_depth in self.max_depths:
+            bagging_clf = BaggingClassifier(
+                base_estimator=DecisionTreeClassifier(
+                    max_depth=max_depth,
+                    max_features=self.max_features,
+                    min_samples_split=self.min_samples_split),
+                n_estimators=self.n_estimators,
+                max_samples=self.max_samples_,
+                max_features=self.max_samples_features,
+                bootstrap=self.bootstrap,
+                bootstrap_features=self.bootstrap_features,
+                # oob_score=... XXX may be added if selection on tree perf needed.
+                # warm_start=... XXX may be added to increase computation perf.
+                n_jobs=self.n_jobs,
+                random_state=self.random_state,
+                verbose=self.verbose)
+
+            bagging_reg = BaggingRegressor(
+                base_estimator=DecisionTreeRegressor(
+                    max_depth=max_depth,
+                    max_features=self.max_features,
+                    min_samples_split=self.min_samples_split),
+                n_estimators=self.n_estimators,
+                max_samples=self.max_samples_,
+                max_features=self.max_samples_features,
+                bootstrap=self.bootstrap,
+                bootstrap_features=self.bootstrap_features,
+                # oob_score=... XXX may be added if selection on tree perf needed.
+                # warm_start=... XXX may be added to increase computation perf.
+                n_jobs=self.n_jobs,
+                random_state=self.random_state,
+                verbose=self.verbose)
+
+            clfs.append(bagging_clf)
+            regs.append(bagging_reg)
 
         # define regression target:
         if sample_weight is not None:
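
Note: the loop above builds one bagging classifier and one bagging regressor
per requested depth, each holding n_estimators trees, so after fitting (next
hunk) estimators_ contains 2 * n_estimators * len(max_depths) trees. A quick
sanity check, reusing the hypothetical clf from the earlier sketch:

    # two ensembles per depth, n_estimators trees in each:
    expected = 2 * clf.n_estimators * len(clf.max_depths)
    assert len(clf.estimators_) == expected  # 40 for the example above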
@@ -298,16 +303,17 @@ def fit(self, X, y, sample_weight=None):
         else:
             y_reg = y  # same as another classification bagging
 
-        bagging_reg.fit(X, y_reg)
-
-        self.estimators_ += bagging_clf.estimators_
-        self.estimators_ += bagging_reg.estimators_
+        for clf in clfs:
+            clf.fit(X, y)
+            self.estimators_ += clf.estimators_
+            self.estimators_samples_ += clf.estimators_samples_
+            self.estimators_features_ += clf.estimators_features_
 
-        self.estimators_samples_ += bagging_clf.estimators_samples_
-        self.estimators_samples_ += bagging_reg.estimators_samples_
-
-        self.estimators_features_ += bagging_clf.estimators_features_
-        self.estimators_features_ += bagging_reg.estimators_features_
+        for reg in regs:
+            reg.fit(X, y_reg)
+            self.estimators_ += reg.estimators_
+            self.estimators_samples_ += reg.estimators_samples_
+            self.estimators_features_ += reg.estimators_features_
 
         rules_ = []
         for estimator, samples, features in zip(self.estimators_,
@@ -357,34 +363,9 @@ def fit(self, X, y, sample_weight=None):
         self.rules_ = sorted(self.rules_.items(),
                              key=lambda x: (x[1][0], x[1][1]), reverse=True)
 
-        # removing rules which have very similar domains
-        X_ = pandas.DataFrame(X, columns=np.array(self.feature_names_))
-        omit_these_rules_list = []
-        perimeter_index_of_all_rules = []
-        for i in range(len(self.rules_)):
-            current = self.rules_[i]
-            perimeter_index_of_all_rules.append(
-                set(list(X_.query(current[0]).index))
-            )
-            index_current = perimeter_index_of_all_rules[i]
-
-            for j in range(i):
-                if j in omit_these_rules_list:
-                    continue
-                # if a rule have already been discarded,
-                # it should not be processed again
-
-                index_rival = perimeter_index_of_all_rules[j]
-                size_union = len(index_rival.union(index_current))
-                size_intersection = len(
-                    index_rival.intersection(index_current))
-
-                if float(size_intersection) / size_union > self.similarity_thres:
-                    omit_these_rules_list.append(j)
-
-        self.rules_ = [self.rules_[i] for i in range(
-            len(self.rules_)) if i not in omit_these_rules_list]
-
+        # deduplicate similar rules if requested:
+        if self.max_depth_duplication is not None:
+            self.rules_ = self.deduplicate(self.rules_)
         return self
 
     def predict(self, X):
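
After fit, self.rules_ is a list of (rule, stats) pairs sorted by performance:
each rule is a string of terms joined by ' and ', and stats[0], stats[1] are
the rule's precision and recall, the fields used by the sort key above and by
f1_score further down. An illustrative shape, with made-up values:

    # hypothetical contents of clf.rules_ after fit:
    rules_ = [
        ('c0 > 0.8', (0.95, 0.30)),
        ('c0 > 0.79 and c2 <= 0.5', (0.93, 0.12)),
    ]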
@@ -613,3 +594,73 @@ def _eval_rule_perf(self, rule, X, y):
             return (0, 0)
         pos = y[y > 0].sum()
         return y_detected.mean(), float(true_pos) / pos
+
+    def deduplicate(self, rules):
+        return [max(rules_set, key=self.f1_score)
+                for rules_set in self._find_similar_rulesets(rules)]
+
+    def _find_similar_rulesets(self, rules):
+        """Create clusters of rules using a decision tree based
+        on the terms of the rules.
+
+        Parameters
+        ----------
+        rules : List, List of rules
+            The rules that should be split into subsets of similar rules.
+
+        Returns
+        -------
+        rules : List of list of rules
+            The different sets of rules. Each set should be homogeneous.
+
+        """
+        def split_with_best_feature(rules, depth, exceptions=[]):
+            """
+            Split rules recursively on their most represented feature term.
+            """
+            if depth == 0:
+                return rules
+
+            rulelist = [rule.split(' and ') for rule, score in rules]
+            terms = [t.split(' ')[0] for rule_terms in rulelist for t in rule_terms]
+            counter = Counter(terms)
+            # drop terms already used higher up in the recursion:
+            for exception in exceptions:
+                del counter[exception]
+
+            if len(counter) == 0:
+                return rules
+
+            most_represented_term = counter.most_common()[0][0]
+            # split on the most represented term:
+            rules_splitted = [[], [], []]
+            for rule in rules:
+                if (most_represented_term + ' <=') in rule[0]:
+                    rules_splitted[0].append(rule)
+                elif (most_represented_term + ' >') in rule[0]:
+                    rules_splitted[1].append(rule)
+                else:
+                    rules_splitted[2].append(rule)
+            new_exceptions = exceptions + [most_represented_term]
+            # recurse on each subset with the remaining terms:
+            return [split_with_best_feature(ruleset,
+                                            depth - 1,
+                                            exceptions=new_exceptions)
+                    for ruleset in rules_splitted]
+
+        def breadth_first_search(rules, leaves=None):
+            if len(rules) == 0 or not isinstance(rules[0], list):
+                if len(rules) > 0:
+                    leaves.append(rules)
+            else:
+                for rules_child in rules:
+                    breadth_first_search(rules_child, leaves=leaves)
+            return leaves
+        leaves = []
+        res = split_with_best_feature(rules, self.max_depth_duplication)
+        breadth_first_search(res, leaves=leaves)
+        return leaves
+
+    def f1_score(self, x):
+        return 2 * x[1][0] * x[1][1] / \
+               (x[1][0] + x[1][1]) if (x[1][0] + x[1][1]) > 0 else 0
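
To make the deduplication concrete, here is a standalone walk-through of the
same splitting and selection logic on invented rules (a sketch mirroring
split_with_best_feature and deduplicate, not a call into the class):

    from collections import Counter

    rules = [
        ('a <= 4 and b > 1', (0.9, 0.3)),
        ('a <= 5 and b > 1', (0.8, 0.5)),
        ('a > 4', (0.6, 0.2)),
        ('c <= 2', (0.7, 0.1)),
    ]

    # the most represented feature across all terms is 'a':
    terms = [t.split(' ')[0] for rule, _ in rules for t in rule.split(' and ')]
    print(Counter(terms).most_common(1))  # [('a', 3)]

    # a depth-1 split on 'a' gives the three buckets of rules_splitted:
    # 'a <=' rules (the first two), 'a >' rules (the third), rest (the fourth).
    # deduplicate keeps the best rule of each bucket by F1 score:
    def f1(rule):
        p, r = rule[1]
        return 2 * p * r / (p + r) if (p + r) > 0 else 0

    print(max(rules[:2], key=f1))  # ('a <= 5 and b > 1', (0.8, 0.5))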