Merge pull request #30 from floriangardin/master

datajms · web-flow · commit 59c03c5cb585 · 2018-02-13T17:56:48.000+01:00
add factorization of rules
diff --git a/skrules/__init__.py b/skrules/__init__.py
@@ -1,3 +1,4 @@
 from .skope_rules import SkopeRules
+from .rule import Rule
 
-__all__ = ['SkopeRules']
+__all__ = ['SkopeRules', 'Rule']
diff --git a/skrules/rule.py b/skrules/rule.py
@@ -0,0 +1,58 @@
+class Rule:
+    """ An object modeling a logical rule, with factorization methods.
+    It is used to simplify rules and deduplicate them.
+
+    Parameters
+    ----------
+
+    rule : str
+        The logical rule that is interpretable by a pandas query.
+
+    args : object, optional
+        Arguments associated with the rule. They are not used for factorization
+        but are part of the output when the rule is converted to an array.
+    """
+
+    def __init__(self, rule, args=None):
+        self.rule = rule
+        self.args = args
+        self.terms = [t.split(' ') for t in self.rule.split(' and ')]
+        self.agg_dict = {}
+        self.factorize()
+        self.rule = str(self)
+
+    def __eq__(self, other):
+        return self.agg_dict == other.agg_dict
+
+    def __hash__(self):
+        # FIXME : Easier method ?
+        return hash(tuple(sorted(((i, j) for i, j in self.agg_dict.items()))))
+
+    def factorize(self):
+        for feature, symbol, value in self.terms:
+            if (feature, symbol) not in self.agg_dict:
+                if symbol != '==':
+                    self.agg_dict[(feature, symbol)] = str(float(value))
+                else:
+                    self.agg_dict[(feature, symbol)] = value
+            else:
+                if symbol[0] == '<':
+                    self.agg_dict[(feature, symbol)] = str(min(
+                                float(self.agg_dict[(feature, symbol)]),
+                                float(value)))
+                elif symbol[0] == '>':
+                    self.agg_dict[(feature, symbol)] = str(max(
+                                float(self.agg_dict[(feature, symbol)]),
+                                float(value)))
+                else:  # Handle the c0 == c0 case
+                    self.agg_dict[(feature, symbol)] = value
+
+    def __iter__(self):
+        yield str(self)
+        yield self.args
+
+    def __repr__(self):
+        return ' and '.join([' '.join(
+                [feature, symbol, str(self.agg_dict[(feature, symbol)])])
+                for feature, symbol in sorted(self.agg_dict.keys())
+                ])
diff --git a/skrules/skope_rules.py b/skrules/skope_rules.py
@@ -12,6 +12,8 @@
 from sklearn.externals import six
 from sklearn.tree import _tree
 
+from .rule import Rule
+
 INTEGER_TYPES = (numbers.Integral, np.integer)
 
 
@@ -205,7 +207,8 @@ def fit(self, X, y, sample_weight=None):
                              " in the data, but the data contains only one"
                              " class: %r" % self.classes_[0])
 
-        if not isinstance(self.max_depth_duplication, int) and self.max_depth_duplication is not None:
+        if not isinstance(self.max_depth_duplication, int) \
+                and self.max_depth_duplication is not None:
             raise ValueError("max_depth_duplication should be an integer"
                              )
         if not set(self.classes_) == set([0, 1]):
@@ -265,7 +268,8 @@ def fit(self, X, y, sample_weight=None):
                 max_features=self.max_samples_features,
                 bootstrap=self.bootstrap,
                 bootstrap_features=self.bootstrap_features,
-                # oob_score=... XXX may be added if selection on tree perf needed.
+                # oob_score=... XXX may be added
+                # if selection on tree perf needed.
                 # warm_start=... XXX may be added to increase computation perf.
                 n_jobs=self.n_jobs,
                 random_state=self.random_state,
@@ -281,7 +285,8 @@ def fit(self, X, y, sample_weight=None):
                 max_features=self.max_samples_features,
                 bootstrap=self.bootstrap,
                 bootstrap_features=self.bootstrap_features,
-                # oob_score=... XXX may be added if selection on tree perf needed.
+                # oob_score=... XXX may be added
+                # if selection on tree perf needed.
                 # warm_start=... XXX may be added to increase computation perf.
                 n_jobs=self.n_jobs,
                 random_state=self.random_state,
@@ -345,6 +350,12 @@ def fit(self, X, y, sample_weight=None):
                                    for r in set(rules_from_tree)]
                 rules_ += rules_from_tree
 
+        # Factorize rules before semantic tree filtering
+        rules_ = [
+            tuple(rule)
+            for rule in
+            [Rule(r, args=args) for r, args in rules_]]
+
         # keep only rules verifying precision_min and recall_min:
         for rule, score in rules_:
             if score[0] >= self.precision_min and score[1] >= self.recall_min:
@@ -363,7 +374,7 @@ def fit(self, X, y, sample_weight=None):
         self.rules_ = sorted(self.rules_.items(),
                              key=lambda x: (x[1][0], x[1][1]), reverse=True)
 
-        # count representation of feature
+        # Deduplicate the rules using the semantic tree
         if self.max_depth_duplication is not None:
             self.rules_ = self.deduplicate(self.rules_)
         return self
@@ -576,7 +587,7 @@ def recurse(node, base_name):
             else:
                 rule = str.join(' and ', base_name)
                 rule = (rule if rule != ''
-                        else '=='.join([feature_names[0]] * 2))
+                        else ' == '.join([feature_names[0]] * 2))
                 # a rule selecting all is set to "c0==c0"
                 rules.append(rule)
 
diff --git a/skrules/tests/test_rule.py b/skrules/tests/test_rule.py
@@ -0,0 +1,55 @@
+from sklearn.utils.testing import assert_equal, assert_not_equal
+
+from skrules import Rule
+
+
+def test_rule():
+    assert_equal(Rule('a <= 10 and a <= 12'),
+                 Rule('a <= 10'))
+    assert_equal(Rule('a <= 10 and a <= 12 and a > 3'),
+                 Rule('a > 3 and a <= 10'))
+
+    assert_equal(Rule('a <= 10 and a <= 10 and a > 3'),
+                 Rule('a > 3 and a <= 10'))
+
+    assert_equal(Rule('a <= 10 and a <= 12 and b > 3 and b > 6'),
+                 Rule('a <= 10 and b > 6'))
+
+    assert_equal(len({Rule('a <= 2 and a <= 3'),
+                      Rule('a <= 2')
+                      }), 1)
+
+    assert_equal(len({Rule('a > 2 and a > 3 and b <= 2 and b <= 3'),
+                      Rule('a > 3 and b <= 2')
+                      }), 1)
+
+    assert_equal(len({Rule('a <= 3 and b <= 2'),
+                      Rule('b <= 2 and a <= 3')
+                      }), 1)
+
+
+def test_hash_rule():
+    assert_equal(len({
+                        Rule('a <= 2 and a <= 3'),
+                        Rule('a <= 2')
+                      }), 1)
+    assert_not_equal(len({
+                        Rule('a <= 4 and a <= 3'),
+                        Rule('a <= 2')
+                      }), 1)
+
+
+def test_str_rule():
+    rule = 'a <= 10.0 and b > 3.0'
+    assert_equal(rule, str(Rule(rule)))
+
+
+def test_equals_rule():
+    rule = "a == a"
+    assert_equal(rule, str(Rule(rule)))
+
+    rule2 = "a == a and a == a"
+    assert_equal(rule, str(Rule(rule2)))
+
+    rule3 = "a < 3.0 and a == a"
+    assert_equal(rule3, str(Rule(rule3)))