From 66251367c869a9d9ecb4ff6d77914619125f682b Mon Sep 17 00:00:00 2001
From: Andrew Tan
Date: Fri, 30 Apr 2021 05:05:54 +0800
Subject: [PATCH 1/3] fix typo

---
 skrules/datasets/credit_data.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/skrules/datasets/credit_data.py b/skrules/datasets/credit_data.py
index 2f2979a..0528e36 100644
--- a/skrules/datasets/credit_data.py
+++ b/skrules/datasets/credit_data.py
@@ -18,7 +18,17 @@
 import pandas as pd
 import numpy as np
 
+<<<<<<< HEAD
 from sklearn.datasets.base import get_data_home, Bunch
+=======
+
+try:
+    from sklearn.datasets.base import get_data_home, Bunch
+except ModuleNotFoundError:
+    from sklearn.datasets import get_data_home
+    from sklearn.utils import Bunch
+
+>>>>>>> aa9588c... fix typo
 from sklearn.datasets.base import _fetch_remote, RemoteFileMetadata
 from os.path import exists, join

From 5eb8a005bce05679495646bf7654b0503fc72fad Mon Sep 17 00:00:00 2001
From: Andrew Tan
Date: Fri, 30 Apr 2021 06:00:52 +0800
Subject: [PATCH 2/3] more fixes for tests

---
 skrules/datasets/credit_data.py   |  11 +-
 skrules/tests/test_common.py      |  21 ++-
 skrules/tests/test_rule.py        |  57 +++---
 skrules/tests/test_skope_rules.py | 279 ++++++++++++++++++++----------
 4 files changed, 229 insertions(+), 139 deletions(-)

diff --git a/skrules/datasets/credit_data.py b/skrules/datasets/credit_data.py
index 0528e36..b682541 100644
--- a/skrules/datasets/credit_data.py
+++ b/skrules/datasets/credit_data.py
@@ -18,18 +18,15 @@
 import pandas as pd
 import numpy as np
 
-<<<<<<< HEAD
-from sklearn.datasets.base import get_data_home, Bunch
-=======
 
 try:
-    from sklearn.datasets.base import get_data_home, Bunch
-except ModuleNotFoundError:
+    from sklearn.datasets.base import get_data_home, Bunch, _fetch_remote, RemoteFileMetadata
+except (ModuleNotFoundError, ImportError):
+    # For scikit-learn >= 0.24 compatibility
     from sklearn.datasets import get_data_home
     from sklearn.utils import Bunch
+    from sklearn.datasets._base import _fetch_remote, RemoteFileMetadata
 
->>>>>>> aa9588c... fix typo
-from sklearn.datasets.base import _fetch_remote, RemoteFileMetadata
 
 from os.path import exists, join
diff --git a/skrules/tests/test_common.py b/skrules/tests/test_common.py
index b8d9380..ad0f8ef 100644
--- a/skrules/tests/test_common.py
+++ b/skrules/tests/test_common.py
@@ -1,10 +1,29 @@
 from sklearn.utils.estimator_checks import check_estimator
 from skrules import SkopeRules
 from skrules.datasets import load_credit_data
+import warnings
 
 
 def test_classifier():
-    check_estimator(SkopeRules)
+    try:
+        check_estimator(SkopeRules)
+    except TypeError:
+        # For sklearn >= 0.24.0 compatibility
+        from sklearn.exceptions import SkipTestWarning
+        from sklearn.utils._testing import SkipTest
+        from sklearn.utils.estimator_checks import check_sample_weights_invariance
+
+        checks = check_estimator(SkopeRules(), generate_only=True)
+        for estimator, check in checks:
+            # Here we ignore this particular estimator check because
+            # sample weights are treated differently in skope-rules
+            if check.func != check_sample_weights_invariance:
+                try:
+                    check(estimator)
+                except SkipTest as exception:
+                    # SkipTest is thrown when pandas can't be imported, or by checks
+                    # that are in the xfail_checks tag
+                    warnings.warn(str(exception), SkipTestWarning)
 
 
 def test_load_credit_data():
diff --git a/skrules/tests/test_rule.py b/skrules/tests/test_rule.py
index ab1048a..0dc1914 100644
--- a/skrules/tests/test_rule.py
+++ b/skrules/tests/test_rule.py
@@ -1,65 +1,48 @@
-from sklearn.utils.testing import assert_equal, assert_not_equal
-
 from skrules import Rule, replace_feature_name
 
 
 def test_rule():
-    assert_equal(Rule('a <= 10 and a <= 12'),
-                 Rule('a <= 10'))
-    assert_equal(Rule('a <= 10 and a <= 12 and a > 3'),
-                 Rule('a > 3 and a <= 10'))
+    assert Rule("a <= 10 and a <= 12") == Rule("a <= 10")
+    assert Rule("a <= 10 and a <= 12 and a > 3") == Rule("a > 3 and a <= 10")
 
-    assert_equal(Rule('a <= 10 and a <= 10 and a > 3'),
-                 Rule('a > 3 and a <= 10'))
+    assert Rule("a <= 10 and a <= 10 and a > 3") == Rule("a > 3 and a <= 10")
 
-    assert_equal(Rule('a <= 10 and a <= 12 and b > 3 and b > 6'),
-                 Rule('a <= 10 and b > 6'))
+    assert Rule("a <= 10 and a <= 12 and b > 3 and b > 6") == Rule("a <= 10 and b > 6")
 
-    assert_equal(len({Rule('a <= 2 and a <= 3'),
-                      Rule('a <= 2')
-                      }), 1)
+    assert len({Rule("a <= 2 and a <= 3"), Rule("a <= 2")}) == 1
 
-    assert_equal(len({Rule('a > 2 and a > 3 and b <= 2 and b <= 3'),
-                      Rule('a > 3 and b <= 2')
-                      }), 1)
+    assert (
+        len({Rule("a > 2 and a > 3 and b <= 2 and b <= 3"), Rule("a > 3 and b <= 2")})
+        == 1
+    )
 
-    assert_equal(len({Rule('a <= 3 and b <= 2'),
-                      Rule('b <= 2 and a <= 3')
-                      }), 1)
+    assert len({Rule("a <= 3 and b <= 2"), Rule("b <= 2 and a <= 3")}) == 1
 
 
 def test_hash_rule():
-    assert_equal(len({
-        Rule('a <= 2 and a <= 3'),
-        Rule('a <= 2')
-    }), 1)
-    assert_not_equal(len({
-        Rule('a <= 4 and a <= 3'),
-        Rule('a <= 2')
-    }), 1)
+    assert len({Rule("a <= 2 and a <= 3"), Rule("a <= 2")}) == 1
+    assert len({Rule("a <= 4 and a <= 3"), Rule("a <= 2")}) != 1
 
 
 def test_str_rule():
-    rule = 'a <= 10.0 and b > 3.0'
-    assert_equal(rule, str(Rule(rule)))
+    rule = "a <= 10.0 and b > 3.0"
+    assert rule == str(Rule(rule))
 
 
 def test_equals_rule():
     rule = "a == a"
-    assert_equal(rule, str(Rule(rule)))
+    assert rule == str(Rule(rule))
 
     rule2 = "a == a and a == a"
-    assert_equal(rule, str(Rule(rule2)))
+    assert rule == str(Rule(rule2))
 
     rule3 = "a < 3.0 and a == a"
-    assert_equal(rule3, str(Rule(rule3)))
+    assert rule3 == str(Rule(rule3))
 
 
 def test_replace_feature_name():
     rule = "__C__0 <= 3 and __C__1 > 4"
     real_rule = "$b <= 3 and c(4) > 4"
 
-    replace_dict = {
-        "__C__0": "$b",
-        "__C__1": "c(4)"
-    }
-    assert_equal(replace_feature_name(rule, replace_dict=replace_dict), real_rule)
+    replace_dict = {"__C__0": "$b", "__C__1": "c(4)"}
+    assert replace_feature_name(rule, replace_dict=replace_dict) == real_rule
+
diff --git a/skrules/tests/test_skope_rules.py b/skrules/tests/test_skope_rules.py
index 238871e..237f91a 100644
--- a/skrules/tests/test_skope_rules.py
+++ b/skrules/tests/test_skope_rules.py
@@ -9,17 +9,10 @@
 from sklearn.metrics import accuracy_score
 from sklearn.utils import check_random_state
 
-from sklearn.utils.testing import assert_array_equal
-from sklearn.utils.testing import assert_raises
-from sklearn.utils.testing import assert_warns_message
-from sklearn.utils.testing import assert_equal
-from sklearn.utils.testing import assert_in
-from sklearn.utils.testing import assert_not_in
-from sklearn.utils.testing import assert_not_equal
-from sklearn.utils.testing import assert_no_warnings
-from sklearn.utils.testing import assert_greater
-from sklearn.utils.testing import ignore_warnings
+from numpy.testing import assert_array_equal
+from unittest import TestCase
+import warnings
 
 from skrules import SkopeRules
@@ -39,95 +32,98 @@
 boston.data = boston.data[perm]
 boston.target = boston.target[perm]
 
+_dummy = TestCase("__init__")
+
 
 def test_skope_rules():
     """Check various parameter settings."""
-    X_train = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1],
-               [6, 3], [-4, -7]]
+    X_train = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, -7]]
     y_train = [0] * 6 + [1] * 2
     X_test = np.array([[2, 1], [1, 1]])
-    grid = ParameterGrid({
-        "feature_names": [None, ['a', 'b']],
-        "precision_min": [0.],
-        "recall_min": [0.],
-        "n_estimators": [1],
-        "max_samples": [0.5, 4],
-        "max_samples_features": [0.5, 2],
-        "bootstrap": [True, False],
-        "bootstrap_features": [True, False],
-        "max_depth": [2],
-        "max_features": ["auto", 1, 0.1],
-        "min_samples_split": [2, 0.1],
-        "n_jobs": [-1, 2]})
-
-    with ignore_warnings():
-        for params in grid:
-            SkopeRules(random_state=rng,
-                       **params).fit(X_train, y_train).predict(X_test)
+    grid = ParameterGrid(
+        {
+            "feature_names": [None, ["a", "b"]],
+            "precision_min": [0.0],
+            "recall_min": [0.0],
+            "n_estimators": [1],
+            "max_samples": [0.5, 4],
+            "max_samples_features": [0.5, 2],
+            "bootstrap": [True, False],
+            "bootstrap_features": [True, False],
+            "max_depth": [2],
+            "max_features": ["auto", 1, 0.1],
+            "min_samples_split": [2, 0.1],
+            "n_jobs": [-1, 2],
+        }
+    )
+
+    for params in grid:
+        SkopeRules(random_state=rng, **params).fit(X_train, y_train).predict(X_test)
 
     # additional parameters:
-    SkopeRules(n_estimators=50,
-               max_samples=1.,
-               recall_min=0.,
-               precision_min=0.).fit(X_train, y_train).predict(X_test)
+    SkopeRules(n_estimators=50, max_samples=1.0, recall_min=0.0, precision_min=0.0).fit(
+        X_train, y_train
+    ).predict(X_test)
 
 
 def test_skope_rules_error():
     """Test that it gives proper exception on deficient input."""
     X = iris.data
     y = iris.target
-    y = (y != 0)
+    y = y != 0
 
     # Test max_samples
-    assert_raises(ValueError,
-                  SkopeRules(max_samples=-1).fit, X, y)
-    assert_raises(ValueError,
-                  SkopeRules(max_samples=0.0).fit, X, y)
-    assert_raises(ValueError,
-                  SkopeRules(max_samples=2.0).fit, X, y)
+    _dummy.assertRaises(ValueError, SkopeRules(max_samples=-1).fit, X, y)
+    _dummy.assertRaises(ValueError, SkopeRules(max_samples=0.0).fit, X, y)
+    _dummy.assertRaises(ValueError, SkopeRules(max_samples=2.0).fit, X, y)
     # explicitly setting max_samples > n_samples should result in a warning.
-    assert_warns_message(UserWarning,
-                         "max_samples will be set to n_samples for estimation",
-                         SkopeRules(max_samples=1000).fit, X, y)
+    assert_warns_message(
+        UserWarning,
+        "max_samples will be set to n_samples for estimation",
+        SkopeRules(max_samples=1000).fit,
+        X,
+        y,
+    )
     assert_no_warnings(SkopeRules(max_samples=np.int64(2)).fit, X, y)
-    assert_raises(ValueError, SkopeRules(max_samples='foobar').fit, X, y)
-    assert_raises(ValueError, SkopeRules(max_samples=1.5).fit, X, y)
-    assert_raises(ValueError, SkopeRules(max_depth_duplication=1.5).fit, X, y)
-    assert_raises(ValueError, SkopeRules().fit(X, y).predict, X[:, 1:])
-    assert_raises(ValueError, SkopeRules().fit(X, y).decision_function,
-                  X[:, 1:])
-    assert_raises(ValueError, SkopeRules().fit(X, y).rules_vote, X[:, 1:])
-    assert_raises(ValueError, SkopeRules().fit(X, y).score_top_rules,
-                  X[:, 1:])
+    _dummy.assertRaises(ValueError, SkopeRules(max_samples="foobar").fit, X, y)
+    _dummy.assertRaises(ValueError, SkopeRules(max_samples=1.5).fit, X, y)
+    _dummy.assertRaises(ValueError, SkopeRules(max_depth_duplication=1.5).fit, X, y)
+    _dummy.assertRaises(ValueError, SkopeRules().fit(X, y).predict, X[:, 1:])
+    _dummy.assertRaises(ValueError, SkopeRules().fit(X, y).decision_function, X[:, 1:])
+    _dummy.assertRaises(ValueError, SkopeRules().fit(X, y).rules_vote, X[:, 1:])
+    _dummy.assertRaises(ValueError, SkopeRules().fit(X, y).score_top_rules, X[:, 1:])
 
 
 def test_max_samples_attribute():
     X = iris.data
     y = iris.target
-    y = (y != 0)
+    y = y != 0
 
-    clf = SkopeRules(max_samples=1.).fit(X, y)
-    assert_equal(clf.max_samples_, X.shape[0])
+    clf = SkopeRules(max_samples=1.0).fit(X, y)
+    assert clf.max_samples_ == X.shape[0]
 
     clf = SkopeRules(max_samples=500)
-    assert_warns_message(UserWarning,
-                         "max_samples will be set to n_samples for estimation",
-                         clf.fit, X, y)
-    assert_equal(clf.max_samples_, X.shape[0])
+    assert_warns_message(
+        UserWarning,
+        "max_samples will be set to n_samples for estimation",
+        clf.fit,
+        X,
+        y,
+    )
+    assert clf.max_samples_ == X.shape[0]
 
     clf = SkopeRules(max_samples=0.4).fit(X, y)
-    assert_equal(clf.max_samples_, 0.4*X.shape[0])
+    assert clf.max_samples_ == 0.4 * X.shape[0]
 
 
 def test_skope_rules_works():
     # toy sample (the last two samples are outliers)
     X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [4, -7]]
     y = [0] * 6 + [1] * 2
-    X_test = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1],
-              [10, 5], [5, -7]]
+    X_test = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [10, 5], [5, -7]]
     # Test LOF
-    clf = SkopeRules(random_state=rng, max_samples=1.)
+    clf = SkopeRules(random_state=rng, max_samples=1.0)
     clf.fit(X, y)
     decision_func = clf.decision_function(X_test)
     rules_vote = clf.rules_vote(X_test)
@@ -135,10 +131,9 @@ def test_skope_rules_works():
     pred = clf.predict(X_test)
     pred_score_top_rules = clf.predict_top_rules(X_test, 1)
     # assert detect outliers:
-    assert_greater(np.min(decision_func[-2:]), np.max(decision_func[:-2]))
-    assert_greater(np.min(rules_vote[-2:]), np.max(rules_vote[:-2]))
-    assert_greater(np.min(score_top_rules[-2:]),
-                   np.max(score_top_rules[:-2]))
+    assert np.min(decision_func[-2:]) > np.max(decision_func[:-2])
+    assert np.min(rules_vote[-2:]) > np.max(rules_vote[:-2])
+    assert np.min(score_top_rules[-2:]) > np.max(score_top_rules[:-2])
     assert_array_equal(pred, 6 * [0] + 2 * [1])
     assert_array_equal(pred_score_top_rules, 6 * [0] + 2 * [1])
 
@@ -147,10 +142,9 @@ def test_deduplication_works():
     # toy sample (the last two samples are outliers)
     X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [4, -7]]
     y = [0] * 6 + [1] * 2
-    X_test = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1],
-              [10, 5], [5, -7]]
+    X_test = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [10, 5], [5, -7]]
     # Test LOF
-    clf = SkopeRules(random_state=rng, max_samples=1., max_depth_duplication=3)
+    clf = SkopeRules(random_state=rng, max_samples=1.0, max_depth_duplication=3)
     clf.fit(X, y)
     decision_func = clf.decision_function(X_test)
     rules_vote = clf.rules_vote(X_test)
@@ -176,27 +170,28 @@ def test_performances():
     # with lists
     clf.fit(X.tolist(), y.tolist())
     y_pred = clf.predict(X)
-    assert_equal(y_pred.shape, (n_samples,))
+    assert y_pred.shape == (n_samples,)
     # training set performance
-    assert_greater(accuracy_score(y, y_pred), 0.83)
+    assert accuracy_score(y, y_pred) > 0.83
 
     # decision_function agrees with predict
     decision = -clf.decision_function(X)
-    assert_equal(decision.shape, (n_samples,))
+    assert decision.shape == (n_samples,)
     dec_pred = (decision.ravel() < 0).astype(np.int)
     assert_array_equal(dec_pred, y_pred)
 
 
 def test_similarity_tree():
     # Test that rules are well splitted
-    rules = [("a <= 2 and b > 45 and c <= 3 and a > 4", (1, 1, 0)),
-             ("a <= 2 and b > 45 and c <= 3 and a > 4", (1, 1, 0)),
-             ("a > 2 and b > 45", (0.5, 0.3, 0)),
-             ("a > 2 and b > 40", (0.5, 0.2, 0)),
-             ("a <= 2 and b <= 45", (1, 1, 0)),
-             ("a > 2 and c <= 3", (1, 1, 0)),
-             ("b > 45", (1, 1, 0)),
-             ]
+    rules = [
+        ("a <= 2 and b > 45 and c <= 3 and a > 4", (1, 1, 0)),
+        ("a <= 2 and b > 45 and c <= 3 and a > 4", (1, 1, 0)),
+        ("a > 2 and b > 45", (0.5, 0.3, 0)),
+        ("a > 2 and b > 40", (0.5, 0.2, 0)),
+        ("a <= 2 and b <= 45", (1, 1, 0)),
+        ("a > 2 and c <= 3", (1, 1, 0)),
+        ("b > 45", (1, 1, 0)),
+    ]
 
     sk = SkopeRules(max_depth_duplication=2)
     rulesets = sk._find_similar_rulesets(rules)
@@ -209,21 +204,117 @@ def test_similarity_tree():
             idx_bags_for_rule.append(idx_bag)
         idx_bags_rules.append(idx_bags_for_rule)
 
-    assert_equal(idx_bags_rules[0], idx_bags_rules[1])
-    assert_not_equal(idx_bags_rules[0], idx_bags_rules[2])
+    assert idx_bags_rules[0] == idx_bags_rules[1]
+    assert idx_bags_rules[0] != idx_bags_rules[2]
 
     # Assert the best rules are kept
     final_rules = sk.deduplicate(rules)
-    assert_in(rules[0], final_rules)
-    assert_in(rules[2], final_rules)
-    assert_not_in(rules[3], final_rules)
+    assert rules[0] in final_rules
+    assert rules[2] in final_rules
+    assert rules[3] not in final_rules
 
 
 def test_f1_score():
     clf = SkopeRules()
-    rule0 = ('a > 0', (0, 0, 0))
-    rule1 = ('a > 0', (0.5, 0.5, 0))
-    rule2 = ('a > 0', (0.5, 0, 0))
-
-    assert_equal(clf.f1_score(rule0), 0)
-    assert_equal(clf.f1_score(rule1), 0.5)
-    assert_equal(clf.f1_score(rule2), 0)
+    rule0 = ("a > 0", (0, 0, 0))
+    rule1 = ("a > 0", (0.5, 0.5, 0))
+    rule2 = ("a > 0", (0.5, 0, 0))
+
+    assert clf.f1_score(rule0) == 0
+    assert clf.f1_score(rule1) == 0.5
+    assert clf.f1_score(rule2) == 0
+
+
+def assert_warns_message(warning_class, message, func, *args, **kw):
+    # very important to avoid uncontrolled state propagation
+    """Test that a certain warning occurs and with a certain message.
+
+    Parameters
+    ----------
+    warning_class : the warning class
+        The class to test for, e.g. UserWarning.
+
+    message : str | callable
+        The message or a substring of the message to test for. If callable,
+        it takes a string as the argument and will trigger an AssertionError
+        if the callable returns `False`.
+
+    func : callable
+        Callable object to trigger warnings.
+
+    *args : the positional arguments to `func`.
+
+    **kw : the keyword arguments to `func`.
+
+    Returns
+    -------
+    result : the return value of `func`
+
+    """
+    with warnings.catch_warnings(record=True) as w:
+        # Cause all warnings to always be triggered.
+        warnings.simplefilter("always")
+        if hasattr(np, "VisibleDeprecationWarning"):
+            # Let's not catch the numpy internal DeprecationWarnings
+            warnings.simplefilter("ignore", np.VisibleDeprecationWarning)
+        # Trigger a warning.
+        result = func(*args, **kw)
+        # Verify some things
+        if not len(w) > 0:
+            raise AssertionError("No warning raised when calling %s" % func.__name__)
+
+        found = [issubclass(warning.category, warning_class) for warning in w]
+        if not any(found):
+            raise AssertionError(
+                "No warning raised for %s with class "
+                "%s" % (func.__name__, warning_class)
+            )
+
+        message_found = False
+        # Checks the message of all warnings belong to warning_class
+        for index in [i for i, x in enumerate(found) if x]:
+            # substring will match, the entire message with typo won't
+            msg = w[index].message  # For Python 3 compatibility
+            msg = str(msg.args[0] if hasattr(msg, "args") else msg)
+            if callable(message):  # add support for certain tests
+                check_in_message = message
+            else:
+
+                def check_in_message(msg):
+                    return message in msg
+
+            if check_in_message(msg):
+                message_found = True
+                break
+
+        if not message_found:
+            raise AssertionError(
+                "Did not receive the message you expected "
+                "('%s') for <%s>, got: '%s'" % (message, func.__name__, msg)
+            )
+
+    return result
+
+
+def assert_no_warnings(func, *args, **kw):
+    """
+    Parameters
+    ----------
+    func
+    *args
+    **kw
+    """
+    # very important to avoid uncontrolled state propagation
+    with warnings.catch_warnings(record=True) as w:
+        warnings.simplefilter("always")
+
+        result = func(*args, **kw)
+        if hasattr(np, "VisibleDeprecationWarning"):
+            # Filter out numpy-specific warnings in numpy >= 1.9
+            w = [e for e in w if e.category is not np.VisibleDeprecationWarning]
+
+        if len(w) > 0:
+            raise AssertionError(
+                "Got warnings when calling %s: [%s]"
+                % (func.__name__, ", ".join(str(warning) for warning in w))
+            )
+    return result

From 94aa49b2fba1d01e5de0f1fa8948c3606ff085ec Mon Sep 17 00:00:00 2001
From: Andrew Tan
Date: Wed, 12 May 2021 16:20:12 +0800
Subject: [PATCH 3/3] try changing socket timeout

---
 skrules/datasets/credit_data.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/skrules/datasets/credit_data.py b/skrules/datasets/credit_data.py
index b682541..a0de5f5 100644
--- a/skrules/datasets/credit_data.py
+++ b/skrules/datasets/credit_data.py
@@ -40,6 +40,8 @@ def load_credit_data():
                                    '011238620f5369220bd60cfc82700933'))
 
     if not exists(join(sk_data_dir, archive.filename)):
+        import socket
+        socket.setdefaulttimeout(180)
         _fetch_remote(archive, dirname=sk_data_dir)
 
     data = pd.read_excel(join(sk_data_dir, archive.filename),
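
For reference, the pattern the three patches converge on — version-tolerant
imports plus a socket timeout around the dataset download — can be sketched
standalone. The module paths mirror the diffs above; the wrapper name
_fetch_with_timeout is illustrative and not part of any patch.

# Older scikit-learn exposed these helpers in sklearn.datasets.base;
# scikit-learn >= 0.24 moved them, so fall back name by name.
try:
    from sklearn.datasets.base import (get_data_home, Bunch,
                                       _fetch_remote, RemoteFileMetadata)
except (ModuleNotFoundError, ImportError):
    from sklearn.datasets import get_data_home
    from sklearn.utils import Bunch
    from sklearn.datasets._base import _fetch_remote, RemoteFileMetadata

import socket


def _fetch_with_timeout(archive, dirname, timeout=180):
    # PATCH 3/3 sets a process-wide default socket timeout so a stalled
    # download raises instead of hanging indefinitely.
    previous = socket.getdefaulttimeout()
    socket.setdefaulttimeout(timeout)
    try:
        return _fetch_remote(archive, dirname=dirname)
    finally:
        # Restoring the old default is a defensive extra; the patch itself
        # leaves the 180-second default in place for the whole process.
        socket.setdefaulttimeout(previous)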