From 1821a800a30fb8fe703f4bcbba088487aea5a10b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?xavier=20dupr=C3=A9?= Date: Sat, 21 Aug 2021 13:21:58 +0200 Subject: [PATCH 1/8] Implements numpy.digitalize with a DecisionTreeRegressor --- .gitignore | 1 + .../ut_mlmodel/test_quantile_regression.py | 4 +- _unittests/ut_mltree/test_tree_digitize.py | 50 +++++++++++++ mlinsights/mltree/__init__.py | 5 +- mlinsights/mltree/_tree_digitize.pyx | 45 ++++++++++++ mlinsights/mltree/tree_digitize.py | 71 +++++++++++++++++++ setup.py | 12 ++-- 7 files changed, 180 insertions(+), 8 deletions(-) create mode 100644 _unittests/ut_mltree/test_tree_digitize.py create mode 100644 mlinsights/mltree/_tree_digitize.pyx create mode 100644 mlinsights/mltree/tree_digitize.py diff --git a/.gitignore b/.gitignore index 736bc0ab..5dbb4522 100644 --- a/.gitignore +++ b/.gitignore @@ -285,3 +285,4 @@ _unittests/unittests.out _doc/notebooks/explore/simages/* _unittests/ut_mlbatch/cache__2/ _doc/sphinxdoc/source/_temp_custom_run_script* +mlinsights/mltree/_tree_digitize.c diff --git a/_unittests/ut_mlmodel/test_quantile_regression.py b/_unittests/ut_mlmodel/test_quantile_regression.py index 08103342..4b6d40b3 100644 --- a/_unittests/ut_mlmodel/test_quantile_regression.py +++ b/_unittests/ut_mlmodel/test_quantile_regression.py @@ -33,7 +33,7 @@ def test_quantile_regression_no_intercept(self): self.assertEqualArray(clr.intercept_, clq.intercept_) @unittest.skipIf( - compare_module_version(sklver,"0.24") == -1, + compare_module_version(sklver, "0.24") == -1, reason="positive was introduce in 0.24") def test_quantile_regression_no_intercept_positive(self): X = numpy.array([[0.1, 0.2], [0.2, 0.3]]) @@ -64,7 +64,7 @@ def test_quantile_regression_intercept(self): self.assertEqualArray(clr.coef_, clq.coef_) @unittest.skipIf( - compare_module_version(sklver,"0.24") == -1, + compare_module_version(sklver, "0.24") == -1, reason="positive was introduce in 0.24") def test_quantile_regression_intercept_positive(self): X = numpy.array([[0.1, 0.2], [0.2, 0.3], [0.3, 0.3]]) diff --git a/_unittests/ut_mltree/test_tree_digitize.py b/_unittests/ut_mltree/test_tree_digitize.py new file mode 100644 index 00000000..cb8eb488 --- /dev/null +++ b/_unittests/ut_mltree/test_tree_digitize.py @@ -0,0 +1,50 @@ +# -*- coding: utf-8 -*- +""" +@brief test log(time=2s) +""" +import unittest +import numpy +from sklearn.tree import DecisionTreeRegressor, export_text +from sklearn.tree._tree import TREE_UNDEFINED +from pyquickhelper.pycode import ExtTestCase +from mlinsights.mltree import digitize2tree + + +class TestTreeDigitize(ExtTestCase): + + def test_cst(self): + self.assertEqual(TREE_UNDEFINED, -2) + + def test_tree_digitize1(self): + x = numpy.array([0.2, 6.4, 3.0, 1.6]) + bins = numpy.array([1.0]) + expected = numpy.digitize(x, bins, right=True) + tree = digitize2tree(bins, right=True) + self.assertIsInstance(tree, DecisionTreeRegressor) + pred = tree.predict(x.reshape((-1, 1))) + self.assertEqualArray(expected, pred) + + def test_tree_digitize2(self): + x = numpy.array([0.2, 6.4, 3.0, 1.6]) + bins = numpy.array([1.0, 2.0]) + expected = numpy.digitize(x, bins, right=True) + tree = digitize2tree(bins, right=True) + print("".join(export_text(tree, feature_names=['f0']))) + print("A") + pred = tree.predict(x.reshape((-1, 1))) + print("B") + self.assertEqualArray(expected, pred) + + x = numpy.array([0.2, 6.4, 3.0, 1.6]) + bins = numpy.array([0.0, 1.0, 2.5, 4.0, 10.0]) + expected = numpy.digitize(x, bins, right=True) + tree = digitize2tree(bins, right=True) + print("A") + pred = tree.predict(x.reshape((-1, 1))) + print("B") + self.assertEqualArray(expected, pred) + + +if __name__ == "__main__": + # TestTreeDigitize().test_tree_digitize2() + unittest.main() diff --git a/mlinsights/mltree/__init__.py b/mlinsights/mltree/__init__.py index 7afb368e..c3a953ab 100644 --- a/mlinsights/mltree/__init__.py +++ b/mlinsights/mltree/__init__.py @@ -2,4 +2,7 @@ @file @brief Shortcuts to *mltree*. """ -from .tree_structure import tree_leave_index, tree_node_range, tree_leave_neighbors, predict_leaves +from .tree_digitize import digitize2tree +from .tree_structure import ( + tree_leave_index, tree_node_range, tree_leave_neighbors, + predict_leaves) diff --git a/mlinsights/mltree/_tree_digitize.pyx b/mlinsights/mltree/_tree_digitize.pyx new file mode 100644 index 00000000..9af8bc4f --- /dev/null +++ b/mlinsights/mltree/_tree_digitize.pyx @@ -0,0 +1,45 @@ +""" +@file +@brief Access to the C API of scikit-learn (decision tree) +""" +from libc.stdio cimport printf + +import numpy +cimport numpy +numpy.import_array() + +ctypedef numpy.npy_intp SIZE_t + +from sklearn.tree._tree cimport Tree + +TREE_LEAF = -1 +TREE_UNDEFINED = -2 + + +cdef void _tree_add_node(Tree tree, + SIZE_t parent, bint is_left, bint is_leaf, + SIZE_t feature, double threshold, double impurity, + SIZE_t n_node_samples, + double weighted_n_node_samples): + if parent == -1: + parent = TREE_UNDEFINED + tree._add_node(parent, is_left, is_leaf, feature, threshold, impurity, + n_node_samples, weighted_n_node_samples) + + +def tree_add_node(tree, parent, is_left, is_leaf, feature, threshold, + impurity, n_node_samples, weighted_n_node_samples): + """ + Adds a node to tree. + + :param parent: parent index (-1 for the root) + :param is_left: is left node? + :param is_leaf: is leave? + :param feature: feature index + :param threshold: threshold (or value) + :param impurity: impurity + :param n_node_samples: number of samples this node represents + :param weighted_n_node_samples: node weight + """ + _tree_add_node(tree, parent, is_left, is_leaf, feature, threshold, + impurity, n_node_samples, weighted_n_node_samples) diff --git a/mlinsights/mltree/tree_digitize.py b/mlinsights/mltree/tree_digitize.py new file mode 100644 index 00000000..cc85453d --- /dev/null +++ b/mlinsights/mltree/tree_digitize.py @@ -0,0 +1,71 @@ +""" +@file +@brief Helpers to investigate a tree structure. +""" +import numpy +from sklearn.tree._tree import Tree # pylint: disable=E0611 +from sklearn.tree import DecisionTreeRegressor +from ._tree_digitize import tree_add_node + + +def digitize2tree(bins, right=False): + """ + Builds a decision tree which returns the same result as + `lambda x: numpy.digitize(x, bins, right=right)` + (see :epkg:`numpy:digitize`). + + :param bins: array of bins. It has to be 1-dimensional and monotonic. + :param right: Indicating whether the intervals include the right + or the left bin edge. Default behavior is (right==False) + indicating that the interval does not include the right edge. + The left bin end is open in this case, i.e., + `bins[i-1] <= x < bins[i]` is the default behavior for + monotonically increasing bins. + :return: decision tree + """ + if not right: + raise NotImplementedError("right must be true") + tree = Tree(1, numpy.array([1], dtype=numpy.intp), 1) + ascending = len(bins) <= 1 or bins[0] < bins[1] + if not ascending: + raise NotImplementedError("ascending must be true") + values = [] + + def add_root(index): + if index < 0 or index >= len(bins): + raise IndexError( # pragma: no cover + "Unexpected index %d / len(bins)=%d." % ( + index, len(bins))) + parent = -1 + is_left = False + is_leaf = False + threshold = bins[index] + tree_add_node(tree, parent, is_left, is_leaf, 0, threshold, 0, 1, 1.) + values.append(numpy.nan) + + def add_nodes(parent, i, j, is_left): + # add for bins[i:j] (j excluded) + if i == j == 0: + # leaf + tree_add_node(tree, parent, is_left, True, 0, 0, 0, 1, 1.) + values.append(0) + elif i + 1 == j: + # leaf + threshold = j if ascending else i + values.append(threshold) + tree_add_node(tree, parent, is_left, True, 0, 0, 0, 1, 1.) + else: + raise NotImplementedError("i=%r j=%r" % (i, j)) + + index = len(bins) // 2 + add_root(index) + add_nodes(0, 0, index, True) + add_nodes(0, index, len(bins), False) + + cl = DecisionTreeRegressor() + cl.tree_ = tree + cl.tree_.value[:, 0, 0] = numpy.array(values, dtype=numpy.float64) + cl.n_outputs = 1 + cl.n_outputs_ = 1 + cl.n_features_in_ = 1 + return cl diff --git a/setup.py b/setup.py index a9e91581..b84bf5a3 100644 --- a/setup.py +++ b/setup.py @@ -64,20 +64,22 @@ def get_extensions(): "piecewise_tree_regression_criterion", "piecewise_tree_regression_criterion_linear", "piecewise_tree_regression_criterion_fast", + "_tree_digitize", ]) - pattern1 = "mlinsights.mlmodel.%s" + pattern1 = "mlinsights.%s.%s" import numpy for name in extensions: + folder = "mltree" if name == "_tree_digitize" else "mlmodel" if isinstance(name, tuple): - m = Extension(pattern1 % name[0], - ['mlinsights/mlmodel/%s.pyx' % name[1]], + m = Extension(pattern1 % (folder, name[0]), + ['mlinsights/%s/%s.pyx' % (folder, name[1])], include_dirs=[numpy.get_include()], extra_compile_args=["-O3"], language='c') else: - m = Extension(pattern1 % name, - ['mlinsights/mlmodel/%s.pyx' % name], + m = Extension(pattern1 % (folder, name), + ['mlinsights/%s/%s.pyx' % (folder, name)], include_dirs=[numpy.get_include()], extra_compile_args=["-O3"], language='c') From a8fbd56572773e1be5f043cc1c6be4720658cd95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?xavier=20dupr=C3=A9?= Date: Sat, 21 Aug 2021 13:22:35 +0200 Subject: [PATCH 2/8] documentation --- mlinsights/mltree/tree_digitize.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mlinsights/mltree/tree_digitize.py b/mlinsights/mltree/tree_digitize.py index cc85453d..6bdc916f 100644 --- a/mlinsights/mltree/tree_digitize.py +++ b/mlinsights/mltree/tree_digitize.py @@ -1,6 +1,8 @@ """ @file @brief Helpers to investigate a tree structure. + +.. versionadded:: 0.4 """ import numpy from sklearn.tree._tree import Tree # pylint: disable=E0611 @@ -22,6 +24,8 @@ def digitize2tree(bins, right=False): `bins[i-1] <= x < bins[i]` is the default behavior for monotonically increasing bins. :return: decision tree + + .. versionadded:: 0.4 """ if not right: raise NotImplementedError("right must be true") From d31400f68a419d95c46307cdce54c4ac61f6eee3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?xavier=20dupr=C3=A9?= Date: Sun, 22 Aug 2021 00:34:30 +0200 Subject: [PATCH 3/8] fix tree digitize --- _unittests/ut_mltree/test_tree_digitize.py | 60 ++++++++++++++-- mlinsights/mltree/_tree_digitize.pyx | 23 +++--- mlinsights/mltree/tree_digitize.py | 83 ++++++++++++++++++---- 3 files changed, 135 insertions(+), 31 deletions(-) diff --git a/_unittests/ut_mltree/test_tree_digitize.py b/_unittests/ut_mltree/test_tree_digitize.py index cb8eb488..88bd4fd6 100644 --- a/_unittests/ut_mltree/test_tree_digitize.py +++ b/_unittests/ut_mltree/test_tree_digitize.py @@ -15,6 +15,14 @@ class TestTreeDigitize(ExtTestCase): def test_cst(self): self.assertEqual(TREE_UNDEFINED, -2) + def test_exc(self): + bins = numpy.array([0.0, 1.0]) + self.assertRaise(lambda: digitize2tree(bins, right=False), + RuntimeError) + bins = numpy.array([1.0, 0.0]) + self.assertRaise(lambda: digitize2tree(bins, right=False), + RuntimeError) + def test_tree_digitize1(self): x = numpy.array([0.2, 6.4, 3.0, 1.6]) bins = numpy.array([1.0]) @@ -23,28 +31,66 @@ def test_tree_digitize1(self): self.assertIsInstance(tree, DecisionTreeRegressor) pred = tree.predict(x.reshape((-1, 1))) self.assertEqualArray(expected, pred) + expected = numpy.digitize(bins, bins, right=True) + pred = tree.predict(bins.reshape((-1, 1))) + self.assertEqualArray(expected, pred) def test_tree_digitize2(self): x = numpy.array([0.2, 6.4, 3.0, 1.6]) bins = numpy.array([1.0, 2.0]) expected = numpy.digitize(x, bins, right=True) tree = digitize2tree(bins, right=True) - print("".join(export_text(tree, feature_names=['f0']))) - print("A") pred = tree.predict(x.reshape((-1, 1))) - print("B") + self.assertEqualArray(expected, pred) + expected = numpy.digitize(bins, bins, right=True) + pred = tree.predict(bins.reshape((-1, 1))) self.assertEqualArray(expected, pred) + def test_tree_digitize3(self): x = numpy.array([0.2, 6.4, 3.0, 1.6]) - bins = numpy.array([0.0, 1.0, 2.5, 4.0, 10.0]) + bins = numpy.array([1.0, 2.0, 3.5]) expected = numpy.digitize(x, bins, right=True) tree = digitize2tree(bins, right=True) - print("A") pred = tree.predict(x.reshape((-1, 1))) - print("B") + self.assertEqualArray(expected, pred) + expected = numpy.digitize(bins, bins, right=True) + pred = tree.predict(bins.reshape((-1, 1))) + self.assertEqualArray(expected, pred) + + def test_tree_digitize4(self): + x = numpy.array([0.2, 6.4, 3.0, 1.6]) + bins = numpy.array([0.0, 1.0, 2.5, 4.0]) + expected = numpy.digitize(x, bins, right=True) + tree = digitize2tree(bins, right=True) + pred = tree.predict(x.reshape((-1, 1))) + self.assertEqualArray(expected, pred) + expected = numpy.digitize(bins, bins, right=True) + pred = tree.predict(bins.reshape((-1, 1))) + self.assertEqualArray(expected, pred) + + def test_tree_digitize5(self): + x = numpy.array([0.2, 6.4, 3.0, 1.6]) + bins = numpy.array([0.0, 1.0, 2.5, 4.0, 7.0]) + expected = numpy.digitize(x, bins, right=True) + tree = digitize2tree(bins, right=True) + pred = tree.predict(x.reshape((-1, 1))) + self.assertEqualArray(expected, pred) + expected = numpy.digitize(bins, bins, right=True) + pred = tree.predict(bins.reshape((-1, 1))) + self.assertEqualArray(expected, pred) + + def test_tree_digitize5_false(self): + x = numpy.array([0.2, 6.4, 3.0, 1.6]) + bins = numpy.array([0.0, 1.0, 2.5, 4.0, 7.0]) + bins[:] = bins[::-1].copy() + expected = numpy.digitize(x, bins, right=True) + tree = digitize2tree(bins, right=True) + pred = tree.predict(x.reshape((-1, 1))) + self.assertEqualArray(expected, pred) + expected = numpy.digitize(bins, bins, right=True) + pred = tree.predict(bins.reshape((-1, 1))) self.assertEqualArray(expected, pred) if __name__ == "__main__": - # TestTreeDigitize().test_tree_digitize2() unittest.main() diff --git a/mlinsights/mltree/_tree_digitize.pyx b/mlinsights/mltree/_tree_digitize.pyx index 9af8bc4f..08610d89 100644 --- a/mlinsights/mltree/_tree_digitize.pyx +++ b/mlinsights/mltree/_tree_digitize.pyx @@ -16,15 +16,20 @@ TREE_LEAF = -1 TREE_UNDEFINED = -2 -cdef void _tree_add_node(Tree tree, - SIZE_t parent, bint is_left, bint is_leaf, - SIZE_t feature, double threshold, double impurity, - SIZE_t n_node_samples, - double weighted_n_node_samples): +cdef SIZE_t _tree_add_node(Tree tree, + SIZE_t parent, + bint is_left, + bint is_leaf, + SIZE_t feature, + double threshold, + double impurity, + SIZE_t n_node_samples, + double weighted_n_node_samples): if parent == -1: parent = TREE_UNDEFINED - tree._add_node(parent, is_left, is_leaf, feature, threshold, impurity, - n_node_samples, weighted_n_node_samples) + return tree._add_node(parent, is_left, is_leaf, feature, + threshold, impurity, + n_node_samples, weighted_n_node_samples) def tree_add_node(tree, parent, is_left, is_leaf, feature, threshold, @@ -41,5 +46,5 @@ def tree_add_node(tree, parent, is_left, is_leaf, feature, threshold, :param n_node_samples: number of samples this node represents :param weighted_n_node_samples: node weight """ - _tree_add_node(tree, parent, is_left, is_leaf, feature, threshold, - impurity, n_node_samples, weighted_n_node_samples) + return _tree_add_node(tree, parent, is_left, is_leaf, feature, threshold, + impurity, n_node_samples, weighted_n_node_samples) diff --git a/mlinsights/mltree/tree_digitize.py b/mlinsights/mltree/tree_digitize.py index 6bdc916f..adcbff46 100644 --- a/mlinsights/mltree/tree_digitize.py +++ b/mlinsights/mltree/tree_digitize.py @@ -28,12 +28,22 @@ def digitize2tree(bins, right=False): .. versionadded:: 0.4 """ if not right: - raise NotImplementedError("right must be true") - tree = Tree(1, numpy.array([1], dtype=numpy.intp), 1) + raise RuntimeError( + "right must be True not right=%r" % right) ascending = len(bins) <= 1 or bins[0] < bins[1] + if not ascending: - raise NotImplementedError("ascending must be true") + bins2 = bins[::-1] + cl = digitize2tree(bins2, right=right) + n = len(bins) + for i in range(cl.tree_.value.shape[0]): + cl.tree_.value[i, 0, 0] = n - cl.tree_.value[i, 0, 0] + return cl + + tree = Tree(1, numpy.array([1], dtype=numpy.intp), 1) values = [] + UNUSED = numpy.nan + n_nodes = [] def add_root(index): if index < 0 or index >= len(bins): @@ -44,22 +54,65 @@ def add_root(index): is_left = False is_leaf = False threshold = bins[index] - tree_add_node(tree, parent, is_left, is_leaf, 0, threshold, 0, 1, 1.) - values.append(numpy.nan) + n = tree_add_node( + tree, parent, is_left, is_leaf, 0, threshold, 0, 1, 1.) + values.append(UNUSED) + n_nodes.append(n) + return n def add_nodes(parent, i, j, is_left): # add for bins[i:j] (j excluded) - if i == j == 0: - # leaf - tree_add_node(tree, parent, is_left, True, 0, 0, 0, 1, 1.) - values.append(0) - elif i + 1 == j: - # leaf - threshold = j if ascending else i - values.append(threshold) - tree_add_node(tree, parent, is_left, True, 0, 0, 0, 1, 1.) + if is_left: + # it means j is the parent split + if i == j: + # leaf + n = tree_add_node(tree, parent, is_left, True, 0, 0, 0, 1, 1.) + n_nodes.append(n) + values.append(i) + return n + if i + 1 == j: + # split + values.append(UNUSED) + th = bins[i] + n = tree_add_node(tree, parent, is_left, + False, 0, th, 0, 1, 1.) + n_nodes.append(n) + add_nodes(n, i, i, True) + add_nodes(n, i, j, False) + return n + if i + 1 < j: + # split + values.append(UNUSED) + index = (i + j) // 2 + th = bins[index] + n = tree_add_node(tree, parent, is_left, + False, 0, th, 0, 1, 1.) + n_nodes.append(n) + add_nodes(n, i, index, True) + add_nodes(n, index, j, False) + return n else: - raise NotImplementedError("i=%r j=%r" % (i, j)) + # it means i is the parent split + if i + 1 == j: + # leaf + values.append(j) + n = tree_add_node(tree, parent, is_left, True, 0, 0, 0, 1, 1.) + n_nodes.append(n) + return n + if i + 1 < j: + # split + values.append(UNUSED) + index = (i + j) // 2 + th = bins[index] + n = tree_add_node(tree, parent, is_left, + False, 0, th, 0, 1, 1.) + n_nodes.append(n) + add_nodes(n, i, index, True) + add_nodes(n, index, j, False) + return n + raise NotImplementedError( # pragma: no cover + "Unexpected case where i=%r, j=%r, is_left=%r." % ( + i, j, is_left)) index = len(bins) // 2 add_root(index) From 95afdd5ce3a13aab56522634462c10c7afc5f436 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?xavier=20dupr=C3=A9?= Date: Sun, 22 Aug 2021 00:45:59 +0200 Subject: [PATCH 4/8] lint --- _unittests/ut_mltree/test_tree_digitize.py | 2 +- mlinsights/mltree/tree_digitize.py | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/_unittests/ut_mltree/test_tree_digitize.py b/_unittests/ut_mltree/test_tree_digitize.py index 88bd4fd6..434f5bb7 100644 --- a/_unittests/ut_mltree/test_tree_digitize.py +++ b/_unittests/ut_mltree/test_tree_digitize.py @@ -4,7 +4,7 @@ """ import unittest import numpy -from sklearn.tree import DecisionTreeRegressor, export_text +from sklearn.tree import DecisionTreeRegressor from sklearn.tree._tree import TREE_UNDEFINED from pyquickhelper.pycode import ExtTestCase from mlinsights.mltree import digitize2tree diff --git a/mlinsights/mltree/tree_digitize.py b/mlinsights/mltree/tree_digitize.py index adcbff46..0c183f40 100644 --- a/mlinsights/mltree/tree_digitize.py +++ b/mlinsights/mltree/tree_digitize.py @@ -7,7 +7,7 @@ import numpy from sklearn.tree._tree import Tree # pylint: disable=E0611 from sklearn.tree import DecisionTreeRegressor -from ._tree_digitize import tree_add_node +from ._tree_digitize import tree_add_node # pylint: disable=E0611 def digitize2tree(bins, right=False): @@ -124,5 +124,10 @@ def add_nodes(parent, i, j, is_left): cl.tree_.value[:, 0, 0] = numpy.array(values, dtype=numpy.float64) cl.n_outputs = 1 cl.n_outputs_ = 1 - cl.n_features_in_ = 1 + try: + # scikit-learn >= 0.24 + cl.n_features_in_ = 1 + except AttributeError: + # scikit-learn < 0.24 + cl.n_features_ = 1 return cl From 4614868a428774133092375639c6d8bd8f4129ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?xavier=20dupr=C3=A9?= Date: Sun, 22 Aug 2021 01:06:06 +0200 Subject: [PATCH 5/8] lint --- _unittests/ut_mlbatch/test_pipeline_cache.py | 3 --- _unittests/ut_mltree/test_tree_digitize.py | 6 +++++- mlinsights/mlmodel/sklearn_testing.py | 2 +- mlinsights/plotting/visualize.py | 2 +- mlinsights/sklapi/sklearn_base_transform_learner.py | 2 +- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/_unittests/ut_mlbatch/test_pipeline_cache.py b/_unittests/ut_mlbatch/test_pipeline_cache.py index da0e3d3d..df625a6c 100644 --- a/_unittests/ut_mlbatch/test_pipeline_cache.py +++ b/_unittests/ut_mlbatch/test_pipeline_cache.py @@ -121,9 +121,6 @@ def test_grid_search_model(self): def test_clone_with_fitted_parameters(self): X, y = make_classification(random_state=42) - param_grid = {'pca__n_components': [2, 3], - 'pca__whiten': [True, False], - 'lr__fit_intercept': [True, False]} pipe = Pipeline([('pca', PCA(2)), ('lr', LogisticRegression())]) pipe.fit(X, y) diff --git a/_unittests/ut_mltree/test_tree_digitize.py b/_unittests/ut_mltree/test_tree_digitize.py index 434f5bb7..7f9430a8 100644 --- a/_unittests/ut_mltree/test_tree_digitize.py +++ b/_unittests/ut_mltree/test_tree_digitize.py @@ -5,13 +5,17 @@ import unittest import numpy from sklearn.tree import DecisionTreeRegressor -from sklearn.tree._tree import TREE_UNDEFINED +try: + from sklearn.tree._tree import TREE_UNDEFINED # pylint: disable=E0611 +except ImportError: + TREE_UNDEFINED = None from pyquickhelper.pycode import ExtTestCase from mlinsights.mltree import digitize2tree class TestTreeDigitize(ExtTestCase): + @unittest.skipIf(TREE_UNDEFINED is None, reason="nothing to test") def test_cst(self): self.assertEqual(TREE_UNDEFINED, -2) diff --git a/mlinsights/mlmodel/sklearn_testing.py b/mlinsights/mlmodel/sklearn_testing.py index ca6c9287..6d5c4537 100644 --- a/mlinsights/mlmodel/sklearn_testing.py +++ b/mlinsights/mlmodel/sklearn_testing.py @@ -147,7 +147,7 @@ def test_sklearn_clone(fct_model, ext=None, copy_fitted=False): else: try: ext.assertEqual(p1[k], p2[k]) - except AssertionError as e: # pragma no cover + except AssertionError: # pragma no cover raise AssertionError( # pylint: disable=W0707 "Difference for key '{0}'\n==1 {1}\n==2 {2}".format( k, p1[k], p2[k])) diff --git a/mlinsights/plotting/visualize.py b/mlinsights/plotting/visualize.py index 231d0180..4265aea2 100644 --- a/mlinsights/plotting/visualize.py +++ b/mlinsights/plotting/visualize.py @@ -238,7 +238,7 @@ def pipeline2dot(pipe, data, **params): options.update(params) exp = ["digraph{"] - for opt in {'orientation', 'pad', 'nodesep', 'ranksep'}: + for opt in ['orientation', 'pad', 'nodesep', 'ranksep']: if opt in options: exp.append(" {}={};".format(opt, options[opt])) fontsize = 8 diff --git a/mlinsights/sklapi/sklearn_base_transform_learner.py b/mlinsights/sklapi/sklearn_base_transform_learner.py index 7d536c15..0123ca67 100644 --- a/mlinsights/sklapi/sklearn_base_transform_learner.py +++ b/mlinsights/sklapi/sklearn_base_transform_learner.py @@ -80,7 +80,7 @@ def __init__(self, model=None, method=None, **kwargs): if model is None: raise ValueError("value cannot be None") # pragma: no cover if method is None: - for name in {'predict_proba', 'predict', 'transform'}: + for name in ['predict_proba', 'predict', 'transform']: if hasattr(model.__class__, name): method = name if method is None: From f2f88c7bee5f33fa38d869a262ca9d0fc3f098f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?xavier=20dupr=C3=A9?= Date: Sun, 22 Aug 2021 12:40:17 +0200 Subject: [PATCH 6/8] documentation and example --- _doc/examples/plot_digitize.py | 120 +++++++++++++++++++++++++++++ _doc/sphinxdoc/source/api/tree.rst | 11 +++ mlinsights/mltree/tree_digitize.py | 26 +++++++ requirements.txt | 3 + 4 files changed, 160 insertions(+) create mode 100644 _doc/examples/plot_digitize.py diff --git a/_doc/examples/plot_digitize.py b/_doc/examples/plot_digitize.py new file mode 100644 index 00000000..40a403c4 --- /dev/null +++ b/_doc/examples/plot_digitize.py @@ -0,0 +1,120 @@ +""" + +.. _l-example-digitize + +======================== +numpy.digitize as a tree +======================== + +.. index:: digitize, decision tree, onnx, onnxruntime + +Function :epkg:`numpy:digitize` transforms a real variable +into a discrete one by returning the buckets the variable +falls into. This bucket can be efficiently retrieved by doing a +binary search over the bins. That's equivalent to decision tree. +Function :func:`digitize2tree +`. + +.. contents:: + :local: + +Simple example +============== +""" +import warnings +import numpy +from pandas import DataFrame, pivot_table +import matplotlib.pyplot as plt +from onnxruntime import InferenceSession +from sklearn.tree import export_text +from skl2onnx import to_onnx +from cpyquickhelper.numbers.speed_measure import measure_time +from mlinsights.mltree import digitize2tree +from tqdm import tqdm + +x = numpy.array([0.2, 6.4, 3.0, 1.6]) +bins = numpy.array([0.0, 1.0, 2.5, 4.0, 7.0]) +expected = numpy.digitize(x, bins, right=True) +tree = digitize2tree(bins, right=True) +pred = tree.predict(x.reshape((-1, 1))) +print(expected, pred) + +########################################## +# The tree looks like the following. +print(export_text(tree, feature_names=['x'])) + +####################################### +# Benchmark +# ========= +# +# Let's measure the processing time. *numpy* should be +# much faster than *scikit-learn* as it is adding many verification. +# However, the benchmark also includes a conversion of the tree into +# ONNX and measure the processing time with :epkg:`onnxruntime`. + +obs = [] + +for shape in tqdm([1, 10, 100, 1000, 10000, 100000]): + x = numpy.random.random(shape).astype(numpy.float32) + if shape < 1000: + repeat = number = 100 + else: + repeat = number = 10 + for n_bins in [1, 10, 100]: + bins = (numpy.arange(n_bins) / n_bins).astype(numpy.float32) + + ti = measure_time( + "numpy.digitize(x, bins, right=True)", + context={'numpy': numpy, "x": x, "bins": bins}, + div_by_number=True, repeat=repeat, number=number) + ti['name'] = 'numpy' + ti['n_bins'] = n_bins + ti['shape'] = shape + obs.append(ti) + + tree = digitize2tree(bins, right=True) + + ti = measure_time( + "tree.predict(x)", + context={'numpy': numpy, "x": x.reshape((-1, 1)), "tree": tree}, + div_by_number=True, repeat=repeat, number=number) + ti['name'] = 'sklearn' + ti['n_bins'] = n_bins + ti['shape'] = shape + obs.append(ti) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=FutureWarning) + onx = to_onnx(tree, x.reshape((-1, 1))) + + sess = InferenceSession(onx.SerializeToString()) + + ti = measure_time( + "sess.run(None, {'X': x})", + context={'numpy': numpy, "x": x.reshape((-1, 1)), "sess": sess}, + div_by_number=True, repeat=repeat, number=number) + ti['name'] = 'ort' + ti['n_bins'] = n_bins + ti['shape'] = shape + obs.append(ti) + + +df = DataFrame(obs) +piv = pivot_table(data=df, index="shape", columns=["n_bins", "name"], + values=["average"]) +print(piv) + +########################################## +# Plotting +# ======== + +n_bins = list(sorted(set(df.n_bins))) +fig, ax = plt.subplots(1, len(n_bins), figsize=(14, 4)) + +for i, nb in enumerate(n_bins): + piv = pivot_table(data=df[df.n_bins == nb], index="shape", + columns=["name"], + values=["average"]) + piv.plot(title="Benchmark digitize / onnxruntime\nn_bins=%d" % nb, + logx=True, logy=True, ax=ax[i]) +plt.show() diff --git a/_doc/sphinxdoc/source/api/tree.rst b/_doc/sphinxdoc/source/api/tree.rst index ac7442b1..b1da38b2 100644 --- a/_doc/sphinxdoc/source/api/tree.rst +++ b/_doc/sphinxdoc/source/api/tree.rst @@ -2,6 +2,12 @@ Trees ===== +.. contents:: + :local: + +Digging into the tree structure ++++++++++++++++++++++++++++++++ + .. autosignature:: mlinsights.mltree.tree_structure.predict_leaves .. autosignature:: mlinsights.mltree.tree_structure.tree_find_common_node @@ -15,3 +21,8 @@ Trees .. autosignature:: mlinsights.mltree.tree_structure.tree_leave_index .. autosignature:: mlinsights.mltree.tree_structure.tree_leave_neighbors + +Experiments, exercise ++++++++++++++++++++++ + +.. autosignature:: mlinsights.mltree.tree_digitize.digitize2tree diff --git a/mlinsights/mltree/tree_digitize.py b/mlinsights/mltree/tree_digitize.py index 0c183f40..e7be511c 100644 --- a/mlinsights/mltree/tree_digitize.py +++ b/mlinsights/mltree/tree_digitize.py @@ -25,6 +25,32 @@ def digitize2tree(bins, right=False): monotonically increasing bins. :return: decision tree + .. note:: + The implementation of decision trees in :epkg:`scikit-learn` + only allows one type of decision (`<=`). That's why the + function throws an exception when `right=False`. However, + this could be overcome by using :epkg:`ONNX` where all + kind of decision rules are implemented. + + The following example shows what the tree looks like. + + .. runpython:: + :showcode: + + import numpy + from sklearn.tree import export_text + from mlinsights.mltree import digitize2tree + + x = numpy.array([0.2, 6.4, 3.0, 1.6]) + bins = numpy.array([0.0, 1.0, 2.5, 4.0, 7.0]) + expected = numpy.digitize(x, bins, right=True) + tree = digitize2tree(bins, right=True) + pred = tree.predict(x.reshape((-1, 1))) + print("Comparison with numpy:") + print(expected, pred) + print("Tree:") + print(export_text(tree, feature_names=['x'])) + .. versionadded:: 0.4 """ if not right: diff --git a/requirements.txt b/requirements.txt index 259348c9..86d6dab1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,8 @@ matplotlib memory_profiler>=0.55 nbconvert>=6.0.2 numpy +onnx +onnxruntime pandas_streaming pybind11 pycodestyle @@ -19,6 +21,7 @@ pylint>=2.9.3 scikit-learn>=0.22.1 scipy seaborn +skl2onnx sphinx>=3.0 sphinxcontrib.imagesvg sphinx_gallery From 305e60a2f31cf20c6549d5572f7435b932b6a3d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?xavier=20dupr=C3=A9?= Date: Sun, 22 Aug 2021 13:07:25 +0200 Subject: [PATCH 7/8] documentation --- mlinsights/mltree/tree_digitize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mlinsights/mltree/tree_digitize.py b/mlinsights/mltree/tree_digitize.py index e7be511c..050b47bf 100644 --- a/mlinsights/mltree/tree_digitize.py +++ b/mlinsights/mltree/tree_digitize.py @@ -30,7 +30,8 @@ def digitize2tree(bins, right=False): only allows one type of decision (`<=`). That's why the function throws an exception when `right=False`. However, this could be overcome by using :epkg:`ONNX` where all - kind of decision rules are implemented. + kind of decision rules are implemented. Default value for + right is still *False* to follow *numpy* API. The following example shows what the tree looks like. From 2ddb61b233482d640e6d60fc8aee3986568522ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?xavier=20dupr=C3=A9?= Date: Sun, 22 Aug 2021 13:14:05 +0200 Subject: [PATCH 8/8] documentation --- mlinsights/mltree/tree_digitize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mlinsights/mltree/tree_digitize.py b/mlinsights/mltree/tree_digitize.py index 050b47bf..5d91c342 100644 --- a/mlinsights/mltree/tree_digitize.py +++ b/mlinsights/mltree/tree_digitize.py @@ -31,7 +31,8 @@ def digitize2tree(bins, right=False): function throws an exception when `right=False`. However, this could be overcome by using :epkg:`ONNX` where all kind of decision rules are implemented. Default value for - right is still *False* to follow *numpy* API. + right is still *False* to follow *numpy* API even though + this value raises an exception in *digitize2tree*. The following example shows what the tree looks like.