"""

.. _l-example-digitize:

========================
numpy.digitize as a tree
========================

.. index:: digitize, decision tree, onnx, onnxruntime

Function :epkg:`numpy:digitize` transforms a real variable
into a discrete one by returning the index of the bucket the
variable falls into. This bucket can be efficiently retrieved by
doing a binary search over the bins. That is equivalent to a
decision tree. Function :func:`digitize2tree
<mlinsights.mltree.tree_digitize.digitize2tree>` builds that tree.

.. contents::
    :local:

Simple example
==============
"""
import warnings
import numpy
from pandas import DataFrame, pivot_table
import matplotlib.pyplot as plt
from onnxruntime import InferenceSession
from sklearn.tree import export_text
from skl2onnx import to_onnx
from cpyquickhelper.numbers.speed_measure import measure_time
from mlinsights.mltree import digitize2tree
from tqdm import tqdm

x = numpy.array([0.2, 6.4, 3.0, 1.6])
bins = numpy.array([0.0, 1.0, 2.5, 4.0, 7.0])
expected = numpy.digitize(x, bins, right=True)
tree = digitize2tree(bins, right=True)
pred = tree.predict(x.reshape((-1, 1)))
print(expected, pred)

##########################################
# The tree looks like the following.
print(export_text(tree, feature_names=['x']))

#######################################
# Benchmark
# =========
#
# Let's measure the processing time. *numpy* should be
# much faster than *scikit-learn* as the latter adds many verifications.
# However, the benchmark also includes a conversion of the tree into
# ONNX and measures the processing time with :epkg:`onnxruntime`.


def measure(stmt, context, name, n_bins, shape, repeat, number):
    """
    Measures the execution time of *stmt* and tags the result
    with the configuration it was obtained for.

    :param stmt: statement to measure (a string)
    :param context: names the statement needs to run
    :param name: name of the measured implementation
    :param n_bins: number of bins
    :param shape: number of observations
    :param repeat: passed to *measure_time*
    :param number: passed to *measure_time*
    :return: result of *measure_time* completed with
        keys *name*, *n_bins*, *shape*
    """
    res = measure_time(stmt, context=context, div_by_number=True,
                       repeat=repeat, number=number)
    res['name'] = name
    res['n_bins'] = n_bins
    res['shape'] = shape
    return res


obs = []

for shape in tqdm([1, 10, 100, 1000, 10000, 100000]):
    x = numpy.random.random(shape).astype(numpy.float32)
    # scikit-learn and onnxruntime expect a 2D matrix
    x2d = x.reshape((-1, 1))
    # fewer repetitions for the big arrays to keep the example short
    repeat = number = 100 if shape < 1000 else 10
    for n_bins in [1, 10, 100]:
        bins = (numpy.arange(n_bins) / n_bins).astype(numpy.float32)
        conf = dict(n_bins=n_bins, shape=shape,
                    repeat=repeat, number=number)

        obs.append(measure(
            "numpy.digitize(x, bins, right=True)",
            {'numpy': numpy, "x": x, "bins": bins},
            'numpy', **conf))

        tree = digitize2tree(bins, right=True)

        obs.append(measure(
            "tree.predict(x)",
            {'numpy': numpy, "x": x2d, "tree": tree},
            'sklearn', **conf))

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=FutureWarning)
            onx = to_onnx(tree, x2d)

        sess = InferenceSession(onx.SerializeToString())

        obs.append(measure(
            "sess.run(None, {'X': x})",
            {'numpy': numpy, "x": x2d, "sess": sess},
            'ort', **conf))


df = DataFrame(obs)
piv = pivot_table(data=df, index="shape", columns=["n_bins", "name"],
                  values=["average"])
print(piv)

##########################################
# Plotting
# ========

n_bins = sorted(set(df.n_bins))
fig, ax = plt.subplots(1, len(n_bins), figsize=(14, 4))

for i, nb in enumerate(n_bins):
    piv = pivot_table(data=df[df.n_bins == nb], index="shape",
                      columns=["name"],
                      values=["average"])
    piv.plot(title="Benchmark digitize / onnxruntime\nn_bins=%d" % nb,
             logx=True, logy=True, ax=ax[i])
plt.show()
a/_doc/sphinxdoc/source/api/tree.rst b/_doc/sphinxdoc/source/api/tree.rst index ac7442b1..b1da38b2 100644 --- a/_doc/sphinxdoc/source/api/tree.rst +++ b/_doc/sphinxdoc/source/api/tree.rst @@ -2,6 +2,12 @@ Trees ===== +.. contents:: + :local: + +Digging into the tree structure ++++++++++++++++++++++++++++++++ + .. autosignature:: mlinsights.mltree.tree_structure.predict_leaves .. autosignature:: mlinsights.mltree.tree_structure.tree_find_common_node @@ -15,3 +21,8 @@ Trees .. autosignature:: mlinsights.mltree.tree_structure.tree_leave_index .. autosignature:: mlinsights.mltree.tree_structure.tree_leave_neighbors + +Experiments, exercise ++++++++++++++++++++++ + +.. autosignature:: mlinsights.mltree.tree_digitize.digitize2tree diff --git a/_unittests/ut_mlbatch/test_pipeline_cache.py b/_unittests/ut_mlbatch/test_pipeline_cache.py index da0e3d3d..df625a6c 100644 --- a/_unittests/ut_mlbatch/test_pipeline_cache.py +++ b/_unittests/ut_mlbatch/test_pipeline_cache.py @@ -121,9 +121,6 @@ def test_grid_search_model(self): def test_clone_with_fitted_parameters(self): X, y = make_classification(random_state=42) - param_grid = {'pca__n_components': [2, 3], - 'pca__whiten': [True, False], - 'lr__fit_intercept': [True, False]} pipe = Pipeline([('pca', PCA(2)), ('lr', LogisticRegression())]) pipe.fit(X, y) diff --git a/_unittests/ut_mlmodel/test_quantile_regression.py b/_unittests/ut_mlmodel/test_quantile_regression.py index 08103342..4b6d40b3 100644 --- a/_unittests/ut_mlmodel/test_quantile_regression.py +++ b/_unittests/ut_mlmodel/test_quantile_regression.py @@ -33,7 +33,7 @@ def test_quantile_regression_no_intercept(self): self.assertEqualArray(clr.intercept_, clq.intercept_) @unittest.skipIf( - compare_module_version(sklver,"0.24") == -1, + compare_module_version(sklver, "0.24") == -1, reason="positive was introduce in 0.24") def test_quantile_regression_no_intercept_positive(self): X = numpy.array([[0.1, 0.2], [0.2, 0.3]]) @@ -64,7 +64,7 @@ def 
class TestTreeDigitize(ExtTestCase):
    """
    Checks that the tree built by *digitize2tree* returns
    the same buckets as *numpy.digitize*.
    """

    def _check_against_numpy(self, bins, check_type=False):
        # Compares the tree with numpy.digitize on a few points
        # and on the bin edges themselves.
        points = numpy.array([0.2, 6.4, 3.0, 1.6])
        model = digitize2tree(bins, right=True)
        if check_type:
            self.assertIsInstance(model, DecisionTreeRegressor)
        for data in (points, bins):
            self.assertEqualArray(
                numpy.digitize(data, bins, right=True),
                model.predict(data.reshape((-1, 1))))

    @unittest.skipIf(TREE_UNDEFINED is None, reason="nothing to test")
    def test_cst(self):
        self.assertEqual(TREE_UNDEFINED, -2)

    def test_exc(self):
        # right=False is rejected whatever the order of the bins
        for edges in ([0.0, 1.0], [1.0, 0.0]):
            bins = numpy.array(edges)
            self.assertRaise(
                lambda bins=bins: digitize2tree(bins, right=False),
                RuntimeError)

    def test_tree_digitize1(self):
        self._check_against_numpy(numpy.array([1.0]), check_type=True)

    def test_tree_digitize2(self):
        self._check_against_numpy(numpy.array([1.0, 2.0]))

    def test_tree_digitize3(self):
        self._check_against_numpy(numpy.array([1.0, 2.0, 3.5]))

    def test_tree_digitize4(self):
        self._check_against_numpy(numpy.array([0.0, 1.0, 2.5, 4.0]))

    def test_tree_digitize5(self):
        self._check_against_numpy(numpy.array([0.0, 1.0, 2.5, 4.0, 7.0]))

    def test_tree_digitize5_false(self):
        # same bins as test_tree_digitize5 but in decreasing order
        increasing = numpy.array([0.0, 1.0, 2.5, 4.0, 7.0])
        self._check_against_numpy(increasing[::-1].copy())
"__main__": + unittest.main() diff --git a/mlinsights/mlmodel/sklearn_testing.py b/mlinsights/mlmodel/sklearn_testing.py index ca6c9287..6d5c4537 100644 --- a/mlinsights/mlmodel/sklearn_testing.py +++ b/mlinsights/mlmodel/sklearn_testing.py @@ -147,7 +147,7 @@ def test_sklearn_clone(fct_model, ext=None, copy_fitted=False): else: try: ext.assertEqual(p1[k], p2[k]) - except AssertionError as e: # pragma no cover + except AssertionError: # pragma no cover raise AssertionError( # pylint: disable=W0707 "Difference for key '{0}'\n==1 {1}\n==2 {2}".format( k, p1[k], p2[k])) diff --git a/mlinsights/mltree/__init__.py b/mlinsights/mltree/__init__.py index 7afb368e..c3a953ab 100644 --- a/mlinsights/mltree/__init__.py +++ b/mlinsights/mltree/__init__.py @@ -2,4 +2,7 @@ @file @brief Shortcuts to *mltree*. """ -from .tree_structure import tree_leave_index, tree_node_range, tree_leave_neighbors, predict_leaves +from .tree_digitize import digitize2tree +from .tree_structure import ( + tree_leave_index, tree_node_range, tree_leave_neighbors, + predict_leaves) diff --git a/mlinsights/mltree/_tree_digitize.pyx b/mlinsights/mltree/_tree_digitize.pyx new file mode 100644 index 00000000..08610d89 --- /dev/null +++ b/mlinsights/mltree/_tree_digitize.pyx @@ -0,0 +1,50 @@ +""" +@file +@brief Access to the C API of scikit-learn (decision tree) +""" +from libc.stdio cimport printf + +import numpy +cimport numpy +numpy.import_array() + +ctypedef numpy.npy_intp SIZE_t + +from sklearn.tree._tree cimport Tree + +TREE_LEAF = -1 +TREE_UNDEFINED = -2 + + +cdef SIZE_t _tree_add_node(Tree tree, + SIZE_t parent, + bint is_left, + bint is_leaf, + SIZE_t feature, + double threshold, + double impurity, + SIZE_t n_node_samples, + double weighted_n_node_samples): + if parent == -1: + parent = TREE_UNDEFINED + return tree._add_node(parent, is_left, is_leaf, feature, + threshold, impurity, + n_node_samples, weighted_n_node_samples) + + +def tree_add_node(tree, parent, is_left, is_leaf, feature, threshold, + 
def digitize2tree(bins, right=False):
    """
    Builds a decision tree which returns the same result as
    `lambda x: numpy.digitize(x, bins, right=right)`
    (see :epkg:`numpy:digitize`).

    :param bins: array of bins. It has to be 1-dimensional and
        monotonic, increasing or decreasing. Repeated values are
        allowed.
    :param right: indicates whether the intervals include the right
        or the left bin edge. Only `right=True` is supported, which
        means `bins[i-1] < x <= bins[i]` for monotonically
        increasing bins.
    :return: decision tree (a *DecisionTreeRegressor*)
    :raises RuntimeError: if *right* is False
    :raises ValueError: if *bins* is not 1-dimensional or
        not monotonic
    :raises IndexError: if *bins* is empty

    .. note::
        The implementation of decision trees in :epkg:`scikit-learn`
        only allows one type of decision (`<=`). That's why the
        function throws an exception when `right=False`. However,
        this could be overcome by using :epkg:`ONNX` where all
        kind of decision rules are implemented. Default value for
        right is still *False* to follow *numpy* API even though
        this value raises an exception in *digitize2tree*.

    The following example shows what the tree looks like.

    .. runpython::
        :showcode:

        import numpy
        from sklearn.tree import export_text
        from mlinsights.mltree import digitize2tree

        x = numpy.array([0.2, 6.4, 3.0, 1.6])
        bins = numpy.array([0.0, 1.0, 2.5, 4.0, 7.0])
        expected = numpy.digitize(x, bins, right=True)
        tree = digitize2tree(bins, right=True)
        pred = tree.predict(x.reshape((-1, 1)))
        print("Comparison with numpy:")
        print(expected, pred)
        print("Tree:")
        print(export_text(tree, feature_names=['x']))

    .. versionadded:: 0.4
    """
    if not right:
        raise RuntimeError(
            "right must be True not right=%r" % right)

    bins = numpy.asarray(bins)
    if bins.ndim != 1:
        raise ValueError(
            "bins must be 1-dimensional not %d-dimensional." % bins.ndim)

    # The direction is decided on the whole array: looking only at the
    # first two values misclassifies bins starting with a repeated value
    # and used to end up in an infinite recursion.
    if not numpy.all(bins[:-1] <= bins[1:]):
        if not numpy.all(bins[:-1] >= bins[1:]):
            raise ValueError(
                "bins must be monotonically increasing or decreasing.")
        # Decreasing bins: build the tree for the reversed (increasing)
        # bins and mirror the bucket indices. Internal nodes hold nan
        # and stay nan.
        cl = digitize2tree(bins[::-1], right=right)
        cl.tree_.value[:, 0, 0] = len(bins) - cl.tree_.value[:, 0, 0]
        return cl

    tree = Tree(1, numpy.array([1], dtype=numpy.intp), 1)
    # values[k] is the value of node k, nodes are appended in the
    # order they are created
    values = []
    UNUSED = numpy.nan

    def add_split(parent, is_left, index):
        # internal node testing x <= bins[index]
        values.append(UNUSED)
        return tree_add_node(
            tree, parent, is_left, False, 0, bins[index], 0, 1, 1.)

    def add_leaf(parent, is_left, value):
        # leaf returning bucket index *value*
        values.append(value)
        return tree_add_node(
            tree, parent, is_left, True, 0, 0, 0, 1, 1.)

    def add_nodes(parent, i, j, is_left):
        # Adds the subtree for bins[i:j] (j excluded).
        # On a left branch, j is the parent split: nothing is left to
        # test once i == j. On a right branch, i is the parent split:
        # nothing is left to test once i + 1 == j.
        # The bucket is j in both cases.
        if (i == j) if is_left else (i + 1 == j):
            return add_leaf(parent, is_left, j)
        if i < j:
            index = (i + j) // 2
            n = add_split(parent, is_left, index)
            add_nodes(n, i, index, True)
            add_nodes(n, index, j, False)
            return n
        raise NotImplementedError(  # pragma: no cover
            "Unexpected case where i=%r, j=%r, is_left=%r." % (
                i, j, is_left))

    index = len(bins) // 2
    if index >= len(bins):
        # only happens when bins is empty
        raise IndexError(  # pragma: no cover
            "Unexpected index %d / len(bins)=%d." % (
                index, len(bins)))
    # -1 tells tree_add_node the node is the root
    root = add_split(-1, False, index)
    add_nodes(root, 0, index, True)
    add_nodes(root, index, len(bins), False)

    cl = DecisionTreeRegressor()
    cl.tree_ = tree
    cl.tree_.value[:, 0, 0] = numpy.array(values, dtype=numpy.float64)
    cl.n_outputs = 1
    cl.n_outputs_ = 1
    cl.n_features_in_ = 1
    try:
        # Older scikit-learn releases read n_features_ at prediction
        # time; some later ones expose it as a read-only property,
        # hence the guard.
        cl.n_features_ = 1
    except AttributeError:
        pass
    return cl
a/mlinsights/sklapi/sklearn_base_transform_learner.py +++ b/mlinsights/sklapi/sklearn_base_transform_learner.py @@ -80,7 +80,7 @@ def __init__(self, model=None, method=None, **kwargs): if model is None: raise ValueError("value cannot be None") # pragma: no cover if method is None: - for name in {'predict_proba', 'predict', 'transform'}: + for name in ['predict_proba', 'predict', 'transform']: if hasattr(model.__class__, name): method = name if method is None: diff --git a/requirements.txt b/requirements.txt index 259348c9..86d6dab1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,8 @@ matplotlib memory_profiler>=0.55 nbconvert>=6.0.2 numpy +onnx +onnxruntime pandas_streaming pybind11 pycodestyle @@ -19,6 +21,7 @@ pylint>=2.9.3 scikit-learn>=0.22.1 scipy seaborn +skl2onnx sphinx>=3.0 sphinxcontrib.imagesvg sphinx_gallery diff --git a/setup.py b/setup.py index a9e91581..b84bf5a3 100644 --- a/setup.py +++ b/setup.py @@ -64,20 +64,22 @@ def get_extensions(): "piecewise_tree_regression_criterion", "piecewise_tree_regression_criterion_linear", "piecewise_tree_regression_criterion_fast", + "_tree_digitize", ]) - pattern1 = "mlinsights.mlmodel.%s" + pattern1 = "mlinsights.%s.%s" import numpy for name in extensions: + folder = "mltree" if name == "_tree_digitize" else "mlmodel" if isinstance(name, tuple): - m = Extension(pattern1 % name[0], - ['mlinsights/mlmodel/%s.pyx' % name[1]], + m = Extension(pattern1 % (folder, name[0]), + ['mlinsights/%s/%s.pyx' % (folder, name[1])], include_dirs=[numpy.get_include()], extra_compile_args=["-O3"], language='c') else: - m = Extension(pattern1 % name, - ['mlinsights/mlmodel/%s.pyx' % name], + m = Extension(pattern1 % (folder, name), + ['mlinsights/%s/%s.pyx' % (folder, name)], include_dirs=[numpy.get_include()], extra_compile_args=["-O3"], language='c')