1 change: 1 addition & 0 deletions .gitignore
@@ -285,3 +285,4 @@ _unittests/unittests.out
_doc/notebooks/explore/simages/*
_unittests/ut_mlbatch/cache__2/
_doc/sphinxdoc/source/_temp_custom_run_script*
mlinsights/mltree/_tree_digitize.c
120 changes: 120 additions & 0 deletions _doc/examples/plot_digitize.py
@@ -0,0 +1,120 @@
"""

.. _l-example-digitize:

========================
numpy.digitize as a tree
========================

.. index:: digitize, decision tree, onnx, onnxruntime

Function :epkg:`numpy:digitize` transforms a real variable
into a discrete one by returning the bucket the variable
falls into. That bucket can be retrieved efficiently with a
binary search over the bins, which is equivalent to a decision tree.
Function :func:`digitize2tree
<mlinsights.mltree.tree_digitize.digitize2tree>` converts the bins
into a ``DecisionTreeRegressor`` predicting the same bucket indices.

.. contents::
:local:

Simple example
==============
"""
import warnings
import numpy
from pandas import DataFrame, pivot_table
import matplotlib.pyplot as plt
from onnxruntime import InferenceSession
from sklearn.tree import export_text
from skl2onnx import to_onnx
from cpyquickhelper.numbers.speed_measure import measure_time
from mlinsights.mltree import digitize2tree
from tqdm import tqdm

x = numpy.array([0.2, 6.4, 3.0, 1.6])
bins = numpy.array([0.0, 1.0, 2.5, 4.0, 7.0])
expected = numpy.digitize(x, bins, right=True)
tree = digitize2tree(bins, right=True)
pred = tree.predict(x.reshape((-1, 1)))
print(expected, pred)
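
##########################################
# A quick equivalence check (a sketch, not part of the original example):
# for increasing bins and ``right=True``, :epkg:`numpy:digitize` amounts to
# a binary search, i.e. ``numpy.searchsorted`` with ``side='left'``.
print(numpy.searchsorted(bins, x, side='left'))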

##########################################
# The tree looks like the following.
print(export_text(tree, feature_names=['x']))
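
##########################################
# A quick look at the raw structure (a sketch, not part of the original
# example): the split thresholds should all be bin edges; leaves are
# marked with -2 (``TREE_UNDEFINED``).
print(tree.tree_.feature)
print(tree.tree_.threshold)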

#######################################
# Benchmark
# =========
#
# Let's measure the processing time. *numpy* should be
# much faster than *scikit-learn* as the latter adds many verifications.
# However, the benchmark also includes a conversion of the tree into
# ONNX and measures the processing time with :epkg:`onnxruntime`.

obs = []

for shape in tqdm([1, 10, 100, 1000, 10000, 100000]):
    x = numpy.random.random(shape).astype(numpy.float32)
    if shape < 1000:
        repeat = number = 100
    else:
        repeat = number = 10
    for n_bins in [1, 10, 100]:
        bins = (numpy.arange(n_bins) / n_bins).astype(numpy.float32)

        ti = measure_time(
            "numpy.digitize(x, bins, right=True)",
            context={'numpy': numpy, "x": x, "bins": bins},
            div_by_number=True, repeat=repeat, number=number)
        ti['name'] = 'numpy'
        ti['n_bins'] = n_bins
        ti['shape'] = shape
        obs.append(ti)

        tree = digitize2tree(bins, right=True)

        ti = measure_time(
            "tree.predict(x)",
            context={'numpy': numpy, "x": x.reshape((-1, 1)), "tree": tree},
            div_by_number=True, repeat=repeat, number=number)
        ti['name'] = 'sklearn'
        ti['n_bins'] = n_bins
        ti['shape'] = shape
        obs.append(ti)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=FutureWarning)
            onx = to_onnx(tree, x.reshape((-1, 1)))

        sess = InferenceSession(onx.SerializeToString())

        ti = measure_time(
            "sess.run(None, {'X': x})",
            context={'numpy': numpy, "x": x.reshape((-1, 1)), "sess": sess},
            div_by_number=True, repeat=repeat, number=number)
        ti['name'] = 'ort'
        ti['n_bins'] = n_bins
        ti['shape'] = shape
        obs.append(ti)


df = DataFrame(obs)
piv = pivot_table(data=df, index="shape", columns=["n_bins", "name"],
                  values=["average"])
print(piv)
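
##########################################
# A rough way to read the table (a sketch, assuming the column layout built
# above): timings relative to *numpy* for each number of bins.
avg = piv["average"]
for nb in [1, 10, 100]:
    print("n_bins=%d" % nb)
    print(avg[nb].div(avg[nb]["numpy"], axis=0))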

##########################################
# Plotting
# ========

n_bins = list(sorted(set(df.n_bins)))
fig, ax = plt.subplots(1, len(n_bins), figsize=(14, 4))

for i, nb in enumerate(n_bins):
    piv = pivot_table(data=df[df.n_bins == nb], index="shape",
                      columns=["name"],
                      values=["average"])
    piv.plot(title="Benchmark digitize / onnxruntime\nn_bins=%d" % nb,
             logx=True, logy=True, ax=ax[i])
plt.show()
11 changes: 11 additions & 0 deletions _doc/sphinxdoc/source/api/tree.rst
@@ -2,6 +2,12 @@
Trees
=====

.. contents::
:local:

Digging into the tree structure
+++++++++++++++++++++++++++++++

.. autosignature:: mlinsights.mltree.tree_structure.predict_leaves

.. autosignature:: mlinsights.mltree.tree_structure.tree_find_common_node
@@ -15,3 +21,8 @@ Trees
.. autosignature:: mlinsights.mltree.tree_structure.tree_leave_index

.. autosignature:: mlinsights.mltree.tree_structure.tree_leave_neighbors

Experiments, exercise
+++++++++++++++++++++

.. autosignature:: mlinsights.mltree.tree_digitize.digitize2tree
3 changes: 0 additions & 3 deletions _unittests/ut_mlbatch/test_pipeline_cache.py
@@ -121,9 +121,6 @@ def test_grid_search_model(self):

    def test_clone_with_fitted_parameters(self):
        X, y = make_classification(random_state=42)
-        param_grid = {'pca__n_components': [2, 3],
-                      'pca__whiten': [True, False],
-                      'lr__fit_intercept': [True, False]}
        pipe = Pipeline([('pca', PCA(2)),
                         ('lr', LogisticRegression())])
        pipe.fit(X, y)
4 changes: 2 additions & 2 deletions _unittests/ut_mlmodel/test_quantile_regression.py
@@ -33,7 +33,7 @@ def test_quantile_regression_no_intercept(self):
        self.assertEqualArray(clr.intercept_, clq.intercept_)

    @unittest.skipIf(
-        compare_module_version(sklver,"0.24") == -1,
+        compare_module_version(sklver, "0.24") == -1,
        reason="positive was introduce in 0.24")
    def test_quantile_regression_no_intercept_positive(self):
        X = numpy.array([[0.1, 0.2], [0.2, 0.3]])
@@ -64,7 +64,7 @@ def test_quantile_regression_intercept(self):
        self.assertEqualArray(clr.coef_, clq.coef_)

    @unittest.skipIf(
-        compare_module_version(sklver,"0.24") == -1,
+        compare_module_version(sklver, "0.24") == -1,
        reason="positive was introduce in 0.24")
    def test_quantile_regression_intercept_positive(self):
        X = numpy.array([[0.1, 0.2], [0.2, 0.3], [0.3, 0.3]])
100 changes: 100 additions & 0 deletions _unittests/ut_mltree/test_tree_digitize.py
@@ -0,0 +1,100 @@
# -*- coding: utf-8 -*-
"""
@brief test log(time=2s)
"""
import unittest
import numpy
from sklearn.tree import DecisionTreeRegressor
try:
    from sklearn.tree._tree import TREE_UNDEFINED  # pylint: disable=E0611
except ImportError:
    TREE_UNDEFINED = None
from pyquickhelper.pycode import ExtTestCase
from mlinsights.mltree import digitize2tree


class TestTreeDigitize(ExtTestCase):

    @unittest.skipIf(TREE_UNDEFINED is None, reason="nothing to test")
    def test_cst(self):
        self.assertEqual(TREE_UNDEFINED, -2)

    def test_exc(self):
        bins = numpy.array([0.0, 1.0])
        self.assertRaise(lambda: digitize2tree(bins, right=False),
                         RuntimeError)
        bins = numpy.array([1.0, 0.0])
        self.assertRaise(lambda: digitize2tree(bins, right=False),
                         RuntimeError)

    def test_tree_digitize1(self):
        x = numpy.array([0.2, 6.4, 3.0, 1.6])
        bins = numpy.array([1.0])
        expected = numpy.digitize(x, bins, right=True)
        tree = digitize2tree(bins, right=True)
        self.assertIsInstance(tree, DecisionTreeRegressor)
        pred = tree.predict(x.reshape((-1, 1)))
        self.assertEqualArray(expected, pred)
        expected = numpy.digitize(bins, bins, right=True)
        pred = tree.predict(bins.reshape((-1, 1)))
        self.assertEqualArray(expected, pred)

    def test_tree_digitize2(self):
        x = numpy.array([0.2, 6.4, 3.0, 1.6])
        bins = numpy.array([1.0, 2.0])
        expected = numpy.digitize(x, bins, right=True)
        tree = digitize2tree(bins, right=True)
        pred = tree.predict(x.reshape((-1, 1)))
        self.assertEqualArray(expected, pred)
        expected = numpy.digitize(bins, bins, right=True)
        pred = tree.predict(bins.reshape((-1, 1)))
        self.assertEqualArray(expected, pred)

    def test_tree_digitize3(self):
        x = numpy.array([0.2, 6.4, 3.0, 1.6])
        bins = numpy.array([1.0, 2.0, 3.5])
        expected = numpy.digitize(x, bins, right=True)
        tree = digitize2tree(bins, right=True)
        pred = tree.predict(x.reshape((-1, 1)))
        self.assertEqualArray(expected, pred)
        expected = numpy.digitize(bins, bins, right=True)
        pred = tree.predict(bins.reshape((-1, 1)))
        self.assertEqualArray(expected, pred)

    def test_tree_digitize4(self):
        x = numpy.array([0.2, 6.4, 3.0, 1.6])
        bins = numpy.array([0.0, 1.0, 2.5, 4.0])
        expected = numpy.digitize(x, bins, right=True)
        tree = digitize2tree(bins, right=True)
        pred = tree.predict(x.reshape((-1, 1)))
        self.assertEqualArray(expected, pred)
        expected = numpy.digitize(bins, bins, right=True)
        pred = tree.predict(bins.reshape((-1, 1)))
        self.assertEqualArray(expected, pred)

    def test_tree_digitize5(self):
        x = numpy.array([0.2, 6.4, 3.0, 1.6])
        bins = numpy.array([0.0, 1.0, 2.5, 4.0, 7.0])
        expected = numpy.digitize(x, bins, right=True)
        tree = digitize2tree(bins, right=True)
        pred = tree.predict(x.reshape((-1, 1)))
        self.assertEqualArray(expected, pred)
        expected = numpy.digitize(bins, bins, right=True)
        pred = tree.predict(bins.reshape((-1, 1)))
        self.assertEqualArray(expected, pred)

    def test_tree_digitize5_false(self):
        x = numpy.array([0.2, 6.4, 3.0, 1.6])
        bins = numpy.array([0.0, 1.0, 2.5, 4.0, 7.0])
        bins[:] = bins[::-1].copy()
        expected = numpy.digitize(x, bins, right=True)
        tree = digitize2tree(bins, right=True)
        pred = tree.predict(x.reshape((-1, 1)))
        self.assertEqualArray(expected, pred)
        expected = numpy.digitize(bins, bins, right=True)
        pred = tree.predict(bins.reshape((-1, 1)))
        self.assertEqualArray(expected, pred)


if __name__ == "__main__":
    unittest.main()
2 changes: 1 addition & 1 deletion mlinsights/mlmodel/sklearn_testing.py
@@ -147,7 +147,7 @@ def test_sklearn_clone(fct_model, ext=None, copy_fitted=False):
        else:
            try:
                ext.assertEqual(p1[k], p2[k])
-            except AssertionError as e:  # pragma no cover
+            except AssertionError:  # pragma no cover
                raise AssertionError(  # pylint: disable=W0707
                    "Difference for key '{0}'\n==1 {1}\n==2 {2}".format(
                        k, p1[k], p2[k]))
5 changes: 4 additions & 1 deletion mlinsights/mltree/__init__.py
@@ -2,4 +2,7 @@
@file
@brief Shortcuts to *mltree*.
"""
-from .tree_structure import tree_leave_index, tree_node_range, tree_leave_neighbors, predict_leaves
+from .tree_digitize import digitize2tree
+from .tree_structure import (
+    tree_leave_index, tree_node_range, tree_leave_neighbors,
+    predict_leaves)
50 changes: 50 additions & 0 deletions mlinsights/mltree/_tree_digitize.pyx
@@ -0,0 +1,50 @@
"""
@file
@brief Access to the C API of scikit-learn (decision tree)
"""
from libc.stdio cimport printf

import numpy
cimport numpy
numpy.import_array()

ctypedef numpy.npy_intp SIZE_t

from sklearn.tree._tree cimport Tree

TREE_LEAF = -1
TREE_UNDEFINED = -2


cdef SIZE_t _tree_add_node(Tree tree,
                           SIZE_t parent,
                           bint is_left,
                           bint is_leaf,
                           SIZE_t feature,
                           double threshold,
                           double impurity,
                           SIZE_t n_node_samples,
                           double weighted_n_node_samples):
    if parent == -1:
        parent = TREE_UNDEFINED
    return tree._add_node(parent, is_left, is_leaf, feature,
                          threshold, impurity,
                          n_node_samples, weighted_n_node_samples)


def tree_add_node(tree, parent, is_left, is_leaf, feature, threshold,
                  impurity, n_node_samples, weighted_n_node_samples):
    """
    Adds a node to the tree.

    :param tree: tree to modify (:class:`sklearn.tree._tree.Tree`)
    :param parent: parent index (-1 for the root)
    :param is_left: is it the left child of its parent?
    :param is_leaf: is it a leaf?
    :param feature: feature index
    :param threshold: threshold (or value)
    :param impurity: impurity
    :param n_node_samples: number of samples this node represents
    :param weighted_n_node_samples: node weight
    :return: index of the new node
    """
    return _tree_add_node(tree, parent, is_left, is_leaf, feature, threshold,
                          impurity, n_node_samples, weighted_n_node_samples)