1 change: 1 addition & 0 deletions .gitignore
@@ -285,3 +285,4 @@ _unittests/unittests.out
_doc/notebooks/explore/simages/*
_unittests/ut_mlbatch/cache__2/
_doc/sphinxdoc/source/_temp_custom_run_script*
mlinsights/mltree/_tree_digitize.c
120 changes: 120 additions & 0 deletions _doc/examples/plot_digitize.py
@@ -0,0 +1,120 @@
"""

.. _l-example-digitize:

========================
numpy.digitize as a tree
========================

.. index:: digitize, decision tree, onnx, onnxruntime

Function :epkg:`numpy:digitize` transforms a real variable
into a discrete one by returning the bucket the variable
falls into. That bucket can be retrieved efficiently with a
binary search over the bins, which is equivalent to a decision tree.
Function :func:`digitize2tree
<mlinsights.mltree.tree_digitize.digitize2tree>` converts the bins
into a ``DecisionTreeRegressor`` predicting the same bucket indices.

.. contents::
:local:

Simple example
==============
"""
import warnings
import numpy
from pandas import DataFrame, pivot_table
import matplotlib.pyplot as plt
from onnxruntime import InferenceSession
from sklearn.tree import export_text
from skl2onnx import to_onnx
from cpyquickhelper.numbers.speed_measure import measure_time
from mlinsights.mltree import digitize2tree
from tqdm import tqdm

x = numpy.array([0.2, 6.4, 3.0, 1.6])
bins = numpy.array([0.0, 1.0, 2.5, 4.0, 7.0])
expected = numpy.digitize(x, bins, right=True)
tree = digitize2tree(bins, right=True)
pred = tree.predict(x.reshape((-1, 1)))
print(expected, pred)
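
##########################################
# A quick equivalence check (a sketch, not part of the original example):
# for increasing bins and ``right=True``, :epkg:`numpy:digitize` amounts to
# a binary search, i.e. ``numpy.searchsorted`` with ``side='left'``.
print(numpy.searchsorted(bins, x, side='left'))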

##########################################
# The tree looks like the following.
print(export_text(tree, feature_names=['x']))
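
##########################################
# A quick look at the raw structure (a sketch, not part of the original
# example): the split thresholds should all be bin edges; leaves are
# marked with -2 (``TREE_UNDEFINED``).
print(tree.tree_.feature)
print(tree.tree_.threshold)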

#######################################
# Benchmark
# =========
#
# Let's measure the processing time. *numpy* should be
# much faster than *scikit-learn* as the latter adds many verifications.
# However, the benchmark also includes a conversion of the tree into
# ONNX and measures the processing time with :epkg:`onnxruntime`.

obs = []

for shape in tqdm([1, 10, 100, 1000, 10000, 100000]):
    x = numpy.random.random(shape).astype(numpy.float32)
    if shape < 1000:
        repeat = number = 100
    else:
        repeat = number = 10
    for n_bins in [1, 10, 100]:
        bins = (numpy.arange(n_bins) / n_bins).astype(numpy.float32)

        ti = measure_time(
            "numpy.digitize(x, bins, right=True)",
            context={'numpy': numpy, "x": x, "bins": bins},
            div_by_number=True, repeat=repeat, number=number)
        ti['name'] = 'numpy'
        ti['n_bins'] = n_bins
        ti['shape'] = shape
        obs.append(ti)

        tree = digitize2tree(bins, right=True)

        ti = measure_time(
            "tree.predict(x)",
            context={'numpy': numpy, "x": x.reshape((-1, 1)), "tree": tree},
            div_by_number=True, repeat=repeat, number=number)
        ti['name'] = 'sklearn'
        ti['n_bins'] = n_bins
        ti['shape'] = shape
        obs.append(ti)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=FutureWarning)
            onx = to_onnx(tree, x.reshape((-1, 1)))

        sess = InferenceSession(onx.SerializeToString())

        ti = measure_time(
            "sess.run(None, {'X': x})",
            context={'numpy': numpy, "x": x.reshape((-1, 1)), "sess": sess},
            div_by_number=True, repeat=repeat, number=number)
        ti['name'] = 'ort'
        ti['n_bins'] = n_bins
        ti['shape'] = shape
        obs.append(ti)


df = DataFrame(obs)
piv = pivot_table(data=df, index="shape", columns=["n_bins", "name"],
                  values=["average"])
print(piv)
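
##########################################
# A rough way to read the table (a sketch, assuming the column layout built
# above): timings relative to *numpy* for each number of bins.
avg = piv["average"]
for nb in [1, 10, 100]:
    print("n_bins=%d" % nb)
    print(avg[nb].div(avg[nb]["numpy"], axis=0))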

##########################################
# Plotting
# ========

n_bins = list(sorted(set(df.n_bins)))
fig, ax = plt.subplots(1, len(n_bins), figsize=(14, 4))

for i, nb in enumerate(n_bins):
    piv = pivot_table(data=df[df.n_bins == nb], index="shape",
                      columns=["name"],
                      values=["average"])
    piv.plot(title="Benchmark digitize / onnxruntime\nn_bins=%d" % nb,
             logx=True, logy=True, ax=ax[i])
plt.show()
11 changes: 11 additions & 0 deletions _doc/sphinxdoc/source/api/tree.rst
@@ -2,6 +2,12 @@
Trees
=====

.. contents::
:local:

Digging into the tree structure
+++++++++++++++++++++++++++++++

.. autosignature:: mlinsights.mltree.tree_structure.predict_leaves

.. autosignature:: mlinsights.mltree.tree_structure.tree_find_common_node
@@ -15,3 +21,8 @@ Trees
.. autosignature:: mlinsights.mltree.tree_structure.tree_leave_index

.. autosignature:: mlinsights.mltree.tree_structure.tree_leave_neighbors

Experiments, exercise
+++++++++++++++++++++

.. autosignature:: mlinsights.mltree.tree_digitize.digitize2tree
3 changes: 0 additions & 3 deletions _unittests/ut_mlbatch/test_pipeline_cache.py
@@ -121,9 +121,6 @@ def test_grid_search_model(self):

    def test_clone_with_fitted_parameters(self):
        X, y = make_classification(random_state=42)
-        param_grid = {'pca__n_components': [2, 3],
-                      'pca__whiten': [True, False],
-                      'lr__fit_intercept': [True, False]}
        pipe = Pipeline([('pca', PCA(2)),
                         ('lr', LogisticRegression())])
        pipe.fit(X, y)
4 changes: 2 additions & 2 deletions _unittests/ut_mlmodel/test_quantile_regression.py
@@ -33,7 +33,7 @@ def test_quantile_regression_no_intercept(self):
        self.assertEqualArray(clr.intercept_, clq.intercept_)

    @unittest.skipIf(
-        compare_module_version(sklver,"0.24") == -1,
+        compare_module_version(sklver, "0.24") == -1,
        reason="positive was introduce in 0.24")
    def test_quantile_regression_no_intercept_positive(self):
        X = numpy.array([[0.1, 0.2], [0.2, 0.3]])
@@ -64,7 +64,7 @@ def test_quantile_regression_intercept(self):
        self.assertEqualArray(clr.coef_, clq.coef_)

    @unittest.skipIf(
-        compare_module_version(sklver,"0.24") == -1,
+        compare_module_version(sklver, "0.24") == -1,
        reason="positive was introduce in 0.24")
    def test_quantile_regression_intercept_positive(self):
        X = numpy.array([[0.1, 0.2], [0.2, 0.3], [0.3, 0.3]])
100 changes: 100 additions & 0 deletions _unittests/ut_mltree/test_tree_digitize.py
@@ -0,0 +1,100 @@
# -*- coding: utf-8 -*-
"""
@brief test log(time=2s)
"""
import unittest
import numpy
from sklearn.tree import DecisionTreeRegressor
try:
    from sklearn.tree._tree import TREE_UNDEFINED  # pylint: disable=E0611
except ImportError:
    TREE_UNDEFINED = None
from pyquickhelper.pycode import ExtTestCase
from mlinsights.mltree import digitize2tree


class TestTreeDigitize(ExtTestCase):

    @unittest.skipIf(TREE_UNDEFINED is None, reason="nothing to test")
    def test_cst(self):
        self.assertEqual(TREE_UNDEFINED, -2)

    def test_exc(self):
        bins = numpy.array([0.0, 1.0])
        self.assertRaise(lambda: digitize2tree(bins, right=False),
                         RuntimeError)
        bins = numpy.array([1.0, 0.0])
        self.assertRaise(lambda: digitize2tree(bins, right=False),
                         RuntimeError)

    def test_tree_digitize1(self):
        x = numpy.array([0.2, 6.4, 3.0, 1.6])
        bins = numpy.array([1.0])
        expected = numpy.digitize(x, bins, right=True)
        tree = digitize2tree(bins, right=True)
        self.assertIsInstance(tree, DecisionTreeRegressor)
        pred = tree.predict(x.reshape((-1, 1)))
        self.assertEqualArray(expected, pred)
        expected = numpy.digitize(bins, bins, right=True)
        pred = tree.predict(bins.reshape((-1, 1)))
        self.assertEqualArray(expected, pred)

    def test_tree_digitize2(self):
        x = numpy.array([0.2, 6.4, 3.0, 1.6])
        bins = numpy.array([1.0, 2.0])
        expected = numpy.digitize(x, bins, right=True)
        tree = digitize2tree(bins, right=True)
        pred = tree.predict(x.reshape((-1, 1)))
        self.assertEqualArray(expected, pred)
        expected = numpy.digitize(bins, bins, right=True)
        pred = tree.predict(bins.reshape((-1, 1)))
        self.assertEqualArray(expected, pred)

    def test_tree_digitize3(self):
        x = numpy.array([0.2, 6.4, 3.0, 1.6])
        bins = numpy.array([1.0, 2.0, 3.5])
        expected = numpy.digitize(x, bins, right=True)
        tree = digitize2tree(bins, right=True)
        pred = tree.predict(x.reshape((-1, 1)))
        self.assertEqualArray(expected, pred)
        expected = numpy.digitize(bins, bins, right=True)
        pred = tree.predict(bins.reshape((-1, 1)))
        self.assertEqualArray(expected, pred)

    def test_tree_digitize4(self):
        x = numpy.array([0.2, 6.4, 3.0, 1.6])
        bins = numpy.array([0.0, 1.0, 2.5, 4.0])
        expected = numpy.digitize(x, bins, right=True)
        tree = digitize2tree(bins, right=True)
        pred = tree.predict(x.reshape((-1, 1)))
        self.assertEqualArray(expected, pred)
        expected = numpy.digitize(bins, bins, right=True)
        pred = tree.predict(bins.reshape((-1, 1)))
        self.assertEqualArray(expected, pred)

    def test_tree_digitize5(self):
        x = numpy.array([0.2, 6.4, 3.0, 1.6])
        bins = numpy.array([0.0, 1.0, 2.5, 4.0, 7.0])
        expected = numpy.digitize(x, bins, right=True)
        tree = digitize2tree(bins, right=True)
        pred = tree.predict(x.reshape((-1, 1)))
        self.assertEqualArray(expected, pred)
        expected = numpy.digitize(bins, bins, right=True)
        pred = tree.predict(bins.reshape((-1, 1)))
        self.assertEqualArray(expected, pred)

    def test_tree_digitize5_false(self):
        x = numpy.array([0.2, 6.4, 3.0, 1.6])
        bins = numpy.array([0.0, 1.0, 2.5, 4.0, 7.0])
        bins[:] = bins[::-1].copy()
        expected = numpy.digitize(x, bins, right=True)
        tree = digitize2tree(bins, right=True)
        pred = tree.predict(x.reshape((-1, 1)))
        self.assertEqualArray(expected, pred)
        expected = numpy.digitize(bins, bins, right=True)
        pred = tree.predict(bins.reshape((-1, 1)))
        self.assertEqualArray(expected, pred)


if __name__ == "__main__":
    unittest.main()
2 changes: 1 addition & 1 deletion mlinsights/mlmodel/sklearn_testing.py
@@ -147,7 +147,7 @@ def test_sklearn_clone(fct_model, ext=None, copy_fitted=False):
        else:
            try:
                ext.assertEqual(p1[k], p2[k])
-            except AssertionError as e:  # pragma no cover
+            except AssertionError:  # pragma no cover
                raise AssertionError(  # pylint: disable=W0707
                    "Difference for key '{0}'\n==1 {1}\n==2 {2}".format(
                        k, p1[k], p2[k]))
5 changes: 4 additions & 1 deletion mlinsights/mltree/__init__.py
@@ -2,4 +2,7 @@
@file
@brief Shortcuts to *mltree*.
"""
-from .tree_structure import tree_leave_index, tree_node_range, tree_leave_neighbors, predict_leaves
+from .tree_digitize import digitize2tree
+from .tree_structure import (
+    tree_leave_index, tree_node_range, tree_leave_neighbors,
+    predict_leaves)
50 changes: 50 additions & 0 deletions mlinsights/mltree/_tree_digitize.pyx
@@ -0,0 +1,50 @@
"""
@file
@brief Access to the C API of scikit-learn (decision tree)
"""
from libc.stdio cimport printf

import numpy
cimport numpy
numpy.import_array()

ctypedef numpy.npy_intp SIZE_t

from sklearn.tree._tree cimport Tree

TREE_LEAF = -1
TREE_UNDEFINED = -2


cdef SIZE_t _tree_add_node(Tree tree,
                           SIZE_t parent,
                           bint is_left,
                           bint is_leaf,
                           SIZE_t feature,
                           double threshold,
                           double impurity,
                           SIZE_t n_node_samples,
                           double weighted_n_node_samples):
    if parent == -1:
        parent = TREE_UNDEFINED
    return tree._add_node(parent, is_left, is_leaf, feature,
                          threshold, impurity,
                          n_node_samples, weighted_n_node_samples)


def tree_add_node(tree, parent, is_left, is_leaf, feature, threshold,
                  impurity, n_node_samples, weighted_n_node_samples):
    """
    Adds a node to the tree.

    :param tree: tree to modify (:class:`sklearn.tree._tree.Tree`)
    :param parent: parent index (-1 for the root)
    :param is_left: is it the left child of its parent?
    :param is_leaf: is it a leaf?
    :param feature: feature index
    :param threshold: threshold (or value)
    :param impurity: impurity
    :param n_node_samples: number of samples this node represents
    :param weighted_n_node_samples: node weight
    :return: index of the new node
    """
    return _tree_add_node(tree, parent, is_left, is_leaf, feature, threshold,
                          impurity, n_node_samples, weighted_n_node_samples)