diff --git a/.circleci/config.yml b/.circleci/config.yml index f4a83aaf..bee9071d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -18,8 +18,8 @@ jobs: name: Install pandoc command: | sudo apt-get update - wget https://github.com/jgm/pandoc/releases/download/2.14.1/pandoc-2.14.1-1-amd64.deb - sudo dpkg -i pandoc-2.14.1-1-amd64.deb + wget https://github.com/jgm/pandoc/releases/download/2.18/pandoc-2.18-1-amd64.deb + sudo dpkg -i pandoc-2.18-1-amd64.deb - run: name: Install tex diff --git a/.travis.yml b/.travis.yml index 989256bc..ccb7602b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,11 +9,6 @@ matrix: env: - sklver=">=0.24.2" - jlver=">=1.0" - - python: 3.8 - name: "Py38-023" - env: - - sklver="==0.23.2" - - jlver="==0.17.0" before_install: - sudo apt-get install libgeos-dev libproj-dev proj-data graphviz libblas-dev liblapack-dev diff --git a/README.rst b/README.rst index 1c2a9dac..6f7b61a5 100644 --- a/README.rst +++ b/README.rst @@ -76,4 +76,4 @@ Function ``pipeline2dot`` converts a pipeline into a graph: from mlinsights.plotting import pipeline2dot dot = pipeline2dot(clf, df) -.. image:: https://github.com/sdpython/mlinsights/raw/master/_doc/pipeline.png +.. image:: https://raw.githubusercontent.com/sdpython/mlinsights/master/_doc/sphinxdoc/source/pipeline.png diff --git a/_unittests/ut_mlmodel/test_kmeans_l1.py b/_unittests/ut_mlmodel/test_kmeans_l1.py index b2af4a4b..20e83007 100644 --- a/_unittests/ut_mlmodel/test_kmeans_l1.py +++ b/_unittests/ut_mlmodel/test_kmeans_l1.py @@ -72,8 +72,9 @@ def test_kmeans_l2_iris(self): self.assertEqual({0, 1, 2, 3}, cls) def test_kmeans_l1_check(self): - X = numpy.array([[-10, 1, 2, 3, 4, 10], - [-10, 1, 2, 3, 4, 10]]).T + X = numpy.ascontiguousarray( + numpy.array([[-10, 1, 2, 3, 4, 10], + [-10, 1, 2, 3, 4, 10]]).T) clr = KMeansL1L2(2, norm='L1') clr.fit(X) cls = set(clr.predict(X)) diff --git a/_unittests/ut_mlmodel/test_piecewise_decision_tree_experiment.py b/_unittests/ut_mlmodel/test_piecewise_decision_tree_experiment.py index b0607b21..1179b3d7 100644 --- a/_unittests/ut_mlmodel/test_piecewise_decision_tree_experiment.py +++ b/_unittests/ut_mlmodel/test_piecewise_decision_tree_experiment.py @@ -26,7 +26,7 @@ def test_criterions(self): X = numpy.array([[1., 2.]]).T y = numpy.array([1., 2.]) c1 = MSE(1, X.shape[0]) - c2 = SimpleRegressorCriterion(X) + c2 = SimpleRegressorCriterion(1, X.shape[0]) self.assertNotEmpty(c1) self.assertNotEmpty(c2) w = numpy.ones((y.shape[0],)) @@ -49,7 +49,7 @@ def test_criterions(self): X = numpy.array([[1., 2., 3.]]).T y = numpy.array([1., 2., 3.]) c1 = MSE(1, X.shape[0]) - c2 = SimpleRegressorCriterion(X) + c2 = SimpleRegressorCriterion(1, X.shape[0]) w = numpy.ones((y.shape[0],)) ind = numpy.arange(y.shape[0]).astype(numpy.int64) ys = y.astype(float).reshape((y.shape[0], 1)) @@ -68,7 +68,7 @@ def test_criterions(self): X = numpy.array([[1., 2., 10., 11.]]).T y = numpy.array([0.9, 1.1, 1.9, 2.1]) c1 = MSE(1, X.shape[0]) - c2 = SimpleRegressorCriterion(X) + c2 = SimpleRegressorCriterion(1, X.shape[0]) w = numpy.ones((y.shape[0],)) ind = numpy.arange(y.shape[0]).astype(numpy.int64) ys = y.astype(float).reshape((y.shape[0], 1)) @@ -121,7 +121,7 @@ def test_criterions(self): X = numpy.array([[1., 2., 10., 11.]]).T y = numpy.array([0.9, 1.1, 1.9, 2.1]) c1 = MSE(1, X.shape[0]) - c2 = SimpleRegressorCriterion(X) + c2 = SimpleRegressorCriterion(1, X.shape[0]) w = numpy.ones((y.shape[0],)) ind = numpy.array([0, 3, 2, 1], dtype=ind.dtype) ys = y.astype(float).reshape((y.shape[0], 1)) @@ -166,7 +166,8 @@ 
def test_decision_tree_criterion(self): clr1.fit(X, y) p1 = clr1.predict(X) - crit = SimpleRegressorCriterion(X) + crit = SimpleRegressorCriterion( + 1 if len(y.shape) <= 1 else y.shape[1], X.shape[0]) clr2 = DecisionTreeRegressor(criterion=crit, max_depth=1) clr2.fit(X, y) p2 = clr2.predict(X) @@ -179,7 +180,9 @@ def test_decision_tree_criterion_iris(self): clr1 = DecisionTreeRegressor() clr1.fit(X, y) p1 = clr1.predict(X) - clr2 = DecisionTreeRegressor(criterion=SimpleRegressorCriterion(X)) + clr2 = DecisionTreeRegressor( + criterion=SimpleRegressorCriterion( + 1 if len(y.shape) <= 1 else y.shape[1], X.shape[0])) clr2.fit(X, y) p2 = clr2.predict(X) self.assertEqual(p1[:10], p2[:10]) diff --git a/_unittests/ut_mlmodel/test_piecewise_decision_tree_experiment_fast.py b/_unittests/ut_mlmodel/test_piecewise_decision_tree_experiment_fast.py index 74c634d5..5bc5a7a9 100644 --- a/_unittests/ut_mlmodel/test_piecewise_decision_tree_experiment_fast.py +++ b/_unittests/ut_mlmodel/test_piecewise_decision_tree_experiment_fast.py @@ -27,7 +27,7 @@ def test_criterions(self): X = numpy.array([[1., 2.]]).T y = numpy.array([1., 2.]) c1 = MSE(1, X.shape[0]) - c2 = SimpleRegressorCriterionFast(X) + c2 = SimpleRegressorCriterionFast(1, X.shape[0]) self.assertNotEmpty(c1) self.assertNotEmpty(c2) w = numpy.ones((y.shape[0],)) @@ -51,7 +51,7 @@ def test_criterions(self): X = numpy.array([[1., 2., 3.]]).T y = numpy.array([1., 2., 3.]) c1 = MSE(1, X.shape[0]) - c2 = SimpleRegressorCriterionFast(X) + c2 = SimpleRegressorCriterionFast(1, X.shape[0]) w = numpy.ones((y.shape[0],)) ind = numpy.arange(y.shape[0]).astype(numpy.int64) ys = y.astype(float).reshape((y.shape[0], 1)) @@ -71,7 +71,7 @@ def test_criterions(self): X = numpy.array([[1., 2., 10., 11.]]).T y = numpy.array([0.9, 1.1, 1.9, 2.1]) c1 = MSE(1, X.shape[0]) - c2 = SimpleRegressorCriterionFast(X) + c2 = SimpleRegressorCriterionFast(1, X.shape[0]) w = numpy.ones((y.shape[0],)) ind = numpy.arange(y.shape[0]).astype(numpy.int64) ys = y.astype(float).reshape((y.shape[0], 1)) @@ -115,7 +115,7 @@ def test_criterions(self): X = numpy.array([[1., 2., 10., 11.]]).T y = numpy.array([0.9, 1.1, 1.9, 2.1]) c1 = MSE(1, X.shape[0]) - c2 = SimpleRegressorCriterionFast(X) + c2 = SimpleRegressorCriterionFast(1, X.shape[0]) w = numpy.ones((y.shape[0],)) ind = numpy.array([0, 3, 2, 1], dtype=ind.dtype) ys = y.astype(float).reshape((y.shape[0], 1)) @@ -162,7 +162,7 @@ def test_decision_tree_criterion(self): clr1.fit(X, y) p1 = clr1.predict(X) - crit = SimpleRegressorCriterionFast(X) + crit = SimpleRegressorCriterionFast(1, X.shape[0]) clr2 = DecisionTreeRegressor(criterion=crit, max_depth=1) clr2.fit(X, y) p2 = clr2.predict(X) @@ -175,7 +175,7 @@ def test_decision_tree_criterion_iris(self): clr1 = DecisionTreeRegressor() clr1.fit(X, y) p1 = clr1.predict(X) - clr2 = DecisionTreeRegressor(criterion=SimpleRegressorCriterionFast(X)) + clr2 = DecisionTreeRegressor(criterion=SimpleRegressorCriterionFast(1, X.shape[0])) clr2.fit(X, y) p2 = clr2.predict(X) self.assertEqual(p1[:10], p2[:10]) diff --git a/_unittests/ut_mlmodel/test_piecewise_decision_tree_experiment_linear.py b/_unittests/ut_mlmodel/test_piecewise_decision_tree_experiment_linear.py index fd2927ad..8ae7a976 100644 --- a/_unittests/ut_mlmodel/test_piecewise_decision_tree_experiment_linear.py +++ b/_unittests/ut_mlmodel/test_piecewise_decision_tree_experiment_linear.py @@ -25,7 +25,7 @@ def test_criterions(self): X = numpy.array([[10., 12., 13.]]).T y = numpy.array([20., 22., 23.]) c1 = MSE(1, X.shape[0]) - c2 = 
LinearRegressorCriterion(X) + c2 = LinearRegressorCriterion(1, X) self.assertNotEmpty(c1) self.assertNotEmpty(c2) w = numpy.ones((y.shape[0],)) @@ -49,7 +49,7 @@ def test_criterions(self): X = numpy.array([[1., 2., 3.]]).T y = numpy.array([1., 2., 3.]) c1 = MSE(1, X.shape[0]) - c2 = LinearRegressorCriterion(X) + c2 = LinearRegressorCriterion(1, X) w = numpy.ones((y.shape[0],)) ind = numpy.arange(y.shape[0]).astype(numpy.int64) ys = y.astype(float).reshape((y.shape[0], 1)) @@ -68,7 +68,7 @@ def test_criterions(self): X = numpy.array([[1., 2., 10., 11.]]).T y = numpy.array([0.9, 1.1, 1.9, 2.1]) c1 = MSE(1, X.shape[0]) - c2 = LinearRegressorCriterion(X) + c2 = LinearRegressorCriterion(1, X) w = numpy.ones((y.shape[0],)) ind = numpy.arange(y.shape[0]).astype(numpy.int64) ys = y.astype(float).reshape((y.shape[0], 1)) @@ -87,7 +87,7 @@ def test_criterions(self): X = numpy.array([[1., 2., 10., 11.]]).T y = numpy.array([0.9, 1.1, 1.9, 2.1]) c1 = MSE(1, X.shape[0]) - c2 = LinearRegressorCriterion(X) + c2 = LinearRegressorCriterion(1, X) w = numpy.ones((y.shape[0],)) ind = numpy.array([0, 3, 2, 1], dtype=ind.dtype) ys = y.astype(float).reshape((y.shape[0], 1)) @@ -145,7 +145,7 @@ def test_decision_tree_criterion(self): clr1.fit(X, y) p1 = clr1.predict(X) - crit = LinearRegressorCriterion(X) + crit = LinearRegressorCriterion(1, X) clr2 = DecisionTreeRegressor(criterion=crit, max_depth=1) clr2.fit(X, y) p2 = clr2.predict(X) @@ -158,7 +158,7 @@ def test_decision_tree_criterion_iris(self): clr1 = DecisionTreeRegressor() clr1.fit(X, y) p1 = clr1.predict(X) - clr2 = DecisionTreeRegressor(criterion=LinearRegressorCriterion(X)) + clr2 = DecisionTreeRegressor(criterion=LinearRegressorCriterion(1, X)) clr2.fit(X, y) p2 = clr2.predict(X) self.assertEqual(p1.shape, p2.shape) diff --git a/_unittests/ut_timeseries/test_plot_timeseries.py b/_unittests/ut_timeseries/test_plot_timeseries.py index 19ecbe94..2974a23a 100644 --- a/_unittests/ut_timeseries/test_plot_timeseries.py +++ b/_unittests/ut_timeseries/test_plot_timeseries.py @@ -3,6 +3,8 @@ """ import unittest import datetime +import warnings +import sys from pyquickhelper.pycode import ExtTestCase from mlinsights.timeseries.datasets import artificial_data from mlinsights.timeseries.agg import aggregate_timeseries @@ -11,8 +13,17 @@ class TestPlotTimeSeries(ExtTestCase): + @unittest.skipIf( + sys.platform == "win32" and __name__ != "__main__", + reason="issue with matplotlib") def test_plot_data(self): - import matplotlib.pyplot as plt # pylint: disable=C0415 + try: + import matplotlib.pyplot as plt # pylint: disable=C0415 + except Exception as e: + if 'generated new fontManager' in str(e): + warnings.warn(e) + return + raise e dt1 = datetime.datetime(2019, 8, 1) dt2 = datetime.datetime(2019, 8, 15) data = artificial_data(dt1, dt2, minutes=15) diff --git a/appveyor.yml b/appveyor.yml index ca2d49ac..af396400 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -10,10 +10,6 @@ environment: PYTHON_VERSION: "3.9.x" PYTHON_ARCH: "64" SKL: '==0.24.2' - - PYTHON: "C:\\Python38-x64" - PYTHON_VERSION: "3.8.x" - PYTHON_ARCH: "64" - SKL: '==0.23.2' init: - "ECHO %PYTHON% %PYTHON_VERSION% %PYTHON_ARCH%" diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 5dd96a19..f402c96e 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -18,8 +18,8 @@ jobs: displayName: 'Install Inkscape' - script: sudo apt-get install -y pandoc displayName: 'Install Pandoc' - - script: sudo apt-get install -y texlive texlive-latex-extra texlive-xetex dvipng - displayName: 'Install 
Latex'
+  # - script: sudo apt-get install -y texlive texlive-latex-extra texlive-xetex dvipng
+  #   displayName: 'Install Latex'
   - script: sudo apt-get install -y libgeos-dev libproj-dev proj-data graphviz libblas-dev liblapack-dev
     displayName: 'Install Geos packages'
   - script: |
@@ -37,7 +37,7 @@
     displayName: 'Install tools'
   - script: pip install numpy
     displayName: 'Install numpy'
-  - script: pip install torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html
+  - script: pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
     displayName: 'Install pytorch'
   - script: |
       export LLVM_CONFIG=/usr/bin/llvm-config-10
diff --git a/mlinsights/mlmodel/_kmeans_022.py b/mlinsights/mlmodel/_kmeans_022.py
index b2fedac1..500694c5 100644
--- a/mlinsights/mlmodel/_kmeans_022.py
+++ b/mlinsights/mlmodel/_kmeans_022.py
@@ -21,7 +21,7 @@ def _labels_inertia_precompute_dense(norm, X, sample_weight, centers, distances)
 
     This will overwrite the 'distances' array in-place.
 
-    :param norm: 'l1' or 'l2'
+    :param norm: 'L1' or 'L2'
     :param X: numpy array, shape (n_sample, n_features)
         Input data.
     :param sample_weight: array-like, shape (n_samples,)
@@ -37,10 +37,10 @@ def _labels_inertia_precompute_dense(norm, X, sample_weight, centers, distances)
         cluster center.
     """
     n_samples = X.shape[0]
-    if norm == 'l2':
+    if norm == 'L2':
         labels, mindist = pairwise_distances_argmin_min(
             X=X, Y=centers, metric='euclidean', metric_kwargs={'squared': True})
-    elif norm == 'l1':
+    elif norm == 'L1':
         labels, mindist = pairwise_distances_argmin_min(
             X=X, Y=centers, metric='manhattan')
     else:  # pragma no cover
diff --git a/mlinsights/mlmodel/_piecewise_tree_regression_common.pxd b/mlinsights/mlmodel/_piecewise_tree_regression_common.pxd
index 4d4c0ab8..d5658a5e 100644
--- a/mlinsights/mlmodel/_piecewise_tree_regression_common.pxd
+++ b/mlinsights/mlmodel/_piecewise_tree_regression_common.pxd
@@ -9,8 +9,6 @@ from sklearn.tree._criterion cimport SIZE_t, DOUBLE_t
 
 
 cdef class CommonRegressorCriterion(Criterion):
 
-    cdef const DOUBLE_t[:, ::1] sample_X
-
     cdef void _update_weights(self, SIZE_t start, SIZE_t end,
                               SIZE_t old_pos, SIZE_t new_pos) nogil
diff --git a/mlinsights/mlmodel/_piecewise_tree_regression_common023.pyx b/mlinsights/mlmodel/_piecewise_tree_regression_common023.pyx
deleted file mode 100644
index 1450b649..00000000
--- a/mlinsights/mlmodel/_piecewise_tree_regression_common023.pyx
+++ /dev/null
@@ -1,277 +0,0 @@
-"""
-@file
-@brief Implements a custom criterion to train a decision tree.
-"""
-from libc.stdlib cimport calloc, free
-from libc.stdio cimport printf
-from libc.math cimport NAN
-
-import numpy
-cimport numpy
-numpy.import_array()
-
-from sklearn.tree._criterion cimport Criterion
-from sklearn.tree._criterion cimport SIZE_t, DOUBLE_t
-
-
-cdef class CommonRegressorCriterion(Criterion):
-    """
-    Common class to implement various version of `mean square error `_.
-    The code was inspired from
-    `hellinger_distance_criterion.pyx `_,
-    `Cython example of exposing C-computed arrays in Python without data copies `_,
-    `_criterion.pyx `_.
-    This implementation is not efficient but was made that way on purpose.
-    It adds the features to the class.
-
-    If the file does not compile, some explanations are given
-    in :ref:`scikit-learn internal API `_.
- """ - def __getstate__(self): - return {} - - def __setstate__(self, d): - pass - - def __cinit__(self, const DOUBLE_t[:, ::1] X): - self.sample_X = X - - cdef void _update_weights(self, SIZE_t start, SIZE_t end, SIZE_t old_pos, SIZE_t new_pos) nogil: - """ - Unused. - """ - pass - - cdef int reset(self) nogil except -1: - """ - Resets the criterion at *pos=start*. - This method must be implemented by the subclass. - """ - self.pos = self.start - - cdef int reverse_reset(self) nogil except -1: - """ - Resets the criterion at *pos=end*. - This method must be implemented by the subclass. - """ - self.pos = self.end - - cdef int update(self, SIZE_t new_pos) nogil except -1: - """ - Updates statistics by moving ``samples[pos:new_pos]`` to the left child. - This updates the collected statistics by moving ``samples[pos:new_pos]`` - from the right child to the left child. It must be implemented by - the subclass. - - :param new_pos: SIZE_t - New starting index position of the samples in the right child - """ - self.pos = new_pos - - cdef void _mean(self, SIZE_t start, SIZE_t end, DOUBLE_t *mean, DOUBLE_t *weight) nogil: - """ - Computes the mean of *y* between *start* and *end*. - """ - raise NotImplementedError("Method _mean must be overloaded.") - - cdef double _mse(self, SIZE_t start, SIZE_t end, DOUBLE_t mean, DOUBLE_t weight) nogil: - """ - Computes mean square error between *start* and *end* - assuming corresponding points are approximated by a constant. - """ - raise NotImplementedError("Method _mean must be overloaded.") - - cdef void children_impurity_weights(self, double* impurity_left, - double* impurity_right, - double* weight_left, - double* weight_right) nogil: - """ - Calculates the impurity of children, - evaluates the impurity in - children nodes, i.e. the impurity of ``samples[start:pos]`` - the impurity of ``samples[pos:end]``. - - :param impurity_left: double pointer - The memory address where the impurity of the left child should be - stored. - :param impurity_right: double pointer - The memory address where the impurity of the right child should be - stored. - :param weight_left: double pointer - The memory address where the weight of the left child should be - stored. - :param weight_right: double pointer - The memory address where the weight of the right child should be - stored. - """ - cdef DOUBLE_t mleft, mright - self._mean(self.start, self.pos, &mleft, weight_left) - self._mean(self.pos, self.end, &mright, weight_right) - impurity_left[0] = self._mse(self.start, self.pos, mleft, weight_left[0]) - impurity_right[0] = self._mse(self.pos, self.end, mright, weight_right[0]) - - #################### - # functions used by a the tree optimizer - #################### - - cdef double node_impurity(self) nogil: - """ - Calculates the impurity of the node, - the impurity of ``samples[start:end]``. - This is the primary function of the criterion class. - """ - cdef DOUBLE_t mean, weight - self._mean(self.start, self.end, &mean, &weight) - return self._mse(self.start, self.end, mean, weight) - - cdef void children_impurity(self, double* impurity_left, - double* impurity_right) nogil: - """ - Calculates the impurity of children. - - :param impurity_left: double pointer - The memory address where the impurity of the left child should be - stored. - :param impurity_right: double pointer - The memory address where the impurity of the right child should be - stored. 
- """ - cdef DOUBLE_t wl, wr - self.children_impurity_weights(impurity_left, impurity_right, &wl, &wr) - - cdef void node_value(self, double* dest) nogil: - """ - Computes the node value, usually, the prediction - the tree would do. Stores the value into *dest*. - - :param dest: double pointer - The memory address where the node value should be stored. - """ - cdef DOUBLE_t weight - self._mean(self.start, self.end, dest, &weight) - - cdef double proxy_impurity_improvement(self) nogil: - """ - Computes a proxy of the impurity reduction - This method is used to speed up the search for the best split. - It is a proxy quantity such that the split that maximizes this value - also maximizes the impurity improvement. It neglects all constant terms - of the impurity decrease for a given split. - The absolute impurity improvement is only computed by the - *impurity_improvement* method once the best split has been found. - """ - cdef double impurity_left - cdef double impurity_right - self.children_impurity_weights(&impurity_left, &impurity_right, - &self.weighted_n_left, &self.weighted_n_right) - if self.pos == self.start or self.pos == self.end: - return NAN - - return (- self.weighted_n_right * impurity_right - - self.weighted_n_left * impurity_left) - - cdef double impurity_improvement(self, double impurity) nogil: - """ - Computes the improvement in impurity - This method computes the improvement in impurity when a split occurs. - The weighted impurity improvement equation is the following:: - - N_t / N * (impurity - N_t_R / N_t * right_impurity - - N_t_L / N_t * left_impurity) - - where *N* is the total number of samples, *N_t* is the number of samples - at the current node, *N_t_L* is the number of samples in the left child, - and *N_t_R* is the number of samples in the right child, - - :param impurity: double - The initial impurity of the node before the split - :return: double, improvement in impurity after the split occurs - """ - cdef double impurity_left - cdef double impurity_right - self.children_impurity_weights(&impurity_left, &impurity_right, - &self.weighted_n_left, &self.weighted_n_right) - if self.pos == self.start or self.pos == self.end: - return NAN - - cdef double weight = self.weighted_n_left + self.weighted_n_right - return ((weight / self.weighted_n_samples) * - (impurity - (self.weighted_n_right / weight * impurity_right) - - (self.weighted_n_left / weight * impurity_left))) - - -def _test_criterion_init(Criterion criterion, - const DOUBLE_t[:, ::1] y, - DOUBLE_t[:] sample_weight, - double weighted_n_samples, - SIZE_t[:] samples, - SIZE_t start, SIZE_t end): - "Test purposes. Methods cannot be directly called from python." - criterion.init(y, - &sample_weight[0], weighted_n_samples, - &samples[0], start, end) - - -def _test_criterion_check(Criterion criterion): - "Unused" - pass - - -def assert_criterion_equal(Criterion c1, Criterion c2): - "Unused" - pass - - -def _test_criterion_node_impurity(Criterion criterion): - "Test purposes. Methods cannot be directly called from python." - return criterion.node_impurity() - - -def _test_criterion_proxy_impurity_improvement(Criterion criterion): - "Test purposes. Methods cannot be directly called from python." - return criterion.proxy_impurity_improvement() - - -def _test_criterion_impurity_improvement(Criterion criterion, double impurity): - "Test purposes. Methods cannot be directly called from python." 
- return criterion.impurity_improvement(impurity) - - -def _test_criterion_node_impurity_children(Criterion criterion): - "Test purposes. Methods cannot be directly called from python." - cdef DOUBLE_t left, right - criterion.children_impurity(&left, &right) - return left, right - - -def _test_criterion_node_value(Criterion criterion): - "Test purposes. Methods cannot be directly called from python." - cdef DOUBLE_t value - criterion.node_value(&value) - return value - - -def _test_criterion_update(Criterion criterion, SIZE_t new_pos): - "Test purposes. Methods cannot be directly called from python." - return criterion.update(new_pos) - - -def _test_criterion_printf(Criterion crit): - "Test purposes. Methods cannot be directly called from python." - printf("start=%zu pos=%zu end=%zu\n", crit.start, crit.pos, crit.end) - cdef DOUBLE_t left, right, value - cdef int i; - crit.children_impurity(&left, &right) - crit.node_value(&value) - printf("value: %f total=%f left=%f right=%f\n", value, - crit.node_impurity(), left, right) - cdef int n = crit.y.shape[0] - for i in range(0, n): - printf("-- %d: y=%f\n", i, crit.y[i, 0]) diff --git a/mlinsights/mlmodel/_piecewise_tree_regression_common024.pyx b/mlinsights/mlmodel/_piecewise_tree_regression_common024.pyx index 01f4c89b..28bd4b32 100644 --- a/mlinsights/mlmodel/_piecewise_tree_regression_common024.pyx +++ b/mlinsights/mlmodel/_piecewise_tree_regression_common024.pyx @@ -40,15 +40,12 @@ cdef class CommonRegressorCriterion(Criterion): def __setstate__(self, d): pass - def __cinit__(self, const DOUBLE_t[:, ::1] X): - self.sample_X = X - def __deepcopy__(self, memo=None): """ This does not a copy but mostly creates a new instance of the same criterion initialized with the same data. """ - inst = self.__class__(self.sample_X) + inst = self.__class__(self.n_outputs, self.n_samples) return inst cdef void _update_weights(self, SIZE_t start, SIZE_t end, SIZE_t old_pos, SIZE_t new_pos) nogil: @@ -225,11 +222,11 @@ cdef class CommonRegressorCriterion(Criterion): def _test_criterion_init(Criterion criterion, - const DOUBLE_t[:, ::1] y, - DOUBLE_t[:] sample_weight, - double weighted_n_samples, - SIZE_t[:] samples, - SIZE_t start, SIZE_t end): + const DOUBLE_t[:, ::1] y, + DOUBLE_t[:] sample_weight, + double weighted_n_samples, + SIZE_t[:] samples, + SIZE_t start, SIZE_t end): "Test purposes. Methods cannot be directly called from python." criterion.init(y, &sample_weight[0], weighted_n_samples, diff --git a/mlinsights/mlmodel/kmeans_l1.py b/mlinsights/mlmodel/kmeans_l1.py index 7938c054..1680ff86 100644 --- a/mlinsights/mlmodel/kmeans_l1.py +++ b/mlinsights/mlmodel/kmeans_l1.py @@ -20,6 +20,13 @@ except ImportError: # pragma: no cover from sklearn.cluster._kmeans import ( _check_normalize_sample_weight as _check_sample_weight) +try: + from sklearn.utils._param_validation import StrOptions +except ImportError: + def StrOptions(*args): + "Dummy replacement for a class introduced in scikit-learn==1.1." + return None + from ._kmeans_022 import ( _labels_inertia_skl, _labels_inertia_precompute_dense) @@ -28,7 +35,7 @@ def _k_init(norm, X, n_clusters, random_state, n_local_trials=None): """Init n_clusters seeds according to k-means++ - :param norm: `l1` or `l2` + :param norm: `L1` or `L2` manhattan or euclidean distance :param X: array or sparse matrix, shape (n_samples, n_features) The data to pick seeds for. 
To avoid memory copy, the input data @@ -64,13 +71,13 @@ def _k_init(norm, X, n_clusters, random_state, n_local_trials=None): centers[0] = X[center_id] # Initialize list of closest distances and calculate current potential - if norm.lower() == 'l2': + if norm == 'L2': dist_fct = lambda x, y: euclidean_distances(x, y, squared=True) - elif norm.lower() == 'l1': + elif norm == 'L1': dist_fct = lambda x, y: manhattan_distances(x, y) else: raise NotImplementedError( # pragma no cover - "norm must be 'l1' or 'l2' not '{}'.".format(norm)) + "norm must be 'L1' or 'L2' not '{}'.".format(norm)) closest_dist_sq = dist_fct(centers[0, numpy.newaxis], X) current_pot = closest_dist_sq.sum() @@ -112,7 +119,7 @@ def _init_centroids(norm, X, k, init, random_state=None, init_size=None): """Compute the initial centroids - :param norm: 'l1' or 'l2' + :param norm: 'L1' or 'L2' :param X: array, shape (n_samples, n_features) :param k: int number of centroids @@ -244,7 +251,7 @@ def _kmeans_single_lloyd(norm, X, sample_weight, n_clusters, max_iter=300, """ A single run of k-means, assumes preparation completed prior. - :param norm: 'l1' or 'l2' + :param norm: 'L1' or 'L2' :param X: array-like of floats, shape (n_samples, n_features) The observations to cluster. :param n_clusters: int @@ -349,7 +356,7 @@ def _labels_inertia(norm, X, sample_weight, centers, distances=None): Computes the labels and the inertia of the given samples and centers. This will compute the distances in-place. - :param norm: 'l1' or 'l2' + :param norm: 'L1' or 'L2' :param X: float64 array-like or CSR sparse matrix, shape (n_samples, n_features) The input samples to assign to the labels. :param sample_weight: array-like, shape (n_samples,) @@ -375,7 +382,7 @@ def _labels_inertia(norm, X, sample_weight, centers, distances=None): # distances will be changed in-place if issparse(X): raise NotImplementedError( # pragma no cover - "Sparse matrix is not implemented for norm 'l1'.") + "Sparse matrix is not implemented for norm 'L1'.") return _labels_inertia_precompute_dense( norm=norm, X=X, sample_weight=sample_weight, centers=centers, distances=distances) @@ -383,9 +390,9 @@ def _labels_inertia(norm, X, sample_weight, centers, distances=None): def _tolerance(norm, X, tol): """Return a tolerance which is independent of the dataset""" - if norm == 'l2': + if norm == 'L2': return _tolerance_skl(X, tol) - if norm == 'l1': + if norm == 'L1': variances = numpy.sum(numpy.abs(X), axis=0) / X.shape[0] return variances.sum() raise NotImplementedError( # pragma no cover @@ -478,6 +485,11 @@ class KMeansL1L2(KMeans): Number of iterations run. """ + _parameter_constraints = { + **getattr(KMeans, '_parameter_constraints', {}), + "norm": [StrOptions({"L1", "L2"})], + } + def __init__(self, n_clusters=8, init='k-means++', n_init=10, max_iter=300, tol=1e-4, verbose=0, random_state=None, copy_x=True, @@ -487,8 +499,8 @@ def __init__(self, n_clusters=8, init='k-means++', n_init=10, max_iter=max_iter, tol=tol, verbose=verbose, random_state=random_state, copy_x=copy_x, algorithm=algorithm) - self.norm = norm.lower() - if self.norm == 'l1' and self.algorithm != 'full': + self.norm = norm + if self.norm == 'L1' and self.algorithm != 'full': raise NotImplementedError( # pragma no cover "Only algorithm 'full' is implemented with norm 'l1'.") @@ -508,18 +520,18 @@ def fit(self, X, y=None, sample_weight=None): :return: self Fitted estimator. 
""" - if self.norm == 'l2': + if self.norm == 'L2': KMeans.fit(self, X=X, y=y, sample_weight=sample_weight) - elif self.norm == 'l1': + elif self.norm == 'L1': self._fit_l1(X=X, y=y, sample_weight=sample_weight) else: raise NotImplementedError( # pragma no cover - "Norm is not L1 or L2 but '{}'.".format(self.norm)) + "Norm is not 'L1' or 'L2' but '{}'.".format(self.norm)) return self def _fit_l1(self, X, y=None, sample_weight=None): """ - Computes k-means clustering with norm `'l1'`. + Computes k-means clustering with norm `'L1'`. :param X: array-like or sparse matrix, shape=(n_samples, n_features) Training instances to cluster. It must be noted that the data @@ -630,9 +642,9 @@ def transform(self, X): :return: X_new : array, shape [n_samples, k] X transformed in the new space. """ - if self.norm == 'l2': + if self.norm == 'L2': return KMeans.transform(self, X) - if self.norm == 'l1': + if self.norm == 'L1': return self._transform_l1(X) raise NotImplementedError( # pragma no cover "Norm is not L1 or L2 but '{}'.".format(self.norm)) @@ -662,9 +674,9 @@ def predict(self, X, sample_weight=None): :return: labels : array, shape [n_samples,] Index of the cluster each sample belongs to. """ - if self.norm == 'l2': + if self.norm == 'L2': return KMeans.predict(self, X) - if self.norm == 'l1': + if self.norm == 'L1': return self._predict_l1(X, sample_weight=sample_weight) raise NotImplementedError( # pragma no cover "Norm is not L1 or L2 but '{}'.".format(self.norm)) diff --git a/mlinsights/mlmodel/piecewise_tree_regression.py b/mlinsights/mlmodel/piecewise_tree_regression.py index 35d91ba7..b782b884 100644 --- a/mlinsights/mlmodel/piecewise_tree_regression.py +++ b/mlinsights/mlmodel/piecewise_tree_regression.py @@ -34,8 +34,7 @@ def __init__(self, criterion='mselin', splitter='best', max_depth=None, max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=min_impurity_decrease) - def fit(self, X, y, sample_weight=None, check_input=True, - X_idx_sorted=None): + def fit(self, X, y, sample_weight=None, check_input=True): """ Replaces the string stored in criterion by an instance of a class. 
""" @@ -45,17 +44,21 @@ def fit(self, X, y, sample_weight=None, check_input=True, from .piecewise_tree_regression_criterion_linear import ( # pylint: disable=E0611,C0415 LinearRegressorCriterion) replace = self.criterion - self.criterion = LinearRegressorCriterion(X) + self.criterion = LinearRegressorCriterion( + 1 if len(y.shape) <= 1 else y.shape[1], X) elif self.criterion == "simple": from .piecewise_tree_regression_criterion_fast import ( # pylint: disable=E0611,C0415 SimpleRegressorCriterionFast) replace = self.criterion - self.criterion = SimpleRegressorCriterionFast(X) + self.criterion = SimpleRegressorCriterionFast( + 1 if len(y.shape) <= 1 else y.shape[1], X.shape[0]) else: replace = None - DecisionTreeRegressor.fit(self, X, y, sample_weight=sample_weight, check_input=check_input, - X_idx_sorted=X_idx_sorted) + DecisionTreeRegressor.fit( + self, X, y, + sample_weight=sample_weight, + check_input=check_input) if replace: self.criterion = replace diff --git a/mlinsights/mlmodel/piecewise_tree_regression_criterion.pyx b/mlinsights/mlmodel/piecewise_tree_regression_criterion.pyx index 75a929bd..2441ec60 100644 --- a/mlinsights/mlmodel/piecewise_tree_regression_criterion.pyx +++ b/mlinsights/mlmodel/piecewise_tree_regression_criterion.pyx @@ -52,27 +52,26 @@ cdef class SimpleRegressorCriterion(CommonRegressorCriterion): def __setstate__(self, d): pass - def __cinit__(self, const DOUBLE_t[:, ::1] X): - self.sample_X = X + def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): + self.n_outputs = n_outputs + self.n_samples = n_samples + # Allocate memory for the accumulators self.sample_w = NULL self.sample_wy = NULL self.sample_i = NULL - + # Criterion interface self.sample_weight = NULL self.samples = NULL - self.sum_total = NULL - self.sum_left = NULL - self.sum_right = NULL # allocation if self.sample_w == NULL: - self.sample_w = calloc(X.shape[0], sizeof(DOUBLE_t)) + self.sample_w = calloc(n_samples, sizeof(DOUBLE_t)) if self.sample_wy == NULL: - self.sample_wy = calloc(X.shape[0], sizeof(DOUBLE_t)) + self.sample_wy = calloc(n_samples, sizeof(DOUBLE_t)) if self.sample_i == NULL: - self.sample_i = calloc(X.shape[0], sizeof(SIZE_t)) + self.sample_i = calloc(n_samples, sizeof(SIZE_t)) cdef int init(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight, @@ -82,14 +81,14 @@ cdef class SimpleRegressorCriterion(CommonRegressorCriterion): This function is overwritten to check *y* and *X* size are the same. This API has changed in 0.21. """ - if y.shape[0] != self.sample_X.shape[0]: - raise ValueError("X.shape={} -- y.shape={}".format(self.sample_X.shape, y.shape)) + if y.shape[0] != self.n_samples: + raise ValueError("n_samples={} -- y.shape={}".format(self.n_samples, y.shape)) if y.shape[1] != 1: raise ValueError("This class only works for a single vector.") - return self.init_with_X(self.sample_X, y, sample_weight, weighted_n_samples, + return self.init_with_X(y, sample_weight, weighted_n_samples, samples, start, end) - cdef int init_with_X(self, const DOUBLE_t[:, ::1] X, + cdef int init_with_X(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight, double weighted_n_samples, SIZE_t* samples, @@ -99,7 +98,6 @@ cdef class SimpleRegressorCriterion(CommonRegressorCriterion): Returns -1 in case of failure to allocate memory (and raise *MemoryError*) or 0 otherwise. 
- :param X: array-like, features, dtype=DOUBLE_t :param y: array-like, dtype=DOUBLE_t y is a buffer that can store values for n_outputs target variables :param sample_weight: array-like, dtype=DOUBLE_t @@ -168,7 +166,7 @@ cdef class SimpleRegressorCriterion(CommonRegressorCriterion): w += self.sample_w[k] weight[0] = w mean[0] = 0. if w == 0. else m / w - + cdef double _mse(self, SIZE_t start, SIZE_t end, DOUBLE_t mean, DOUBLE_t weight) nogil: """ Computes mean square error between *start* and *end* diff --git a/mlinsights/mlmodel/piecewise_tree_regression_criterion_fast.pyx b/mlinsights/mlmodel/piecewise_tree_regression_criterion_fast.pyx index 11053687..5c7965ec 100644 --- a/mlinsights/mlmodel/piecewise_tree_regression_criterion_fast.pyx +++ b/mlinsights/mlmodel/piecewise_tree_regression_criterion_fast.pyx @@ -45,27 +45,26 @@ cdef class SimpleRegressorCriterionFast(CommonRegressorCriterion): def __setstate__(self, d): pass - def __cinit__(self, const DOUBLE_t[:, ::1] X): - self.sample_X = X + def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): + self.n_outputs = n_outputs + self.n_samples = n_samples + # Allocate memory for the accumulators self.sample_w_left = NULL self.sample_wy_left = NULL self.sample_wy2_left = NULL - + # Criterion interface self.sample_weight = NULL self.samples = NULL - self.sum_total = NULL - self.sum_left = NULL - self.sum_right = NULL - + # allocations if self.sample_w_left == NULL: - self.sample_w_left = calloc(X.shape[0], sizeof(DOUBLE_t)) + self.sample_w_left = calloc(n_samples, sizeof(DOUBLE_t)) if self.sample_wy_left == NULL: - self.sample_wy_left = calloc(X.shape[0], sizeof(DOUBLE_t)) + self.sample_wy_left = calloc(n_samples, sizeof(DOUBLE_t)) if self.sample_wy2_left == NULL: - self.sample_wy2_left = calloc(X.shape[0], sizeof(DOUBLE_t)) + self.sample_wy2_left = calloc(n_samples, sizeof(DOUBLE_t)) cdef int init(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight, @@ -75,14 +74,14 @@ cdef class SimpleRegressorCriterionFast(CommonRegressorCriterion): This function is overwritten to check *y* and *X* size are the same. This API has changed in 0.21. """ - if y.shape[0] != self.sample_X.shape[0]: - raise ValueError("X.shape={} -- y.shape={}".format(self.sample_X.shape, y.shape)) + if y.shape[0] != self.n_samples: + raise ValueError("n_samples={} -- y.shape={}".format(self.n_samples, y.shape)) if y.shape[1] != 1: raise ValueError("This class only works for a single vector.") - return self.init_with_X(self.sample_X, y, sample_weight, weighted_n_samples, + return self.init_with_X(y, sample_weight, weighted_n_samples, samples, start, end) - cdef int init_with_X(self, const DOUBLE_t[:, ::1] X, + cdef int init_with_X(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight, double weighted_n_samples, SIZE_t* samples, @@ -92,7 +91,6 @@ cdef class SimpleRegressorCriterionFast(CommonRegressorCriterion): Returns -1 in case of failure to allocate memory (and raise *MemoryError*) or 0 otherwise. 
- :param X: array-like, features, dtype=DOUBLE_t :param y: array-like, dtype=DOUBLE_t y is a buffer that can store values for n_outputs target variables :param sample_weight: array-like, dtype=DOUBLE_t @@ -117,7 +115,7 @@ cdef class SimpleRegressorCriterionFast(CommonRegressorCriterion): self.y = y # we need to do that in case start > 0 or end < X.shape[0] - for i in range(0, X.shape[0]): + for i in range(0, self.n_samples): self.sample_w_left[i] = 0 self.sample_wy_left[i] = 0 self.sample_wy2_left[i] = 0 diff --git a/mlinsights/mlmodel/piecewise_tree_regression_criterion_linear.pyx b/mlinsights/mlmodel/piecewise_tree_regression_criterion_linear.pyx index e2153a08..245fcb07 100644 --- a/mlinsights/mlmodel/piecewise_tree_regression_criterion_linear.pyx +++ b/mlinsights/mlmodel/piecewise_tree_regression_criterion_linear.pyx @@ -27,6 +27,8 @@ cdef class LinearRegressorCriterion(CommonRegressorCriterion): ` and is even slow as the criterion is more complex to compute. """ + cdef SIZE_t n_features + cdef const DOUBLE_t[:, ::1] sample_X cdef DOUBLE_t* sample_w cdef DOUBLE_t* sample_y cdef DOUBLE_t* sample_wy @@ -36,7 +38,7 @@ cdef class LinearRegressorCriterion(CommonRegressorCriterion): cdef DOUBLE_t* sample_work cdef SIZE_t* sample_i cdef DOUBLE_t* sample_f_buffer - + cdef DOUBLE_t sample_sum_wy cdef DOUBLE_t sample_sum_w cdef SIZE_t nbvar @@ -70,8 +72,12 @@ cdef class LinearRegressorCriterion(CommonRegressorCriterion): def __setstate__(self, d): pass - def __cinit__(self, const DOUBLE_t[:, ::1] X): + def __cinit__(self, SIZE_t n_outputs, const DOUBLE_t[:, ::1] X): + self.n_outputs = n_outputs self.sample_X = X + self.n_samples = X.shape[0] + self.n_features = X.shape[1] + # Allocate memory for the accumulators self.sample_w = NULL self.sample_y = NULL @@ -82,33 +88,30 @@ cdef class LinearRegressorCriterion(CommonRegressorCriterion): self.sample_pC = NULL self.sample_pS = NULL self.sample_work = NULL - + # Criterion interface self.sample_weight = NULL self.samples = NULL - self.sum_total = NULL - self.sum_left = NULL - self.sum_right = NULL # allocation if self.sample_w == NULL: - self.sample_w = calloc(X.shape[0], sizeof(DOUBLE_t)) + self.sample_w = calloc(self.n_samples, sizeof(DOUBLE_t)) if self.sample_wy == NULL: - self.sample_wy = calloc(X.shape[0], sizeof(DOUBLE_t)) + self.sample_wy = calloc(self.n_samples, sizeof(DOUBLE_t)) if self.sample_y == NULL: - self.sample_y = calloc(X.shape[0], sizeof(DOUBLE_t)) + self.sample_y = calloc(self.n_samples, sizeof(DOUBLE_t)) if self.sample_i == NULL: - self.sample_i = calloc(X.shape[0], sizeof(SIZE_t)) + self.sample_i = calloc(self.n_samples, sizeof(SIZE_t)) if self.sample_f == NULL: - self.sample_f = calloc(X.shape[0] * (X.shape[1] + 1), sizeof(DOUBLE_t)) - - self.nbvar = X.shape[1] + 1 - self.nbrows = X.shape[0] + self.sample_f = calloc(self.n_samples * (self.n_features + 1), sizeof(DOUBLE_t)) + + self.nbvar = self.n_features + 1 + self.nbrows = self.n_samples self.work = (min(self.nbrows, self.nbvar) * 3 + max(max(self.nbrows, self.nbvar), min(self.nbrows, self.nbvar) * 2)) if self.sample_f_buffer == NULL: - self.sample_f_buffer = calloc(X.shape[0] * self.nbvar, sizeof(DOUBLE_t)) + self.sample_f_buffer = calloc(self.n_samples * self.nbvar, sizeof(DOUBLE_t)) if self.sample_pC == NULL: self.sample_pC = calloc(max(self.nbrows, self.nbvar), sizeof(DOUBLE_t)) if self.sample_work == NULL: @@ -116,6 +119,14 @@ cdef class LinearRegressorCriterion(CommonRegressorCriterion): if self.sample_pS == NULL: self.sample_pS = calloc(self.nbvar, sizeof(DOUBLE_t)) + 
def __deepcopy__(self, memo=None):
+        """
+        This does not make a copy but mostly creates a new instance
+        of the same criterion initialized with the same data.
+        """
+        inst = self.__class__(self.n_outputs, self.sample_X)
+        return inst
+
     @staticmethod
     def create(DOUBLE_t[:, ::1] X, DOUBLE_t[:, ::1] y, DOUBLE_t[::1] sample_weight=None):
         """
@@ -139,7 +150,7 @@ cdef class LinearRegressorCriterion(CommonRegressorCriterion):
             sum = sample_weight.sum()
             ws = &sample_weight[0]
 
-        obj = LinearRegressorCriterion(X)
+        obj = LinearRegressorCriterion(1 if len(y.shape) <= 1 else y.shape[1], X)
         obj.init(y, ws, sum, parr, 0, y.shape[0])
         free(parr)
         return obj
@@ -152,6 +163,8 @@ cdef class LinearRegressorCriterion(CommonRegressorCriterion):
         This function is overwritten to check *y* and *X* size are the same.
         This API changed in 0.21.
         """
+        if y.shape[0] != self.n_samples:
+            raise ValueError("n_samples={} -- y.shape={}".format(self.n_samples, y.shape))
         if y.shape[0] != self.sample_X.shape[0]:
             raise ValueError("X.shape={} -- y.shape={}".format(self.sample_X.shape, y.shape))
         if y.shape[1] != 1:
@@ -231,7 +244,7 @@ cdef class LinearRegressorCriterion(CommonRegressorCriterion):
             w += self.sample_w[k]
         weight[0] = w
         mean[0] = 0. if w == 0. else m / w
-    
+
     cdef void _reglin(self, SIZE_t start, SIZE_t end, int low_rank) nogil:
         """
         Solves the linear regression between *start* and *end*
@@ -250,7 +263,7 @@ cdef class LinearRegressorCriterion(CommonRegressorCriterion):
             sample_f_buffer[pos] = self.sample_f[idx] * w
             idx += self.nbvar
             pos += 1
-    
+
         cdef DOUBLE_t* pC = self.sample_pC
         for i in range(start, end):
             pC[i-start] = self.sample_wy[i]
@@ -264,7 +277,7 @@ cdef class LinearRegressorCriterion(CommonRegressorCriterion):
         cdef DOUBLE_t rcond = -1
         cdef int rank
         cdef int work = self.work
-    
+
         if row < col:
             if low_rank:
                 ldb = col
@@ -274,7 +287,7 @@ cdef class LinearRegressorCriterion(CommonRegressorCriterion):
                 sample_f_buffer, &lda,
                 pC, &ldb,                       # 4-7
                 self.sample_pS, &rcond, &rank,  # 8-10
                 self.sample_work, &work, &info)  # 11-13
-    
+
     cdef double _mse(self, SIZE_t start, SIZE_t end, DOUBLE_t mean, DOUBLE_t weight) nogil:
         """
         Computes mean square error between *start* and *end*
@@ -284,12 +297,12 @@ cdef class LinearRegressorCriterion(CommonRegressorCriterion):
         if end - start <= self.nbvar:
             # More coefficients than the number of observations.
             return 0.
-    
+
         self._reglin(start, end, 0)
-    
+
         cdef double* pC = self.sample_pC
         cdef SIZE_t j, idx
-    
+
         # replaces what follows by gemm
         cdef DOUBLE_t squ = 0.
         cdef DOUBLE_t d
@@ -303,7 +316,7 @@ cdef class LinearRegressorCriterion(CommonRegressorCriterion):
             d -= self.sample_y[k]
             squ += d * d * self.sample_w[k]
         return 0. if weight == 0. else squ / weight
-    
+
     cdef void _node_beta(self, double* dest) nogil:
         """
         Stores the results of the linear regression
@@ -318,7 +331,7 @@ cdef class LinearRegressorCriterion(CommonRegressorCriterion):
         """
         Stores the results of the linear regression
         in an allocated numpy array.
- + :param dest: allocated array """ if dest.shape[0] < self.nbvar: diff --git a/mlinsights/mlmodel/sklearn_transform_inv_fct.py b/mlinsights/mlmodel/sklearn_transform_inv_fct.py index 596eff93..b83687fe 100644 --- a/mlinsights/mlmodel/sklearn_transform_inv_fct.py +++ b/mlinsights/mlmodel/sklearn_transform_inv_fct.py @@ -137,7 +137,8 @@ def fit(self, X=None, y=None, sample_weight=None): self.random_state) # pylint: disable=E1101 lin = rs.permutation(lin) - for u in perm: + perm_keys = list(perm.keys()) + for u in perm_keys: perm[u] = lin[perm[u]] self.permutation_ = perm diff --git a/mlinsights/mltree/tree_structure.py b/mlinsights/mltree/tree_structure.py index 1f96517e..5909e1bb 100644 --- a/mlinsights/mltree/tree_structure.py +++ b/mlinsights/mltree/tree_structure.py @@ -225,7 +225,8 @@ def tree_leave_neighbors(model): if fe not in features: features[fe] = [] features[fe].append(th) - for fe in features: + features_keys = features.keys() + for fe in features_keys: features[fe] = list(sorted(set(features[fe]))) for fe, v in features.items(): if len(v) == 1: diff --git a/mlinsights/search_rank/search_engine_vectors.py b/mlinsights/search_rank/search_engine_vectors.py index f3ebba3d..eeec1207 100644 --- a/mlinsights/search_rank/search_engine_vectors.py +++ b/mlinsights/search_rank/search_engine_vectors.py @@ -228,7 +228,7 @@ def to_zip(self, zipfilename, **kwargs): else: zf = zipfilename close = False - if 'index' is not kwargs: + if 'index' not in kwargs: kwargs['index'] = False to_zip(self.features_, zf, 'SearchEngineVectors-features.npy') to_zip(self.metadata_, zf, 'SearchEngineVectors-metadata.csv', **kwargs) diff --git a/mlinsights/timeseries/plotting.py b/mlinsights/timeseries/plotting.py index 495a99a6..0fba6a08 100644 --- a/mlinsights/timeseries/plotting.py +++ b/mlinsights/timeseries/plotting.py @@ -4,8 +4,6 @@ """ import calendar import datetime -import matplotlib.pyplot as plt -import matplotlib.patches as patches # pylint: disable=R0402 def plot_week_timeseries(time, value, normalise=True, @@ -15,16 +13,16 @@ def plot_week_timeseries(time, value, normalise=True, """ Shows a timeseries dispatched by days as bars. - @param time dates - @param value values to display as bars. - @param normalise normalise data before showing it - @param label label of the series - @param values2 second series to show as a line - @param label2 label of the second series - @param daynames names to use for week day names (default is English) - @param xfmt format number of the X axis - @param ax existing axis - @return axis + :param time: dates + :param value: values to display as bars. + :param normalise: normalise data before showing it + :param label: label of the series + :param values2: second series to show as a line + :param label2: label of the second series + :param daynames: names to use for week day names (default is English) + :param xfmt: format number of the X axis + :param ax: existing axis + :return: axis .. plot:: @@ -62,8 +60,11 @@ def coor(ti): input_maxy = 1. 
if ax is None:
+        import matplotlib.pyplot as plt  # pylint: disable=C0415
         ax = plt.gca()
 
+    import matplotlib.patches as patches  # pylint: disable=R0402,C0415
+
     # bars
     delta = None
     maxx, maxy = None, None
diff --git a/requirements-win.txt b/requirements-win.txt
index c3636314..5549c546 100644
--- a/requirements-win.txt
+++ b/requirements-win.txt
@@ -13,5 +13,5 @@ psutil
 pylint
 pymyinstall
 pyshp
-scikit-learn
+scikit-learn>=1.0
 threadpoolctl
diff --git a/requirements.txt b/requirements.txt
index 4f9ac683..9ed4db24 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,7 +19,7 @@ pydata-sphinx-theme
 pyquickhelper>=1.10
 pyquicksetup
 pylint>=2.9.3
-scikit-learn>=0.22.1
+scikit-learn>=1.0
 scipy
 seaborn
 skl2onnx
diff --git a/setup.py b/setup.py
index 98e0eb3d..901ccb4e 100644
--- a/setup.py
+++ b/setup.py
@@ -59,8 +59,8 @@ def get_extensions():
         extensions.append(("_piecewise_tree_regression_common",
                            "_piecewise_tree_regression_common024"))
     else:
-        extensions.append(("_piecewise_tree_regression_common",
-                           "_piecewise_tree_regression_common023"))
+        raise ImportError("Cannot build mlinsights for scikit-learn<1.0.")
+
     extensions.extend([
         "piecewise_tree_regression_criterion",
         "piecewise_tree_regression_criterion_linear",
@@ -122,7 +122,7 @@
     package_dir=package_dir,
     package_data=package_data,
     setup_requires=["pyquicksetup", 'cython', 'scipy', 'scikit-learn'],
-    install_requires=['cython', 'scikit-learn>=0.22.1', 'pandas', 'scipy',
+    install_requires=['cython', 'scikit-learn>=1.0', 'pandas', 'scipy',
                       'matplotlib', 'pandas_streaming', 'numpy>=1.16'],
     ext_modules=ext_modules,  # cythonize(ext_modules),
)
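
Note on the new criterion signatures: the custom criteria no longer receive X at construction time. SimpleRegressorCriterion and SimpleRegressorCriterionFast now take (n_outputs, n_samples), while LinearRegressorCriterion still keeps the features and takes (n_outputs, X). A minimal usage sketch in Python, assuming the package is built against scikit-learn>=1.0 as required above:

import numpy
from sklearn.tree import DecisionTreeRegressor
from mlinsights.mlmodel.piecewise_tree_regression_criterion import (
    SimpleRegressorCriterion)
from mlinsights.mlmodel.piecewise_tree_regression_criterion_linear import (
    LinearRegressorCriterion)

X = numpy.array([[1., 2., 10., 11.]]).T
y = numpy.array([0.9, 1.1, 1.9, 2.1])

# n_outputs comes first, then the number of samples
# (or X itself for the linear criterion, which fits a
# regression inside each leaf and therefore needs the features).
n_outputs = 1 if len(y.shape) <= 1 else y.shape[1]
crit = SimpleRegressorCriterion(n_outputs, X.shape[0])
tree = DecisionTreeRegressor(criterion=crit, max_depth=1)
tree.fit(X, y)

crit_lin = LinearRegressorCriterion(n_outputs, X)
tree_lin = DecisionTreeRegressor(criterion=crit_lin, max_depth=1)
tree_lin.fit(X, y)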
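The norm parameter of KMeansL1L2 is now spelled 'L1'/'L2' and, on scikit-learn>=1.1, validated through _parameter_constraints; the lowercase spellings accepted before (the constructor used to apply norm.lower()) would now fail at fit time. A short sketch mirroring the updated unit test, with C-contiguous input:

import numpy
from mlinsights.mlmodel.kmeans_l1 import KMeansL1L2

X = numpy.ascontiguousarray(
    numpy.array([[-10, 1, 2, 3, 4, 10],
                 [-10, 1, 2, 3, 4, 10]], dtype=float).T)
clr = KMeansL1L2(2, norm='L1')  # 'l1' is no longer normalized to 'L1'
clr.fit(X)
print(set(clr.predict(X)))  # two clusters, e.g. {0, 1}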
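Finally, piecewise_tree_regression.py drops the X_idx_sorted argument, which recent scikit-learn versions removed from DecisionTreeRegressor.fit. Assuming the estimator defined in that module is PiecewiseTreeRegressor (the class name is outside this diff), the updated call is simply:

import numpy
from mlinsights.mlmodel.piecewise_tree_regression import PiecewiseTreeRegressor

X = numpy.array([[1., 2., 10., 11.]]).T
y = numpy.array([0.9, 1.1, 1.9, 2.1])

# fit() swaps the criterion string for a compiled criterion
# instance, trains, then restores the string.
model = PiecewiseTreeRegressor(criterion='mselin')
model.fit(X, y)  # no X_idx_sorted keyword anymore
print(model.predict(X))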