
Commit fba7cbe

Some fixes before sklearn 1.0 (#641)
* tsne fix
* fix logreg
* fixes
* fix fixes :-)
* wrong argument
1 parent 467b1e8 commit fba7cbe


6 files changed, +54 -18 lines changed


daal4py/sklearn/cluster/_k_means_0_23.py

Lines changed: 9 additions & 7 deletions
@@ -15,6 +15,7 @@
 #===============================================================================
 
 import numpy as np
+import numbers
 from scipy import sparse as sp
 
 from sklearn.utils import (check_random_state, check_array)
@@ -284,14 +285,15 @@ def _fit(self, X, y=None, sample_weight=None):
         raise ValueError("Algorithm must be 'auto', 'full' or 'elkan', got"
                          " {}".format(str(algorithm)))
 
-    daal_ready = True
-    if daal_ready:
-        X_len = _num_samples(X)
-        daal_ready = (self.n_clusters <= X_len)
-    if daal_ready and sample_weight is not None:
+    X_len = _num_samples(X)
+    daal_ready = self.n_clusters <= X_len
+    if daal_ready and sample_weight is not None:
+        if isinstance(sample_weight, numbers.Number):
+            sample_weight = np.full(X_len, sample_weight, dtype=np.float64)
+        else:
             sample_weight = np.asarray(sample_weight)
-        daal_ready = (sample_weight.shape == (X_len,)) and (
-            np.allclose(sample_weight, np.ones_like(sample_weight)))
+        daal_ready = (sample_weight.shape == (X_len,)) and (
+            np.allclose(sample_weight, np.ones_like(sample_weight)))
 
     if daal_ready:
         logging.info(
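The net effect of the k-means change: the redundant daal_ready = True preamble is gone, and a scalar sample_weight is now broadcast to a per-sample array before the uniformity check that gates the oneDAL path. Below is a minimal standalone sketch of the new gating logic; the helper name is hypothetical and not part of daal4py.

import numbers
import numpy as np

def _daal_ready_for_kmeans(n_clusters, X_len, sample_weight):
    # Mirrors the updated gate: the accelerated path is only considered when
    # the number of clusters does not exceed the number of samples and every
    # sample weight is (close to) 1.
    daal_ready = n_clusters <= X_len
    if daal_ready and sample_weight is not None:
        if isinstance(sample_weight, numbers.Number):
            # A scalar weight is broadcast to one weight per sample before
            # the uniformity check.
            sample_weight = np.full(X_len, sample_weight, dtype=np.float64)
        else:
            sample_weight = np.asarray(sample_weight)
        daal_ready = (sample_weight.shape == (X_len,)) and \
            np.allclose(sample_weight, np.ones_like(sample_weight))
    return daal_ready

With this logic, sample_weight=1.0 keeps daal_ready True, while sample_weight=2.0 or a non-uniform weight array turns it off.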

daal4py/sklearn/ensemble/_forest.py

Lines changed: 1 addition & 1 deletion
@@ -775,7 +775,7 @@ class RandomForestRegressor(RandomForestRegressor_original):
 
     def __init__(self,
                  n_estimators=100, *,
-                 criterion="mse",
+                 criterion="squared_error" if sklearn_check_version('1.0') else "mse",
                  max_depth=None,
                  min_samples_split=2,
                  min_samples_leaf=1,
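scikit-learn 1.0 renamed the regression criterion "mse" to "squared_error" and deprecated the old spelling, so the patched RandomForestRegressor now picks its default per installed version via daal4py's sklearn_check_version. A hedged sketch of the same version-gated default, using a stand-in comparison helper (an assumption, not daal4py's implementation):

import sklearn
from packaging.version import Version

def _sklearn_at_least(minimum):
    # Stand-in for daal4py's sklearn_check_version: compares the installed
    # scikit-learn version against a minimum release string.
    return Version(sklearn.__version__) >= Version(minimum)

# Use the criterion name the installed scikit-learn expects as the default.
default_criterion = "squared_error" if _sklearn_at_least("1.0") else "mse"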

daal4py/sklearn/linear_model/_linear_0_24.py

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ def __init__(self, fit_intercept=True, normalize=False, copy_X=True,
     def fit(self, X, y, sample_weight=None):
         if sklearn_check_version('1.0'):
             from sklearn.linear_model._base import _deprecate_normalize
-            self.normalize = _deprecate_normalize(
+            self._normalize = _deprecate_normalize(
                 self.normalize, default=False,
                 estimator_name=self.__class__.__name__
             )

daal4py/sklearn/linear_model/_ridge_0_22.py

Lines changed: 1 addition & 1 deletion
@@ -102,7 +102,7 @@ def _fit_ridge(self, X, y, sample_weight=None):
     """
     if sklearn_check_version('1.0'):
         from sklearn.linear_model._base import _deprecate_normalize
-        self.normalize = _deprecate_normalize(
+        self._normalize = _deprecate_normalize(
             self.normalize, default=False,
             estimator_name=self.__class__.__name__
         )
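Both linear_model fixes are the same one-line change: the value returned by scikit-learn's _deprecate_normalize helper is stored in a private self._normalize attribute instead of being written back over the public self.normalize parameter, matching the private attribute upstream scikit-learn 1.0 uses and leaving constructor arguments untouched. A small illustration of the pattern, assuming scikit-learn 1.0/1.1 where _deprecate_normalize is available; the class here is a sketch, not the daal4py estimator.

from sklearn.linear_model._base import _deprecate_normalize  # sklearn 1.0/1.1

class _NormalizeSketch:
    def __init__(self, fit_intercept=True, normalize=False, copy_X=True):
        # Constructor arguments are stored as given, per scikit-learn convention.
        self.fit_intercept = fit_intercept
        self.normalize = normalize
        self.copy_X = copy_X

    def fit(self, X, y, sample_weight=None):
        # Resolve the deprecated flag into a private attribute; writing the
        # result back to self.normalize (the old behaviour) would overwrite a
        # public constructor parameter from inside fit().
        self._normalize = _deprecate_normalize(
            self.normalize, default=False,
            estimator_name=self.__class__.__name__
        )
        # ... the rest of fit() would read self._normalize ...
        return self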

daal4py/sklearn/linear_model/logistic_path.py

Lines changed: 8 additions & 3 deletions
@@ -54,7 +54,7 @@
 from sklearn.linear_model._base import (LinearClassifierMixin, SparseCoefMixin,
                                         BaseEstimator)
 from .._utils import (daal_check_version, getFPType,
-                      get_patch_message)
+                      get_patch_message, sklearn_check_version)
 import logging
 
 
@@ -243,8 +243,13 @@ def __logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
     daal_ready = daal_ready and sample_weight is None and class_weight is None
 
     if not daal_ready:
-        sample_weight = _check_sample_weight(sample_weight, X,
-                                             dtype=X.dtype)
+        if sklearn_check_version('0.24'):
+            sample_weight = _check_sample_weight(sample_weight, X,
+                                                 dtype=X.dtype,
+                                                 copy=True)
+        else:
+            sample_weight = _check_sample_weight(sample_weight, X,
+                                                 dtype=X.dtype)
     # If class_weights is a dict (provided by the user), the weights
     # are assigned to the original labels. If it is "balanced", then
     # the class_weights are assigned after masking the labels with a OvR.
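In the logistic regression path, the non-DAAL fallback now passes copy=True to _check_sample_weight on scikit-learn >= 0.24 (the release where, as the version gate implies, that keyword is accepted), so the validated weights are a fresh array rather than the caller's own; presumably this protects user-supplied weights from later in-place scaling. A minimal hedged sketch of the gated call, with a hypothetical wrapper name and version flag:

from sklearn.utils.validation import _check_sample_weight

def _validated_sample_weight(sample_weight, X, sklearn_at_least_0_24):
    # Pass copy=True only where _check_sample_weight accepts it; older
    # releases fall back to the dtype-only form.
    if sklearn_at_least_0_24:
        return _check_sample_weight(sample_weight, X, dtype=X.dtype, copy=True)
    return _check_sample_weight(sample_weight, X, dtype=X.dtype)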

daal4py/sklearn/manifold/_t_sne.py

Lines changed: 34 additions & 5 deletions
@@ -40,10 +40,35 @@ class TSNE(BaseTSNE):
     def _fit(self, X, skip_num_points=0):
         """Private function to fit the model using X as training data."""
 
+        if isinstance(self.init, str) and self.init == 'warn':
+            warnings.warn("The default initialization in TSNE will change "
+                          "from 'random' to 'pca' in 1.2.", FutureWarning)
+            self._init = 'random'
+        else:
+            self._init = self.init
+
+        if isinstance(self._init, str) and self._init == 'pca' and issparse(X):
+            raise TypeError("PCA initialization is currently not suported "
+                            "with the sparse input matrix. Use "
+                            "init=\"random\" instead.")
+
         if self.method not in ['barnes_hut', 'exact']:
             raise ValueError("'method' must be 'barnes_hut' or 'exact'")
         if self.angle < 0.0 or self.angle > 1.0:
             raise ValueError("'angle' must be between 0.0 - 1.0")
+        if self.learning_rate == 'warn':
+            warnings.warn("The default learning rate in TSNE will change "
+                          "from 200.0 to 'auto' in 1.2.", FutureWarning)
+            self._learning_rate = 200.0
+        else:
+            self._learning_rate = self.learning_rate
+        if self._learning_rate == 'auto':
+            self._learning_rate = X.shape[0] / self.early_exaggeration / 4
+            self._learning_rate = np.maximum(self._learning_rate, 50)
+        else:
+            if not (self._learning_rate > 0):
+                raise ValueError("'learning_rate' must be a positive number "
+                                 "or 'auto'.")
 
         if hasattr(self, 'square_distances'):
             if self.square_distances not in [True, 'legacy']:
@@ -74,7 +99,7 @@ def _fit(self, X, skip_num_points=0):
         X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
                         dtype=[np.float32, np.float64])
         if self.metric == "precomputed":
-            if isinstance(self.init, str) and self.init == 'pca':
+            if isinstance(self._init, str) and self._init == 'pca':
                 raise ValueError("The parameter init=\"pca\" cannot be "
                                  "used with metric=\"precomputed\".")
             if X.shape[0] != X.shape[1]:
@@ -187,13 +212,17 @@ def _fit(self, X, skip_num_points=0):
             P = _joint_probabilities_nn(distances_nn, self.perplexity,
                                         self.verbose)
 
-        if isinstance(self.init, np.ndarray):
-            X_embedded = self.init
-        elif self.init == 'pca':
+        if isinstance(self._init, np.ndarray):
+            X_embedded = self._init
+        elif self._init == 'pca':
             pca = PCA(n_components=self.n_components, svd_solver='randomized',
                       random_state=random_state)
             X_embedded = pca.fit_transform(X).astype(np.float32, copy=False)
-        elif self.init == 'random':
+            warnings.warn("The PCA initialization in TSNE will change to "
+                          "have the standard deviation of PC1 equal to 1e-4 "
+                          "in 1.2. This will ensure better convergence.",
+                          FutureWarning)
+        elif self._init == 'random':
             # The embedding is initialized with iid samples from Gaussians with
             # standard deviation 1e-4.
             X_embedded = 1e-4 * random_state.randn(
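The TSNE patch mirrors upstream scikit-learn's deprecation handling for the new 'warn' sentinels: the old defaults (init='random', learning_rate=200.0) stay in effect with a FutureWarning about the 1.2 change, and learning_rate='auto' is resolved from the data size as max(n_samples / early_exaggeration / 4, 50). A small sketch of just that resolution step, with a hypothetical helper name:

def _resolve_tsne_learning_rate(learning_rate, n_samples, early_exaggeration=12.0):
    # 'auto' scales the learning rate with the data size, floored at 50;
    # anything else must be a positive number, as in the diff above.
    if learning_rate == 'auto':
        return max(n_samples / early_exaggeration / 4, 50)
    if not learning_rate > 0:
        raise ValueError("'learning_rate' must be a positive number or 'auto'.")
    return learning_rate

For example, 10,000 samples with the default early_exaggeration of 12.0 gives 10000 / 12 / 4 ≈ 208.3, well above the floor of 50.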
