Skip to content

Commit 2694e6d

Browse files
Alexsandrussnapetrov
authored and committed
Update algorithm wrappers to latest Scikit-learn changes (#1032)
* Update algorithm wrappers to latest sklearn changes
* PEP8 and lasso fixes
* Fix pca versioning
* Update KMeans versioning
* Rewrite kmeans.fit versions branching
* Fix TSNE perplexity to pass parameter check (perplexity < n_samples)
* Future warnings fix
* Restore auto kmeans algorithm for compatibility with sklearn<1.1
1 parent 44a0220 commit 2694e6d

File tree

17 files changed

+251
-177
lines changed

17 files changed

+251
-177
lines changed

daal4py/sklearn/cluster/_dbscan.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -233,7 +233,7 @@ def fit(self, X, y=None, sample_weight=None):
233233
Returns a fitted instance of self.
234234
"""
235235
if self.eps <= 0.0:
236-
raise ValueError("eps must be positive.")
236+
raise ValueError(f"eps == {self.eps}, must be > 0.0.")
237237

238238
if sklearn_check_version("1.0"):
239239
self._check_feature_names(X, reset=True)

daal4py/sklearn/cluster/_k_means_0_23.py

Lines changed: 80 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,10 @@
4141
PatchingConditionsChain)
4242
from .._device_offload import support_usm_ndarray
4343

44+
if sklearn_check_version('1.1'):
45+
from sklearn.utils.validation import (
46+
_check_sample_weight, _is_arraylike_not_scalar)
47+
4448

4549
def _validate_center_shape(X, n_centers, centers):
4650
"""Check if centers is compatible with X and n_centers"""
@@ -242,53 +246,82 @@ def _fit(self, X, y=None, sample_weight=None):
242246
are assigned equal weight (default: None)
243247
244248
"""
245-
if hasattr(self, 'precompute_distances'):
246-
if self.precompute_distances != 'deprecated':
247-
if sklearn_check_version('0.24'):
248-
warnings.warn("'precompute_distances' was deprecated in version "
249-
"0.23 and will be removed in 1.0 (renaming of 0.25)."
250-
" It has no effect", FutureWarning)
251-
elif sklearn_check_version('0.23'):
252-
warnings.warn("'precompute_distances' was deprecated in version "
253-
"0.23 and will be removed in 0.25. It has no "
254-
"effect", FutureWarning)
255-
256-
self._n_threads = None
257-
if hasattr(self, 'n_jobs'):
258-
if self.n_jobs != 'deprecated':
259-
if sklearn_check_version('0.24'):
260-
warnings.warn("'n_jobs' was deprecated in version 0.23 and will be"
261-
" removed in 1.0 (renaming of 0.25).", FutureWarning)
262-
elif sklearn_check_version('0.23'):
263-
warnings.warn("'n_jobs' was deprecated in version 0.23 and will be"
264-
" removed in 0.25.", FutureWarning)
265-
self._n_threads = self.n_jobs
266-
self._n_threads = _openmp_effective_n_threads(self._n_threads)
267-
268-
if self.n_init <= 0:
269-
raise ValueError(
270-
f"n_init should be > 0, got {self.n_init} instead.")
271-
272-
random_state = check_random_state(self.random_state)
273-
if sklearn_check_version("1.0"):
274-
self._check_feature_names(X, reset=True)
275-
276-
if self.max_iter <= 0:
277-
raise ValueError(
278-
f"max_iter should be > 0, got {self.max_iter} instead.")
249+
init = self.init
250+
if sklearn_check_version('1.1'):
251+
if sklearn_check_version('1.2'):
252+
self._validate_params()
253+
254+
X = self._validate_data(
255+
X,
256+
accept_sparse="csr",
257+
dtype=[np.float64, np.float32],
258+
order="C",
259+
copy=self.copy_x,
260+
accept_large_sparse=False,
261+
)
279262

280-
algorithm = self.algorithm
281-
if algorithm == "elkan" and self.n_clusters == 1:
282-
warnings.warn("algorithm='elkan' doesn't make sense for a single "
283-
"cluster. Using 'full' instead.", RuntimeWarning)
284-
algorithm = "full"
263+
if sklearn_check_version('1.2'):
264+
self._check_params_vs_input(X)
265+
else:
266+
self._check_params(X)
285267

286-
if algorithm == "auto":
287-
algorithm = "full" if self.n_clusters == 1 else "elkan"
268+
random_state = check_random_state(self.random_state)
269+
sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
270+
self._n_threads = _openmp_effective_n_threads()
288271

289-
if algorithm not in ["full", "elkan"]:
290-
raise ValueError("Algorithm must be 'auto', 'full' or 'elkan', got"
291-
" {}".format(str(algorithm)))
272+
# Validate init array
273+
init_is_array_like = _is_arraylike_not_scalar(init)
274+
if init_is_array_like:
275+
init = check_array(init, dtype=X.dtype, copy=True, order="C")
276+
self._validate_center_shape(X, init)
277+
else:
278+
if hasattr(self, 'precompute_distances'):
279+
if self.precompute_distances != 'deprecated':
280+
if sklearn_check_version('0.24'):
281+
warnings.warn("'precompute_distances' was deprecated in version "
282+
"0.23 and will be removed in 1.0 (renaming of 0.25)."
283+
" It has no effect", FutureWarning)
284+
elif sklearn_check_version('0.23'):
285+
warnings.warn("'precompute_distances' was deprecated in version "
286+
"0.23 and will be removed in 0.25. It has no "
287+
"effect", FutureWarning)
288+
289+
self._n_threads = None
290+
if hasattr(self, 'n_jobs'):
291+
if self.n_jobs != 'deprecated':
292+
if sklearn_check_version('0.24'):
293+
warnings.warn("'n_jobs' was deprecated in version 0.23 and will be"
294+
" removed in 1.0 (renaming of 0.25).", FutureWarning)
295+
elif sklearn_check_version('0.23'):
296+
warnings.warn("'n_jobs' was deprecated in version 0.23 and will be"
297+
" removed in 0.25.", FutureWarning)
298+
self._n_threads = self.n_jobs
299+
self._n_threads = _openmp_effective_n_threads(self._n_threads)
300+
301+
if self.n_init <= 0:
302+
raise ValueError(
303+
f"n_init should be > 0, got {self.n_init} instead.")
304+
305+
random_state = check_random_state(self.random_state)
306+
if sklearn_check_version("1.0"):
307+
self._check_feature_names(X, reset=True)
308+
309+
if self.max_iter <= 0:
310+
raise ValueError(
311+
f"max_iter should be > 0, got {self.max_iter} instead.")
312+
313+
algorithm = self.algorithm
314+
if algorithm == "elkan" and self.n_clusters == 1:
315+
warnings.warn("algorithm='elkan' doesn't make sense for a single "
316+
"cluster. Using 'full' instead.", RuntimeWarning)
317+
algorithm = "full"
318+
319+
if algorithm == "auto":
320+
algorithm = "full" if self.n_clusters == 1 else "elkan"
321+
322+
if algorithm not in ["full", "elkan"]:
323+
raise ValueError("Algorithm must be 'auto', 'full' or 'elkan', got"
324+
" {}".format(str(algorithm)))
292325

293326
X_len = _num_samples(X)
294327

@@ -317,8 +350,10 @@ def _fit(self, X, y=None, sample_weight=None):
317350
self.n_features_in_ = X.shape[1]
318351
self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \
319352
_daal4py_k_means_fit(
320-
X, self.n_clusters, self.max_iter, self.tol, self.init, self.n_init,
353+
X, self.n_clusters, self.max_iter, self.tol, init, self.n_init,
321354
self.verbose, random_state)
355+
if sklearn_check_version('1.1'):
356+
self._n_features_out = self.cluster_centers_.shape[0]
322357
else:
323358
super(KMeans, self).fit(X, y=y, sample_weight=sample_weight)
324359
return self

daal4py/sklearn/decomposition/_pca.py

Lines changed: 38 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -52,6 +52,8 @@ def __init__(
5252
svd_solver='auto',
5353
tol=0.0,
5454
iterated_power='auto',
55+
n_oversamples=10,
56+
power_iteration_normalizer="auto",
5557
random_state=None
5658
):
5759
self.n_components = n_components
@@ -60,6 +62,8 @@ def __init__(
6062
self.svd_solver = svd_solver
6163
self.tol = tol
6264
self.iterated_power = iterated_power
65+
self.n_oversamples = n_oversamples
66+
self.power_iteration_normalizer = power_iteration_normalizer
6367
self.random_state = random_state
6468

6569
def _validate_n_components(self, n_components, n_samples, n_features):
@@ -200,26 +204,37 @@ def _fit(self, X):
200204
shape_good_for_daal = X.shape[1] / X.shape[0] < 2
201205

202206
if self._fit_svd_solver == 'auto':
203-
if n_components == 'mle':
204-
self._fit_svd_solver = 'full'
205-
else:
206-
n, p, k = X.shape[0], X.shape[1], n_components
207-
# These coefficients are result of training of Logistic Regression
208-
# (max_iter=10000, solver="liblinear", fit_intercept=False)
209-
# on different datasets and number of components. X is a dataset with
210-
# npk, np^2, and n^2 columns. And y is speedup of patched scikit-learn's
211-
# full PCA against stock scikit-learn's randomized PCA.
212-
regression_coefs = np.array([
213-
[9.779873e-11, n * p * k],
214-
[-1.122062e-11, n * p * p],
215-
[1.127905e-09, n ** 2],
216-
])
217-
218-
if n_components >= 1 \
219-
and np.dot(regression_coefs[:, 0], regression_coefs[:, 1]) <= 0:
220-
self._fit_svd_solver = 'randomized'
207+
if sklearn_check_version('1.1'):
208+
# Small problem or n_components == 'mle', just call full PCA
209+
if max(X.shape) <= 500 or n_components == "mle":
210+
self._fit_svd_solver = "full"
211+
elif 1 <= n_components < 0.8 * min(X.shape):
212+
self._fit_svd_solver = "randomized"
213+
# This is also the case of n_components in (0,1)
221214
else:
215+
self._fit_svd_solver = "full"
216+
else:
217+
if n_components == 'mle':
222218
self._fit_svd_solver = 'full'
219+
else:
220+
n, p, k = X.shape[0], X.shape[1], n_components
221+
# These coefficients are result of training of Logistic Regression
222+
# (max_iter=10000, solver="liblinear", fit_intercept=False)
223+
# on different datasets and number of components.
224+
# X is a dataset with npk, np^2, and n^2 columns.
225+
# And y is speedup of patched scikit-learn's
226+
# full PCA against stock scikit-learn's randomized PCA.
227+
regression_coefs = np.array([
228+
[9.779873e-11, n * p * k],
229+
[-1.122062e-11, n * p * p],
230+
[1.127905e-09, n ** 2],
231+
])
232+
233+
if n_components >= 1 and np.dot(
234+
regression_coefs[:, 0], regression_coefs[:, 1]) <= 0:
235+
self._fit_svd_solver = 'randomized'
236+
else:
237+
self._fit_svd_solver = 'full'
223238

224239
if not shape_good_for_daal or self._fit_svd_solver != 'full':
225240
if sklearn_check_version('0.23'):
@@ -346,7 +361,11 @@ def fit_transform(self, X, y=None):
346361
This method returns a Fortran-ordered array. To convert it to a
347362
C-ordered array, use 'np.ascontiguousarray'.
348363
"""
349-
U, S, _ = self._fit(X)
364+
365+
if sklearn_check_version('1.2'):
366+
self._validate_params()
367+
368+
U, S, Vt = self._fit(X)
350369

351370
_patching_status = PatchingConditionsChain(
352371
"sklearn.decomposition.PCA.fit_transform")

0 commit comments

Comments (0)