diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 2689daa247..2fb6c8b1e4 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -83,6 +83,7 @@ test: # - dpnp # next deps are synced with requirements-test.txt - pytest + - pytest-mock - pandas - xgboost - lightgbm diff --git a/daal4py/sklearn/_utils.py b/daal4py/sklearn/_utils.py index 5c23ef1f84..e0c2b2b343 100644 --- a/daal4py/sklearn/_utils.py +++ b/daal4py/sklearn/_utils.py @@ -21,6 +21,7 @@ from typing import Any, Tuple import numpy as np +import scipy.sparse as sp from numpy.lib.recfunctions import require_fields from sklearn import __version__ as sklearn_version @@ -195,6 +196,10 @@ def convert_to_old_tree_nodes(tree_nodes): return convert_to_old_tree_nodes(tree_nodes)
+def is_sparse(x):
+    """Return whether ``x`` should be treated as sparse input.
+
+    The result is truthy when SciPy's ``issparse`` recognizes ``x``, or when
+    ``is_DataFrame(x)`` holds and ``x`` exposes a ``sparse`` attribute.
+    """
+    # NOTE(review): presumably this relies on pandas exposing the ``.sparse``
+    # accessor only when every column has a sparse dtype - confirm.
+    return sp.issparse(x) or (is_DataFrame(x) and hasattr(x, "sparse"))
+
+
 class PatchingConditionsChain: def __init__(self, scope_name): self.scope_name = scope_name diff --git a/daal4py/sklearn/linear_model/logistic_path.py b/daal4py/sklearn/linear_model/logistic_path.py index d9b5a16537..4c09829368 100755 --- a/daal4py/sklearn/linear_model/logistic_path.py +++ b/daal4py/sklearn/linear_model/logistic_path.py @@ -18,7 +18,6 @@ import numpy as np import scipy.optimize as optimize -import scipy.sparse as sparse import sklearn.linear_model._logistic as logistic_module from sklearn.linear_model._logistic import _LOGISTIC_SOLVER_CONVERGENCE_MSG from sklearn.linear_model._logistic import ( @@ -32,7 +31,7 @@ import daal4py as d4p from .._n_jobs_support import control_n_jobs -from .._utils import PatchingConditionsChain, getFPType, sklearn_check_version +from .._utils import PatchingConditionsChain, getFPType, is_sparse, sklearn_check_version from ..utils.validation import check_feature_names from .logistic_loss import ( _daal4py_cross_entropy_loss_extra_args, @@ -400,9 +399,9 @@ def daal4py_predict(self, X, resultsToEvaluate): _dal_ready = _patching_status.and_conditions( [ - (not sparse.issparse(X), "X
is sparse. Sparse input is not supported."), + (not is_sparse(X), "X is sparse. Sparse input is not supported."), ( - not sparse.issparse(self.coef_), + not is_sparse(self.coef_), "self.coef_ is sparse. Sparse coefficients are not supported.", ), (fptype is not None, "Unable to get dtype."), @@ -466,7 +465,7 @@ def logistic_regression_path(*args, **kwargs): f"'{kwargs['solver']}' solver is not supported. " "Only 'lbfgs' and 'newton-cg' solvers are supported.", ), - (not sparse.issparse(args[0]), "X is sparse. Sparse input is not supported."), + (not is_sparse(args[0]), "X is sparse. Sparse input is not supported."), (kwargs["sample_weight"] is None, "Sample weights are not supported."), (kwargs["class_weight"] is None, "Class weights are not supported."), ( diff --git a/daal4py/sklearn/manifold/_t_sne.py b/daal4py/sklearn/manifold/_t_sne.py index 88d964e1fa..376e06de24 100755 --- a/daal4py/sklearn/manifold/_t_sne.py +++ b/daal4py/sklearn/manifold/_t_sne.py @@ -20,7 +20,6 @@ from time import time import numpy as np -from scipy.sparse import issparse from sklearn.decomposition import PCA from sklearn.manifold import TSNE as BaseTSNE from sklearn.manifold._t_sne import _joint_probabilities, _joint_probabilities_nn @@ -32,6 +31,7 @@ from daal4py.sklearn._utils import ( PatchingConditionsChain, daal_check_version, + is_sparse, sklearn_check_version, ) @@ -149,7 +149,7 @@ def _fit(self, X, skip_num_points=0): ), ( not ( - isinstance(self.init, str) and self.init == "pca" and issparse(X) + isinstance(self.init, str) and self.init == "pca" and is_sparse(X) ), "PCA initialization is not supported with sparse input matrices.", ), @@ -273,7 +273,7 @@ def _fit(self, X, skip_num_points=0): "should contain positive distances.", ) - if self.method == "exact" and issparse(X): + if self.method == "exact" and is_sparse(X): raise TypeError( 'TSNE with method="exact" does not accept sparse ' 'precomputed distance matrix. 
Use method="barnes_hut" ' diff --git a/deselected_tests.yaml b/deselected_tests.yaml index 5b1f30ad93..9b4796505a 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -219,8 +219,8 @@ deselected_tests: - tests/test_common.py::test_estimators[NuSVC()-check_class_weight_classifiers] <1.0 - tests/test_multioutput.py::test_multi_output_classification - # Linear Regression - minor mismatches in error/warning messages - - linear_model/tests/test_base.py::test_linear_regression_pd_sparse_dataframe_warning + # Requires SciPy version which is not available in too old environments + - linear_model/tests/test_base.py::test_linear_regression_pd_sparse_dataframe_warning <1.1 # L1 Linear models with sklearn 1.1 + numpy > 1.25 - extra warnings from numpy lead to test fail - linear_model/tests/test_coordinate_descent.py::test_assure_warning_when_normalize[True-1-LassoCV] >=1.1,<1.2 diff --git a/onedal/svm/svm.py b/onedal/svm/svm.py index b8315320e0..68b97b98d9 100644 --- a/onedal/svm/svm.py +++ b/onedal/svm/svm.py @@ -20,6 +20,7 @@ import numpy as np from scipy import sparse as sp +from daal4py.sklearn._utils import is_sparse from onedal._device_offload import supports_queue from onedal.common._backend import bind_default_backend from onedal.utils import _sycl_queue_manager as QM @@ -155,7 +156,7 @@ def _fit(self, X, y, sample_weight): data = (X, y, sample_weight) else: data = (X, y) - self._sparse = sp.issparse(X) + self._sparse = is_sparse(X) if self.kernel == "linear": self._scale_, self._sigma_ = 1.0, 1.0 @@ -251,7 +252,7 @@ def _predict(self, X): if self._sparse: X.sort_indices() - if sp.issparse(X) and not self._sparse and not callable(self.kernel): + if is_sparse(X) and not self._sparse and not callable(self.kernel): raise ValueError( "cannot use sparse input in %r trained on dense data" % type(self).__name__ @@ -299,7 +300,7 @@ def _decision_function(self, X): if self._sparse: X.sort_indices() - if sp.issparse(X) and not self._sparse and not 
callable(self.kernel): + if is_sparse(X) and not self._sparse and not callable(self.kernel): raise ValueError( "cannot use sparse input in %r trained on dense data" % type(self).__name__ diff --git a/requirements-test.txt b/requirements-test.txt index 0cb95657bf..717ce98a77 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -2,6 +2,7 @@ pytest==7.4.4 ; python_version <= '3.10' pytest==9.0.2 ; python_version >= '3.11' pytest-json-report==1.5.0 pytest-cov==7.0.0 +pytest-mock==3.15.1 numpy>=1.19.5 ; python_version <= '3.9' numpy>=1.21.6 ; python_version == '3.10' numpy>=1.23.5 ; python_version == '3.11' diff --git a/sklearnex/basic_statistics/basic_statistics.py b/sklearnex/basic_statistics/basic_statistics.py index 9f646a32df..285834180f 100644 --- a/sklearnex/basic_statistics/basic_statistics.py +++ b/sklearnex/basic_statistics/basic_statistics.py @@ -16,11 +16,10 @@ import warnings -from scipy.sparse import issparse from sklearn.base import BaseEstimator from daal4py.sklearn._n_jobs_support import control_n_jobs -from daal4py.sklearn._utils import daal_check_version, sklearn_check_version +from daal4py.sklearn._utils import daal_check_version, is_sparse, sklearn_check_version from onedal.basic_statistics import BasicStatistics as onedal_BasicStatistics from onedal.utils.validation import _is_csr @@ -160,11 +159,11 @@ def _onedal_gpu_supported(self, method_name, *data): ) X, sample_weight = data - is_data_supported = not issparse(X) or ( + is_data_supported = not is_sparse(X) or ( _is_csr(X) and daal_check_version((2025, "P", 200)) ) - is_sample_weight_supported = sample_weight is None or not issparse(X) + is_sample_weight_supported = sample_weight is None or not is_sparse(X) patching_status.and_conditions( [ diff --git a/sklearnex/cluster/dbscan.py b/sklearnex/cluster/dbscan.py index 68a062d179..1ac380512d 100755 --- a/sklearnex/cluster/dbscan.py +++ b/sklearnex/cluster/dbscan.py @@ -14,11 +14,10 @@ # limitations under the License. 
# =============================================================================== -from scipy import sparse as sp from sklearn.cluster import DBSCAN as _sklearn_DBSCAN from daal4py.sklearn._n_jobs_support import control_n_jobs -from daal4py.sklearn._utils import sklearn_check_version +from daal4py.sklearn._utils import is_sparse, sklearn_check_version from onedal.cluster import DBSCAN as onedal_DBSCAN from onedal.utils._array_api import _is_numpy_namespace @@ -132,7 +131,7 @@ def _onedal_supported(self, method_name, *data): f"'{self.metric}' (p={self.p}) metric is not supported. " "Only 'euclidean' or 'minkowski' with p=2 metrics are supported.", ), - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + (not is_sparse(X), "X is sparse. Sparse input is not supported."), ] ) return patching_status diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 93dafa893b..bdfc9e67be 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -24,7 +24,6 @@ import warnings import numpy as np - from scipy.sparse import issparse from sklearn.cluster import KMeans as _sklearn_KMeans from sklearn.utils._openmp_helpers import _openmp_effective_n_threads from sklearn.utils.validation import ( @@ -34,7 +33,7 @@ ) from daal4py.sklearn._n_jobs_support import control_n_jobs - from daal4py.sklearn._utils import sklearn_check_version + from daal4py.sklearn._utils import is_sparse, sklearn_check_version from onedal._device_offload import support_input_format from onedal.cluster import KMeans as onedal_KMeans from onedal.utils.validation import _is_csr @@ -111,7 +110,7 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): is_data_supported = ( _is_csr(X) and daal_check_version((2024, "P", 700)) - ) or not issparse(X) + ) or not is_sparse(X) _acceptable_sample_weights = self._validate_sample_weight(sample_weight, X) @@ -208,7 +207,7 @@ def _onedal_predict_supported(self, method_name, *data): 
is_data_supported = ( _is_csr(X) and daal_check_version((2024, "P", 700)) - ) or not issparse(X) + ) or not is_sparse(X) # algorithm "auto" has been deprecated since 1.1, # algorithm "full" has been replaced by "lloyd" diff --git a/sklearnex/decomposition/pca.py b/sklearnex/decomposition/pca.py index f0c5bbe817..8e1afd42d5 100755 --- a/sklearnex/decomposition/pca.py +++ b/sklearnex/decomposition/pca.py @@ -16,14 +16,13 @@ import logging -from daal4py.sklearn._utils import daal_check_version +from daal4py.sklearn._utils import daal_check_version, is_sparse if daal_check_version((2024, "P", 100)): from numbers import Integral from warnings import warn import numpy as np - from scipy.sparse import issparse from sklearn.decomposition._pca import _infer_dimension from sklearn.utils.extmath import stable_cumsum from sklearn.utils.validation import check_is_fitted @@ -171,7 +170,7 @@ def _onedal_supported(self, method_name, *data): "solvers are supported." ), ), - (not issparse(X), "oneDAL PCA does not support sparse data"), + (not is_sparse(X), "oneDAL PCA does not support sparse data"), ] ) return patching_status diff --git a/sklearnex/dummy/_dummy.py b/sklearnex/dummy/_dummy.py index 9b374b13b9..0354a20e56 100644 --- a/sklearnex/dummy/_dummy.py +++ b/sklearnex/dummy/_dummy.py @@ -20,12 +20,11 @@ comments guiding code development should be removed if reused unless pertinent to the derivative implementation.""" import numpy as np -import scipy.sparse as sp from sklearn.dummy import DummyRegressor as _sklearn_DummyRegressor from sklearn.utils.validation import check_is_fitted from daal4py.sklearn._n_jobs_support import control_n_jobs -from daal4py.sklearn._utils import daal_check_version, sklearn_check_version +from daal4py.sklearn._utils import daal_check_version, is_sparse, sklearn_check_version from onedal._device_offload import support_input_format from onedal.dummy import DummyEstimator as onedal_DummyEstimator @@ -463,9 +462,9 @@ def _onedal_cpu_supported(self, 
method_name, *data): # of the oneDAL implementation to the aspects of the sklearn # estimator. For example, oneDAL may not support sparse inputs # where sklearn might, that would need to be checked with - # scipy.sparse.issparse(X). In general the conditions will - # correspond to information in the metadata and/or the estimator - # parameters. + # 'is_sparse(X)' (which is broader than SciPy's 'sparse.issparse'). + # In general the conditions will correspond to information in the + # metadata and/or the estimator parameters. # # In no circumstance should ``validate_data`` be called here or # in _onedal_gpu_supoorted to get the data into the proper form. @@ -480,7 +479,7 @@ def _onedal_cpu_supported(self, method_name, *data): patching_status.and_conditions( [ ( - not sp.issparse(X), + not is_sparse(X), "sparse data is not supported", ), ( @@ -514,7 +513,7 @@ def _onedal_cpu_supported(self, method_name, *data): [ (hasattr(self, "_onedal_estimator"), "oneDAL model was not trained."), ( - not sp.issparse(X), + not is_sparse(X), "sparse data is not supported", ), ] @@ -538,7 +537,7 @@ def _onedal_gpu_supported(self, method_name, *data): patching_status.and_conditions( [ ( - not sp.issparse(X), + not is_sparse(X), "sparse data is not supported", ), ( @@ -564,7 +563,7 @@ def _onedal_gpu_supported(self, method_name, *data): [ (hasattr(self, "_onedal_estimator"), "oneDAL model was not trained."), ( - not sp.issparse(X), + not is_sparse(X), "sparse data is not supported", ), ] diff --git a/sklearnex/ensemble/_forest.py b/sklearnex/ensemble/_forest.py index a6e5caf69b..b2647810f1 100644 --- a/sklearnex/ensemble/_forest.py +++ b/sklearnex/ensemble/_forest.py @@ -22,7 +22,6 @@ from functools import partial import numpy as np -from scipy import sparse as sp from sklearn.base import clone, is_classifier from sklearn.ensemble import ExtraTreesClassifier as _sklearn_ExtraTreesClassifier from sklearn.ensemble import ExtraTreesRegressor as _sklearn_ExtraTreesRegressor @@ -48,6 +47,7
@@ from daal4py.sklearn._utils import ( check_tree_nodes, daal_check_version, + is_sparse, sklearn_check_version, ) from onedal._device_offload import support_input_format @@ -250,7 +250,7 @@ def _onedal_fit_ready(self, patching_status, X, y, sample_weight): f"Non-zero 'ccp_alpha' ({self.ccp_alpha}) is not supported.", ), ( - not sp.issparse(X) and not sp.issparse(y), + not is_sparse(X) and not is_sparse(y), "Sparse inputs are not supported.", ), ( @@ -308,7 +308,7 @@ def _onedal_cpu_supported(self, method_name, *data): patching_status.and_conditions( [ (hasattr(self, "_onedal_estimator"), "oneDAL model was not trained."), - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + (not is_sparse(X), "X is sparse. Sparse input is not supported."), (self.warm_start is False, "Warm start is not supported."), ( daal_check_version((2023, "P", 200)) @@ -369,7 +369,7 @@ def _onedal_gpu_supported(self, method_name, *data): [ (hasattr(self, "_onedal_estimator"), "oneDAL model was not trained"), ( - not sp.issparse(X), + not is_sparse(X), "X is sparse. 
Sparse input is not supported.", ), (self.warm_start is False, "Warm start is not supported."), diff --git a/sklearnex/linear_model/linear.py b/sklearnex/linear_model/linear.py index 7d79ac9d8f..5c3c2a652c 100644 --- a/sklearnex/linear_model/linear.py +++ b/sklearnex/linear_model/linear.py @@ -16,13 +16,12 @@ import logging -from scipy.sparse import issparse from sklearn.linear_model import LinearRegression as _sklearn_LinearRegression from sklearn.metrics import r2_score from sklearn.utils.validation import check_is_fitted from daal4py.sklearn._n_jobs_support import control_n_jobs -from daal4py.sklearn._utils import daal_check_version, sklearn_check_version +from daal4py.sklearn._utils import daal_check_version, is_sparse, sklearn_check_version from onedal.linear_model import LinearRegression as onedal_LinearRegression from onedal.utils.validation import _num_features, _num_samples @@ -211,7 +210,7 @@ def _onedal_fit_supported(self, patching_status, method_name, *data): [ (sample_weight is None, "Sample weight is not supported."), ( - not issparse(X) and not issparse(y), + not is_sparse(X) and not is_sparse(y), "Sparse input is not supported.", ), (not normalize_is_set, "Normalization is not supported."), @@ -235,13 +234,13 @@ def _onedal_fit_supported(self, patching_status, method_name, *data): def _onedal_predict_supported(self, patching_status, method_name, *data): n_samples = _num_samples(data[0]) - model_is_sparse = issparse(self.coef_) or ( - self.fit_intercept and issparse(self.intercept_) + model_is_sparse = is_sparse(self.coef_) or ( + self.fit_intercept and is_sparse(self.intercept_) ) patching_status.and_conditions( [ (n_samples > 0, "Number of samples is less than 1."), - (not issparse(data[0]), "Sparse input is not supported."), + (not is_sparse(data[0]), "Sparse input is not supported."), (not model_is_sparse, "Sparse coefficients are not supported."), ] ) diff --git a/sklearnex/linear_model/logistic_regression.py 
b/sklearnex/linear_model/logistic_regression.py index c90e64aea1..977b753a74 100644 --- a/sklearnex/linear_model/logistic_regression.py +++ b/sklearnex/linear_model/logistic_regression.py @@ -23,14 +23,13 @@ if daal_check_version((2024, "P", 1)): import numpy as np - from scipy.sparse import issparse from sklearn.linear_model import LogisticRegression as _sklearn_LogisticRegression from sklearn.metrics import accuracy_score from sklearn.utils.multiclass import type_of_target from sklearn.utils.validation import check_is_fitted from daal4py.sklearn._n_jobs_support import control_n_jobs - from daal4py.sklearn._utils import sklearn_check_version + from daal4py.sklearn._utils import is_sparse, sklearn_check_version from daal4py.sklearn.linear_model.logistic_path import daal4py_fit, daal4py_predict from onedal._device_offload import support_input_format from onedal.linear_model import LogisticRegression as onedal_LogisticRegression @@ -310,7 +309,7 @@ def _onedal_gpu_predict_supported(self, method_name, *data): (n_samples > 0, "Number of samples is less than 1."), ( (_sparsity_enabled and method_name != "decision_function") - or (not any([issparse(i) for i in data])), + or (not any([is_sparse(i) for i in data])), "Sparse input is not supported.", ), ( diff --git a/sklearnex/linear_model/ridge.py b/sklearnex/linear_model/ridge.py index 954687c56e..2af669c150 100644 --- a/sklearnex/linear_model/ridge.py +++ b/sklearnex/linear_model/ridge.py @@ -22,12 +22,12 @@ import numbers import numpy as np - import scipy.sparse as sp from sklearn.linear_model import Ridge as _sklearn_Ridge from sklearn.metrics import r2_score from sklearn.utils.validation import check_is_fitted from daal4py.sklearn._n_jobs_support import control_n_jobs + from daal4py.sklearn._utils import is_sparse if not sklearn_check_version("1.2"): from sklearn.linear_model._base import _deprecate_normalize @@ -173,7 +173,7 @@ def _onedal_fit_supported(self, patching_status, method_name, *data): "Only 'auto' solver 
is supported.", ), ( - not sp.issparse(X) and not sp.issparse(y), + not is_sparse(X) and not is_sparse(y), "Sparse input is not supported.", ), (sample_weight is None, "Sample weight is not supported."), @@ -196,8 +196,8 @@ def _onedal_predict_supported(self, patching_status, method_name, *data): assert len(data) <= 2 n_samples = _num_samples(data[0]) - model_is_sparse = sp.issparse(self.coef_) or ( - self.fit_intercept and sp.issparse(self.intercept_) + model_is_sparse = is_sparse(self.coef_) or ( + self.fit_intercept and is_sparse(self.intercept_) ) patching_status.and_conditions( [ @@ -207,7 +207,7 @@ def _onedal_predict_supported(self, patching_status, method_name, *data): "Only 'auto' solver is supported.", ), (n_samples > 0, "Number of samples is less than 1."), - (not sp.issparse(data[0]), "Sparse input is not supported."), + (not is_sparse(data[0]), "Sparse input is not supported."), (not model_is_sparse, "Sparse coefficients are not supported."), ] ) diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index f358b4f1c5..d3f2668710 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -24,7 +24,7 @@ from sklearn.neighbors._kd_tree import KDTree from sklearn.utils.validation import check_is_fitted -from daal4py.sklearn._utils import sklearn_check_version +from daal4py.sklearn._utils import is_sparse, sklearn_check_version from onedal._device_offload import _transfer_to_host from onedal.utils.validation import _check_array, _num_features, _num_samples @@ -205,7 +205,7 @@ def _onedal_supported(self, device, method_name, *data): return patching_status if not patching_status.and_condition( - not sp.issparse(data[0]), "Sparse input is not supported." + not is_sparse(data[0]), "Sparse input is not supported." 
): return patching_status diff --git a/sklearnex/preview/covariance/covariance.py b/sklearnex/preview/covariance/covariance.py index 5b226eb4cf..639cf9799c 100644 --- a/sklearnex/preview/covariance/covariance.py +++ b/sklearnex/preview/covariance/covariance.py @@ -18,13 +18,12 @@ from functools import partial import numpy as np -import scipy.sparse as sp from sklearn.base import clone from sklearn.covariance import EmpiricalCovariance as _sklearn_EmpiricalCovariance from sklearn.utils.validation import check_array, check_is_fitted from daal4py.sklearn._n_jobs_support import control_n_jobs -from daal4py.sklearn._utils import daal_check_version, sklearn_check_version +from daal4py.sklearn._utils import daal_check_version, is_sparse, sklearn_check_version from daal4py.sklearn.metrics import pairwise_distances from onedal._device_offload import support_input_format, support_sycl_format from onedal.covariance import EmpiricalCovariance as onedal_EmpiricalCovariance @@ -109,7 +108,7 @@ def _onedal_supported(self, method_name, *data): (X,) = data patching_status.and_conditions( [ - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + (not is_sparse(X), "X is sparse. Sparse input is not supported."), ] ) return patching_status diff --git a/sklearnex/preview/decomposition/incremental_pca.py b/sklearnex/preview/decomposition/incremental_pca.py index 02d251b03c..dd8c1bb265 100644 --- a/sklearnex/preview/decomposition/incremental_pca.py +++ b/sklearnex/preview/decomposition/incremental_pca.py @@ -14,13 +14,12 @@ # limitations under the License. 
# =============================================================================== -import scipy.sparse as sp from sklearn.decomposition import IncrementalPCA as _sklearn_IncrementalPCA from sklearn.utils import check_array, gen_batches from sklearn.utils.validation import check_is_fitted from daal4py.sklearn._n_jobs_support import control_n_jobs -from daal4py.sklearn._utils import sklearn_check_version +from daal4py.sklearn._utils import is_sparse, sklearn_check_version from onedal.decomposition import IncrementalPCA as onedal_IncrementalPCA from ..._config import get_config @@ -248,12 +247,12 @@ def _onedal_cpu_supported(self, method_name, *data): X = data[0] if "fit" in method_name: patching_status.and_conditions( - [(not sp.issparse(X), "Sparse input is not supported")] + [(not is_sparse(X), "Sparse input is not supported")] ) else: patching_status.and_conditions( [ - (not sp.issparse(X), "Sparse input is not supported"), + (not is_sparse(X), "Sparse input is not supported"), (hasattr(self, "_onedal_estimator"), "oneDAL model was not trained"), ] ) @@ -268,14 +267,14 @@ def _onedal_gpu_supported(self, method_name, *data): if "fit" in method_name: patching_status.and_conditions( [ - (not sp.issparse(X), "Sparse input is not supported"), + (not is_sparse(X), "Sparse input is not supported"), (self.svd_solver != "onedal_svd", "onedal_svd not supported on GPU"), ] ) else: patching_status.and_conditions( [ - (not sp.issparse(X), "Sparse input is not supported"), + (not is_sparse(X), "Sparse input is not supported"), (hasattr(self, "_onedal_estimator"), "oneDAL model was not trained"), ] ) diff --git a/sklearnex/svm/svc.py b/sklearnex/svm/svc.py index d0f8cc6127..022f2a3af9 100644 --- a/sklearnex/svm/svc.py +++ b/sklearnex/svm/svc.py @@ -17,7 +17,6 @@ from functools import wraps import numpy as np -from scipy import sparse as sp from sklearn.exceptions import NotFittedError from sklearn.metrics import accuracy_score from sklearn.svm import SVC as _sklearn_SVC @@ -29,7 
+28,7 @@ ) from daal4py.sklearn._n_jobs_support import control_n_jobs -from daal4py.sklearn._utils import sklearn_check_version +from daal4py.sklearn._utils import is_sparse, sklearn_check_version from onedal.svm import SVC as onedal_SVC from .._device_offload import dispatch, wrap_output_data @@ -189,7 +188,7 @@ def _onedal_gpu_supported(self, method_name, *data): ) if len(data) > 1: self._class_count = len(np.unique(data[1])) - self._is_sparse = sp.issparse(data[0]) + self._is_sparse = is_sparse(data[0]) conditions = [ ( self.kernel in ["linear", "rbf"], diff --git a/sklearnex/tests/test_sparse_processing.py b/sklearnex/tests/test_sparse_processing.py new file mode 100644 index 0000000000..dfe95cafec --- /dev/null +++ b/sklearnex/tests/test_sparse_processing.py @@ -0,0 +1,279 @@ +# ============================================================================== +# Copyright contributors to the oneDAL project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +import warnings + +import numpy as np +import pandas as pd +import pytest +import scipy +import scipy.sparse as sp + +import daal4py # Note: this is used through 'eval' +import onedal +from daal4py.sklearn._utils import sklearn_check_version +from sklearnex.basic_statistics import BasicStatistics +from sklearnex.decomposition import PCA +from sklearnex.linear_model import Lasso, LinearRegression, LogisticRegression +from sklearnex.manifold import TSNE +from sklearnex.svm import SVR + +# Note here: 'TSNE', 'LogisticRegression', and 'Lasso' are implemented +# in daal4py, and thus work differently from the rest and need separate +# testing. Other estimators like KMeans and SVM which are implemented through +# the 'onedal' module all follow the same logic, so testing two different +# instances should be enough. +# Note also that 'TSNE' is even more special than the rest, because it has +# multiple stages (initialization, calculation of distances, TSNE optimization) +# and the oneDAL implementation covers only the last one, but the earlier +# stages call sklearn estimators / functions that are patched. + + +# DataFrames of different types. They should follow the following logic +# which mimics scikit-learn's handling of the same classes: +# - Normal inputs should be converted to numpy arrays. +# - Inputs where all columns are sparse should be converted to sparse +# arrays or matrices. +# - Inputs where some columns are sparse and some are dense should be +# converted to dense, regardless of how sparse they are. +# - Sparse inputs should be passed as CSR to oneDAL when supported, +# regardless of their original type. 
+def make_dense_df():
+    """Build a 50x4 dense DataFrame of seeded random floats."""
+    rng = np.random.default_rng(seed=123)
+    X = rng.random(size=(50, 4))
+    return pd.DataFrame(X)
+
+
+def make_sparse_df():
+    """Build a 50x4 DataFrame in which every column has a sparse dtype."""
+    X = sp.random(50, 4, 0.5, format="coo", random_state=123)
+    return pd.DataFrame(X.toarray()).astype(pd.SparseDtype("float", 0))
+
+
+def make_mixed_df():
+    """Build a 50x4 DataFrame with one dense column and three sparse columns."""
+    rng = np.random.default_rng(seed=123)
+    col1 = rng.random(size=(50, 1))
+    X_sp = sp.random(50, 3, 0.5, format="coo", random_state=123)
+    X = np.c_[col1, X_sp.toarray()]
+    df = pd.DataFrame(X)
+    # Column 0 stays dense; only columns 1-3 are switched to a sparse dtype.
+    for col in range(1, 4):
+        df[col] = df[col].astype(pd.SparseDtype("float", 0))
+    return df
+
+
+# Sparse matrices from SciPy can come in different classes
+def make_sparse_matrix():
+    """Build a 50x4 random sparse input as a ``scipy.sparse.csc_matrix``."""
+    out = sp.random(50, 4, 0.5, format="csc", random_state=123)
+    return sp.csc_matrix(out)
+
+
+def make_sparse_array():
+    """Build a 50x4 random sparse input as a ``scipy.sparse.csc_array``.
+
+    Callers must check that ``scipy.sparse`` provides ``csc_array`` first.
+    """
+    out = sp.random(50, 4, 0.5, format="csc", random_state=123)
+    return sp.csc_array(out)
+
+
+# Note: sparse pandas data frames have version requirements on scipy.
+# This skips the tests if they are incompatible.
+def check_sparse_df_is_supported(version=None):
+    """Return True when the SciPy version is newer than 1.8.1.
+
+    Parameters
+    ----------
+    version : str, optional
+        Version string to check. Defaults to ``scipy.__version__``.
+
+    Returns
+    -------
+    bool
+        Whether the (major, minor, patch) release is greater than (1, 8, 1).
+    """
+    if version is None:
+        version = scipy.__version__
+
+    def numeric_prefix(component):
+        # Components such as "0rc1" or "dev0+abc" are not plain integers, and
+        # this function runs at import time, so a failed 'int()' call would
+        # break collection of the whole module. Keep only the leading digits.
+        digits = ""
+        for char in component:
+            if not char.isdecimal():
+                break
+            digits += char
+        return int(digits) if digits else 0
+
+    release = [numeric_prefix(part) for part in version.split(".")[:3]]
+    # Pad short versions such as "1.8" so that the tuple comparison is total.
+    release += [0] * (3 - len(release))
+    return tuple(release) > (1, 8, 1)
+
+
+SPARSE_DF_SUPPORTED = check_sparse_df_is_supported()
+MSG_UNSUPPORTED_SP_DF = "Requires higher SciPy version"
+
+
+@pytest.fixture(
+    params=[make_sparse_matrix(), make_sparse_df()]
+    + ([make_sparse_array()] if hasattr(sp, "csc_array") else [])
+)
+def sparse_X(request):
+    """Yield each supported kind of fully sparse input in turn."""
+    return request.param
+
+
+@pytest.fixture(params=[make_dense_df(), make_mixed_df()])
+def dense_X(request):
+    """Yield each kind of input that is expected to be handled as dense."""
+    return request.param
+
+
+# If the estimator doesn't support sparse data, passing either sparse data frames
+# or sparse arrays/matrices should result in falling back to scikit-learn.
+@pytest.mark.allow_sklearn_fallback
+@pytest.mark.skipif(not SPARSE_DF_SUPPORTED, reason=MSG_UNSUPPORTED_SP_DF)
+@pytest.mark.parametrize(
+    "estimator", [LinearRegression] + ([PCA] if sklearn_check_version("1.8") else [])
+)
+def test_no_sparse_support_falls_back_to_sklearn(estimator, sparse_X, mocker):
+    """Sparse input to an estimator without sparse support never reaches oneDAL."""
+    # 'mocker.patch' returns the installed mock, so there is no need to look
+    # the patched attribute up again through its module path.
+    table_mock = mocker.patch("onedal.datatypes._data_conversion._convert_one_to_table")
+    estimator().fit(sparse_X, np.r_[np.zeros(25), np.ones(25)])
+    assert not table_mock.called
+
+
+# Note that some estimators that are implemented through daal4py do
+# not end up using oneDAL tables, so they require a separate test.
+@pytest.mark.allow_sklearn_fallback
+@pytest.mark.skipif(not SPARSE_DF_SUPPORTED, reason=MSG_UNSUPPORTED_SP_DF)
+@pytest.mark.parametrize(
+    "estimator,params,internal_function",
+    [
+        (
+            LogisticRegression,
+            {},
+            "daal4py.sklearn.linear_model.logistic_path.__logistic_regression_path",
+        ),
+        (
+            Lasso,
+            {},
+            "daal4py.sklearn.linear_model._coordinate_descent._daal4py_fit_lasso",
+        ),
+    ]
+    + (
+        [
+            (
+                TSNE,
+                {"init": "random", "n_components": 2, "method": "barnes_hut"},
+                "daal4py.sklearn.neighbors._base.daal4py_fit",
+            ),
+        ]
+        if sklearn_check_version("1.8")
+        else []
+    ),
+)
+def test_no_sparse_support_falls_back_to_sklearn_daal4py(
+    estimator, params, internal_function, sparse_X, mocker
+):
+    """Sparse input must not reach the daal4py routine named by the parameter."""
+    # Keep the mock returned by 'mocker.patch' instead of resolving the dotted
+    # path a second time with 'eval'.
+    internal_mock = mocker.patch(internal_function)
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", UserWarning)
+        estimator().set_params(**params).fit(sparse_X, np.r_[np.zeros(25), np.ones(25)])
+    assert not internal_mock.called
+
+
+def is_sparse_csr(x):
+    """Return whether ``x`` is a SciPy sparse container stored in CSR format."""
+    return sp.issparse(x) and x.format == "csr"
+
+
+# Passing data in any sparse format should result in oneDAL receiving a
+# CSR matrix, regardless of what input it comes in.
+@pytest.mark.parametrize("estimator", [SVR, BasicStatistics])
+@pytest.mark.skipif(not SPARSE_DF_SUPPORTED, reason=MSG_UNSUPPORTED_SP_DF)
+def test_sparse_input_is_passed_as_csr_to_onedal(estimator, sparse_X, mocker):
+    """Every kind of sparse input must be handed to oneDAL as a CSR matrix."""
+    # First check that it works without crashing
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", UserWarning)
+        estimator().fit(sparse_X, np.arange(sparse_X.shape[0]))
+
+    # Now check for inputs it receives
+    table_mock = mocker.patch("onedal.datatypes._data_conversion._convert_one_to_table")
+    # Note: the call is expected to fail due to the mocking preventing
+    # calls to C++, but the function that converts to a oneDAL table
+    # should nevertheless be called regardless.
+    try:
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", UserWarning)
+            estimator().fit(sparse_X, np.arange(sparse_X.shape[0]))
+    except Exception:
+        # Deliberately ignored: only the recorded calls are of interest.
+        pass
+
+    # 'mock_calls' can hold entries without positional arguments, so check
+    # 'call.args' before indexing it to avoid an IndexError.
+    assert any(
+        call.args and is_sparse_csr(call.args[0]) for call in table_mock.mock_calls
+    )
+
+
+# All estimators should be able to support dense data.
+@pytest.mark.parametrize("estimator", [LinearRegression, SVR, BasicStatistics])
+@pytest.mark.skipif(not SPARSE_DF_SUPPORTED, reason=MSG_UNSUPPORTED_SP_DF)
+def test_dense_data_is_not_converted_to_sparse(estimator, dense_X, mocker):
+    """Dense and mixed data frames must reach oneDAL as numpy arrays, never CSR."""
+    # First check that it works without crashing
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", UserWarning)
+        estimator().fit(dense_X, np.arange(dense_X.shape[0]))
+
+    table_mock = mocker.patch("onedal.datatypes._data_conversion._convert_one_to_table")
+    try:
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", UserWarning)
+            estimator().fit(dense_X, np.arange(dense_X.shape[0]))
+    except Exception:
+        # Deliberately ignored: the mocked conversion is expected to break the
+        # rest of the fit; only the recorded calls are of interest.
+        pass
+    assert table_mock.called
+
+    called_with_csr = False
+    called_with_numpy = False
+    for call in table_mock.mock_calls:
+        # 'mock_calls' can hold entries without positional arguments; skip
+        # them, as the daal4py variant of this test does.
+        if not call.args:
+            continue
+        if is_sparse_csr(call.args[0]):
+            called_with_csr = True
+        elif isinstance(call.args[0], np.ndarray):
+            called_with_numpy = True
+    assert not called_with_csr
+    assert called_with_numpy
+
+
+@pytest.mark.parametrize(
+    "estimator,params,internal_function,position_X",
+    [
+        (
+            LogisticRegression,
+            {},
+            "daal4py.sklearn.linear_model.logistic_path.__logistic_regression_path",
+            0,
+        ),
+        (
+            Lasso,
+            {},
+            "daal4py.sklearn.linear_model._coordinate_descent._daal4py_fit_lasso",
+            1,
+        ),
+        (
+            TSNE,
+            {"init": "random", "n_components": 2, "method": "barnes_hut"},
+            "daal4py.sklearn.neighbors._base.daal4py_fit",
+            1,
+        ),
+    ],
+)
+@pytest.mark.skipif(not SPARSE_DF_SUPPORTED, reason=MSG_UNSUPPORTED_SP_DF)
+def test_dense_data_is_not_converted_to_sparse_daal4py(
+    estimator, params, internal_function, position_X, dense_X, mocker
+):
+    """Dense input must reach the daal4py routine as a numpy array, never CSR.
+
+    ``position_X`` is the index of the data argument among the positional
+    arguments of the routine named by ``internal_function``.
+    """
+    # First check that it works without crashing
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", UserWarning)
+        estimator().set_params(**params).fit(dense_X, np.r_[np.zeros(25), np.ones(25)])
+
+    # Keep the mock returned by 'mocker.patch' instead of resolving the dotted
+    # path again with 'eval'.
+    internal_mock = mocker.patch(internal_function)
+    try:
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", UserWarning)
+            estimator().set_params(**params).fit(
+                dense_X, np.r_[np.zeros(25), np.ones(25)]
+            )
+    except Exception:
+        # Deliberately ignored: the mocked routine returns a mock object, which
+        # may break the rest of the fit; only the recorded calls matter.
+        pass
+    assert internal_mock.called
+
+    called_with_csr = False
+    called_with_numpy = False
+    for call in internal_mock.mock_calls:
+        if len(call.args) <= position_X:
+            continue
+        if is_sparse_csr(call.args[position_X]):
+            called_with_csr = True
+        elif isinstance(call.args[position_X], np.ndarray):
+            called_with_numpy = True
+    assert not called_with_csr
+    assert called_with_numpy