Skip to content

Commit df5db1e

Browse files
[bug] Fix LocalOutlierFactor kneighbors method via refactor (#1640)
* Inherit KNeighborsDispatchingBase
* Only fully implement fit_predict, predict, kneighbors and fit; override _predict to allow for use of score_subsample
* Fix sklearn_check_version use from code duplication
* Add n_jobs support
* Add docstrings
* Enable GPU LocalOutlierFactor sklearn tests
* Solve dpctl/dpnp support for the kneighbors method (also with re-enabled tests)
---------
Co-authored-by: KulikovNikita <[email protected]>
1 parent 6fce569 commit df5db1e

File tree

9 files changed

+191
-491
lines changed

9 files changed

+191
-491
lines changed

deselected_tests.yaml

Lines changed: 2 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -706,7 +706,6 @@ gpu:
706706
- tests/test_multioutput.py::test_classifier_chain_tuple_order[tuple]
707707
- tests/test_pipeline.py::test_pipeline_methods_anova
708708
- tests/test_pipeline.py::test_pipeline_methods_pca_svm
709-
- tests/test_pipeline.py::test_pipeline_score_samples_pca_lof
710709
- tests/test_pipeline.py::test_score_samples_on_pipeline_without_score_samples
711710
- tests/test_pipeline.py::test_pipeline_methods_preprocessing_svm
712711
- tests/test_pipeline.py::test_pipeline_transform
@@ -731,7 +730,7 @@ gpu:
731730
- tests/test_calibration.py::test_calibrated_classifier_cv_double_sample_weights_equivalence
732731
- tests/test_calibration.py::test_calibrated_classifier_cv_zeros_sample_weights_equivalence
733732
- tests/test_common.py::test_estimators[FeatureAgglomeration()-check_parameters_default_constructible]
734-
- tests/test_common.py::test_estimators[LocalOutlierFactor()-check_estimator_sparse_data]
733+
- neighbors/tests/test_lof.py::test_novelty_true_common_tests[LocalOutlierFactor(novelty=True)-check_methods_subset_invariance]
735734
- tests/test_common.py::test_transformers_get_feature_names_out[StackingRegressor(estimators=[('est1',Ridge(alpha=0.1)),('est2',Ridge(alpha=1))])]
736735
- tests/test_common.py::test_transformers_get_feature_names_out[VotingRegressor(estimators=[('est1',Ridge(alpha=0.1)),('est2',Ridge(alpha=1))])]
737736
- tests/test_common.py::test_f_contiguous_array_estimator[TSNE]
@@ -761,17 +760,10 @@ gpu:
761760
- manifold/tests/test_t_sne.py::test_binary_perplexity_stability
762761
- manifold/tests/test_t_sne.py::test_gradient_bh_multithread_match_sequential
763762
- neighbors/tests/test_kde.py::test_kernel_density_sampling
764-
- neighbors/tests/test_lof.py
765-
- tests/test_common.py::test_check_n_features_in_after_fitting[LocalOutlierFactor()]
766763
- tests/test_common.py::test_check_n_features_in_after_fitting[NearestNeighbors()]
767-
- tests/test_common.py::test_f_contiguous_array_estimator[LocalOutlierFactor]
768-
- tests/test_common.py::test_estimators[NearestNeighbors()-
764+
- tests/test_common.py::test_estimators[NearestNeighbors()]
769765
- model_selection/tests/test_search.py::test_search_cv_score_samples_method[search_cv0]
770766
- model_selection/tests/test_search.py::test_search_cv_score_samples_method[search_cv1]
771-
- tests/test_common.py::test_estimators[LocalOutlierFactor()-check_outliers_fit_predict]
772-
- tests/test_common.py::test_estimators[LocalOutlierFactor()-check_fit_idempotent]
773-
- tests/test_common.py::test_estimators[LocalOutlierFactor()-check_fit_check_is_fitted]
774-
- tests/test_common.py::test_estimators[LocalOutlierFactor()-check_n_features_in]
775767
- manifold/tests/test_t_sne.py::test_barnes_hut_angle
776768
# KNeighborsRegressor
777769
- ensemble/tests/test_bagging.py::test_regression
@@ -1046,8 +1038,6 @@ gpu:
10461038
- neighbors/tests/test_neighbors.py::test_regressor_predict_on_arraylikes
10471039
# `precomputed` metric is not implemented for DBSCAN
10481040
- neighbors/tests/test_neighbors_pipeline.py::test_dbscan
1049-
- neighbors/tests/test_neighbors_pipeline.py::test_lof_novelty_false
1050-
- neighbors/tests/test_neighbors_pipeline.py::test_lof_novelty_true
10511041
- neighbors/tests/test_neighbors_pipeline.py::test_kneighbors_regressor
10521042
# unsorted svm
10531043
- svm/tests/test_svm.py::test_libsvm_iris
@@ -1172,22 +1162,6 @@ gpu:
11721162
- tests/test_common.py::test_estimators[DBSCAN()-check_n_features_in]
11731163
- tests/test_common.py::test_estimators[DBSCAN()-check_fit1d]
11741164
- tests/test_common.py::test_estimators[DBSCAN()-check_fit2d_predict1d]
1175-
- tests/test_common.py::test_estimators[LocalOutlierFactor()-check_estimators_dtypes]
1176-
- tests/test_common.py::test_estimators[LocalOutlierFactor()-check_fit_score_takes_y]
1177-
- tests/test_common.py::test_estimators[LocalOutlierFactor()-check_estimators_fit_returns_self]
1178-
- tests/test_common.py::test_estimators[LocalOutlierFactor()-check_estimators_fit_returns_self(readonly_memmap=True)]
1179-
- tests/test_common.py::test_estimators[LocalOutlierFactor()-check_dtype_object]
1180-
- tests/test_common.py::test_estimators[LocalOutlierFactor()-check_pipeline_consistency]
1181-
- tests/test_common.py::test_estimators[LocalOutlierFactor()-check_estimators_nan_inf]
1182-
- tests/test_common.py::test_estimators[LocalOutlierFactor()-check_estimators_overwrite_params]
1183-
- tests/test_common.py::test_estimators[LocalOutlierFactor()-check_estimators_pickle]
1184-
- tests/test_common.py::test_estimators[LocalOutlierFactor()-check_methods_sample_order_invariance]
1185-
- tests/test_common.py::test_estimators[LocalOutlierFactor()-check_methods_subset_invariance]
1186-
- tests/test_common.py::test_estimators[LocalOutlierFactor()-check_fit2d_1sample]
1187-
- tests/test_common.py::test_estimators[LocalOutlierFactor()-check_fit2d_1feature]
1188-
- tests/test_common.py::test_estimators[LocalOutlierFactor()-check_dict_unchanged]
1189-
- tests/test_common.py::test_estimators[LocalOutlierFactor()-check_dont_overwrite_parameters]
1190-
- tests/test_common.py::test_estimators[LocalOutlierFactor()-check_fit2d_predict1d]
11911165
- tests/test_common.py::test_check_n_features_in_after_fitting[DBSCAN()]
11921166
- tests/test_common.py::test_check_n_features_in_after_fitting[SVC()]
11931167
# originated with pca dpctl/dpnp fit, to be re-assessed with pca out-of-preview

sklearnex/neighbors/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,10 @@
1414
# limitations under the License.
1515
# ===============================================================================
1616

17+
from ._lof import LocalOutlierFactor
1718
from .knn_classification import KNeighborsClassifier
1819
from .knn_regression import KNeighborsRegressor
1920
from .knn_unsupervised import NearestNeighbors
20-
from .lof import LocalOutlierFactor
2121

2222
__all__ = [
2323
"KNeighborsClassifier",

sklearnex/neighbors/_lof.py

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
# ===============================================================================
2+
# Copyright 2024 Intel Corporation
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
# ===============================================================================
16+
17+
import warnings
18+
19+
import numpy as np
20+
from sklearn.neighbors import LocalOutlierFactor as sklearn_LocalOutlierFactor
21+
from sklearn.utils.metaestimators import available_if
22+
from sklearn.utils.validation import check_is_fitted
23+
24+
from daal4py.sklearn._n_jobs_support import control_n_jobs
25+
from daal4py.sklearn._utils import sklearn_check_version
26+
27+
from .._device_offload import dispatch, wrap_output_data
28+
from .common import KNeighborsDispatchingBase
29+
from .knn_unsupervised import NearestNeighbors
30+
31+
32+
@control_n_jobs(decorated_methods=["fit", "kneighbors"])
33+
class LocalOutlierFactor(KNeighborsDispatchingBase, sklearn_LocalOutlierFactor):
34+
__doc__ = (
35+
sklearn_LocalOutlierFactor.__doc__
36+
+ "\n NOTE: When X=None, methods kneighbors, kneighbors_graph, and predict will"
37+
+ "\n only output numpy arrays. In that case, the only way to offload to gpu"
38+
+ "\n is to use a global queue (e.g. using config_context)"
39+
)
40+
if sklearn_check_version("1.2"):
41+
_parameter_constraints: dict = {
42+
**sklearn_LocalOutlierFactor._parameter_constraints
43+
}
44+
45+
# Only certain methods should be taken from knn to prevent code
46+
# duplication. Inheriting would yield a complicated inheritance
47+
# structure and violate the sklearn inheritance path.
48+
_save_attributes = NearestNeighbors._save_attributes
49+
_onedal_knn_fit = NearestNeighbors._onedal_fit
50+
_onedal_kneighbors = NearestNeighbors._onedal_kneighbors
51+
52+
def _onedal_fit(self, X, y, queue=None):
53+
if sklearn_check_version("1.2"):
54+
self._validate_params()
55+
56+
self._onedal_knn_fit(X, y, queue)
57+
58+
if self.contamination != "auto":
59+
if not (0.0 < self.contamination <= 0.5):
60+
raise ValueError(
61+
"contamination must be in (0, 0.5], " "got: %f" % self.contamination
62+
)
63+
64+
n_samples = self.n_samples_fit_
65+
66+
if self.n_neighbors > n_samples:
67+
warnings.warn(
68+
"n_neighbors (%s) is greater than the "
69+
"total number of samples (%s). n_neighbors "
70+
"will be set to (n_samples - 1) for estimation."
71+
% (self.n_neighbors, n_samples)
72+
)
73+
self.n_neighbors_ = max(1, min(self.n_neighbors, n_samples - 1))
74+
75+
(
76+
self._distances_fit_X_,
77+
_neighbors_indices_fit_X_,
78+
) = self._onedal_kneighbors(n_neighbors=self.n_neighbors_, queue=queue)
79+
80+
# Sklearn includes a check for float32 at this point which may not be
81+
# necessary for onedal
82+
83+
self._lrd = self._local_reachability_density(
84+
self._distances_fit_X_, _neighbors_indices_fit_X_
85+
)
86+
87+
# Compute lof score over training samples to define offset_:
88+
lrd_ratios_array = self._lrd[_neighbors_indices_fit_X_] / self._lrd[:, np.newaxis]
89+
90+
self.negative_outlier_factor_ = -np.mean(lrd_ratios_array, axis=1)
91+
92+
if self.contamination == "auto":
93+
# inliers score around -1 (the higher, the less abnormal).
94+
self.offset_ = -1.5
95+
else:
96+
self.offset_ = np.percentile(
97+
self.negative_outlier_factor_, 100.0 * self.contamination
98+
)
99+
100+
return self
101+
102+
def fit(self, X, y=None):
103+
self._fit_validation(X, y)
104+
result = dispatch(
105+
self,
106+
"fit",
107+
{
108+
"onedal": self.__class__._onedal_fit,
109+
"sklearn": sklearn_LocalOutlierFactor.fit,
110+
},
111+
X,
112+
None,
113+
)
114+
return result
115+
116+
# Subtle order change to remove check_array and preserve dpnp and
117+
# dpctl conformance. decision_function will return a dpnp or dpctl
118+
# instance via kneighbors and an equivalent check_array exists in
119+
# that call already in sklearn so no loss of functionality occurs
120+
def _predict(self, X=None):
121+
check_is_fitted(self)
122+
123+
if X is not None:
124+
output = self.decision_function(X) < 0
125+
is_inlier = np.ones(output.shape[0], dtype=int)
126+
is_inlier[output] = -1
127+
else:
128+
is_inlier = np.ones(self.n_samples_fit_, dtype=int)
129+
is_inlier[self.negative_outlier_factor_ < self.offset_] = -1
130+
131+
return is_inlier
132+
133+
# This had to be done because predict loses the queue when no
134+
# argument is given and it is a dpctl tensor or dpnp array.
135+
# This would cause issues in fit_predict. Also, available_if
136+
# is hard to unwrap, and this is the most straightforward way.
137+
@available_if(sklearn_LocalOutlierFactor._check_novelty_fit_predict)
138+
@wrap_output_data
139+
def fit_predict(self, X, y=None):
140+
return self.fit(X)._predict()
141+
142+
@available_if(sklearn_LocalOutlierFactor._check_novelty_predict)
143+
@wrap_output_data
144+
def predict(self, X=None):
145+
return self._predict(X)
146+
147+
@wrap_output_data
148+
def kneighbors(self, X=None, n_neighbors=None, return_distance=True):
149+
check_is_fitted(self)
150+
if sklearn_check_version("1.0") and X is not None:
151+
self._check_feature_names(X, reset=False)
152+
return dispatch(
153+
self,
154+
"kneighbors",
155+
{
156+
"onedal": self.__class__._onedal_kneighbors,
157+
"sklearn": sklearn_LocalOutlierFactor.kneighbors,
158+
},
159+
X,
160+
n_neighbors=n_neighbors,
161+
return_distance=return_distance,
162+
)
163+
164+
fit.__doc__ = sklearn_LocalOutlierFactor.fit.__doc__
165+
fit_predict.__doc__ = sklearn_LocalOutlierFactor.fit_predict.__doc__
166+
predict.__doc__ = sklearn_LocalOutlierFactor.predict.__doc__
167+
kneighbors.__doc__ = sklearn_LocalOutlierFactor.kneighbors.__doc__

sklearnex/neighbors/knn_classification.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,7 @@ def predict_proba(self, X):
246246
@wrap_output_data
247247
def kneighbors(self, X=None, n_neighbors=None, return_distance=True):
248248
check_is_fitted(self)
249-
if sklearn_check_version("1.0"):
249+
if sklearn_check_version("1.0") and X is not None:
250250
self._check_feature_names(X, reset=False)
251251
return dispatch(
252252
self,
@@ -256,8 +256,8 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True):
256256
"sklearn": sklearn_KNeighborsClassifier.kneighbors,
257257
},
258258
X,
259-
n_neighbors,
260-
return_distance,
259+
n_neighbors=n_neighbors,
260+
return_distance=return_distance,
261261
)
262262

263263
@wrap_output_data

sklearnex/neighbors/knn_regression.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,7 @@ def predict(self, X):
227227
@wrap_output_data
228228
def kneighbors(self, X=None, n_neighbors=None, return_distance=True):
229229
check_is_fitted(self)
230-
if sklearn_check_version("1.0"):
230+
if sklearn_check_version("1.0") and X is not None:
231231
self._check_feature_names(X, reset=False)
232232
return dispatch(
233233
self,
@@ -237,8 +237,8 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True):
237237
"sklearn": sklearn_KNeighborsRegressor.kneighbors,
238238
},
239239
X,
240-
n_neighbors,
241-
return_distance,
240+
n_neighbors=n_neighbors,
241+
return_distance=return_distance,
242242
)
243243

244244
@wrap_output_data

sklearnex/neighbors/knn_unsupervised.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -151,8 +151,8 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True):
151151
"sklearn": sklearn_NearestNeighbors.kneighbors,
152152
},
153153
X,
154-
n_neighbors,
155-
return_distance,
154+
n_neighbors=n_neighbors,
155+
return_distance=return_distance,
156156
)
157157

158158
@wrap_output_data

0 commit comments

Comments
 (0)