[bug] Fix LocalOutlierFactor kneighbors method via refactor (#1640)

icfaust · KulikovNikita · web-flow · commit df5db1ee63e5 · 2024-02-01T09:24:51.000+01:00
* Inherit KNeighborsDispatchingBase

* Only fully implement fit_predict, predict, kneighbors and fit, overwrite _predict to allow for use of score_subsample.

* Fix sklearn_check_version use from code duplication

* Add n_jobs support

* add docstrings

* enable GPU LocalOutlierFactor sklearn tests

* solves dpctl dpnp support for kneighbors method (also with re-enabled tests)

---------

Co-authored-by: KulikovNikita <60149822+KulikovNikita@users.noreply.github.com>
diff --git a/deselected_tests.yaml b/deselected_tests.yaml
@@ -706,7 +706,6 @@ gpu:
   - tests/test_multioutput.py::test_classifier_chain_tuple_order[tuple]
   - tests/test_pipeline.py::test_pipeline_methods_anova
   - tests/test_pipeline.py::test_pipeline_methods_pca_svm
-  - tests/test_pipeline.py::test_pipeline_score_samples_pca_lof
   - tests/test_pipeline.py::test_score_samples_on_pipeline_without_score_samples
   - tests/test_pipeline.py::test_pipeline_methods_preprocessing_svm
   - tests/test_pipeline.py::test_pipeline_transform
@@ -731,7 +730,7 @@ gpu:
   - tests/test_calibration.py::test_calibrated_classifier_cv_double_sample_weights_equivalence
   - tests/test_calibration.py::test_calibrated_classifier_cv_zeros_sample_weights_equivalence
   - tests/test_common.py::test_estimators[FeatureAgglomeration()-check_parameters_default_constructible]
-  - tests/test_common.py::test_estimators[LocalOutlierFactor()-check_estimator_sparse_data]
+  - neighbors/tests/test_lof.py::test_novelty_true_common_tests[LocalOutlierFactor(novelty=True)-check_methods_subset_invariance]
   - tests/test_common.py::test_transformers_get_feature_names_out[StackingRegressor(estimators=[('est1',Ridge(alpha=0.1)),('est2',Ridge(alpha=1))])]
   - tests/test_common.py::test_transformers_get_feature_names_out[VotingRegressor(estimators=[('est1',Ridge(alpha=0.1)),('est2',Ridge(alpha=1))])]
   - tests/test_common.py::test_f_contiguous_array_estimator[TSNE]
@@ -761,17 +760,10 @@ gpu:
   - manifold/tests/test_t_sne.py::test_binary_perplexity_stability
   - manifold/tests/test_t_sne.py::test_gradient_bh_multithread_match_sequential
   - neighbors/tests/test_kde.py::test_kernel_density_sampling
-  - neighbors/tests/test_lof.py
-  - tests/test_common.py::test_check_n_features_in_after_fitting[LocalOutlierFactor()]
   - tests/test_common.py::test_check_n_features_in_after_fitting[NearestNeighbors()]
-  - tests/test_common.py::test_f_contiguous_array_estimator[LocalOutlierFactor]
-  - tests/test_common.py::test_estimators[NearestNeighbors()-
+  - tests/test_common.py::test_estimators[NearestNeighbors()] 
   - model_selection/tests/test_search.py::test_search_cv_score_samples_method[search_cv0]
   - model_selection/tests/test_search.py::test_search_cv_score_samples_method[search_cv1]
-  - tests/test_common.py::test_estimators[LocalOutlierFactor()-check_outliers_fit_predict]
-  - tests/test_common.py::test_estimators[LocalOutlierFactor()-check_fit_idempotent]
-  - tests/test_common.py::test_estimators[LocalOutlierFactor()-check_fit_check_is_fitted]
-  - tests/test_common.py::test_estimators[LocalOutlierFactor()-check_n_features_in]
   - manifold/tests/test_t_sne.py::test_barnes_hut_angle
   # KNeighborsRegressor
   - ensemble/tests/test_bagging.py::test_regression
@@ -1046,8 +1038,6 @@ gpu:
   - neighbors/tests/test_neighbors.py::test_regressor_predict_on_arraylikes
   # `precomputed` metric is not implemented for DBSCAN
   - neighbors/tests/test_neighbors_pipeline.py::test_dbscan
-  - neighbors/tests/test_neighbors_pipeline.py::test_lof_novelty_false
-  - neighbors/tests/test_neighbors_pipeline.py::test_lof_novelty_true
   - neighbors/tests/test_neighbors_pipeline.py::test_kneighbors_regressor
   # unsorted svm
   - svm/tests/test_svm.py::test_libsvm_iris
@@ -1172,22 +1162,6 @@ gpu:
   - tests/test_common.py::test_estimators[DBSCAN()-check_n_features_in]
   - tests/test_common.py::test_estimators[DBSCAN()-check_fit1d]
   - tests/test_common.py::test_estimators[DBSCAN()-check_fit2d_predict1d]
-  - tests/test_common.py::test_estimators[LocalOutlierFactor()-check_estimators_dtypes]
-  - tests/test_common.py::test_estimators[LocalOutlierFactor()-check_fit_score_takes_y]
-  - tests/test_common.py::test_estimators[LocalOutlierFactor()-check_estimators_fit_returns_self]
-  - tests/test_common.py::test_estimators[LocalOutlierFactor()-check_estimators_fit_returns_self(readonly_memmap=True)]
-  - tests/test_common.py::test_estimators[LocalOutlierFactor()-check_dtype_object]
-  - tests/test_common.py::test_estimators[LocalOutlierFactor()-check_pipeline_consistency]
-  - tests/test_common.py::test_estimators[LocalOutlierFactor()-check_estimators_nan_inf]
-  - tests/test_common.py::test_estimators[LocalOutlierFactor()-check_estimators_overwrite_params]
-  - tests/test_common.py::test_estimators[LocalOutlierFactor()-check_estimators_pickle]
-  - tests/test_common.py::test_estimators[LocalOutlierFactor()-check_methods_sample_order_invariance]
-  - tests/test_common.py::test_estimators[LocalOutlierFactor()-check_methods_subset_invariance]
-  - tests/test_common.py::test_estimators[LocalOutlierFactor()-check_fit2d_1sample]
-  - tests/test_common.py::test_estimators[LocalOutlierFactor()-check_fit2d_1feature]
-  - tests/test_common.py::test_estimators[LocalOutlierFactor()-check_dict_unchanged]
-  - tests/test_common.py::test_estimators[LocalOutlierFactor()-check_dont_overwrite_parameters]
-  - tests/test_common.py::test_estimators[LocalOutlierFactor()-check_fit2d_predict1d]
   - tests/test_common.py::test_check_n_features_in_after_fitting[DBSCAN()]
   - tests/test_common.py::test_check_n_features_in_after_fitting[SVC()]
   # originated with pca dpctl/dpnp fit, to be re-assesed with pca out-of-preview
diff --git a/sklearnex/neighbors/__init__.py b/sklearnex/neighbors/__init__.py
@@ -14,10 +14,10 @@
 # limitations under the License.
 # ===============================================================================
 
+from ._lof import LocalOutlierFactor
 from .knn_classification import KNeighborsClassifier
 from .knn_regression import KNeighborsRegressor
 from .knn_unsupervised import NearestNeighbors
-from .lof import LocalOutlierFactor
 
 __all__ = [
     "KNeighborsClassifier",
diff --git a/sklearnex/neighbors/_lof.py b/sklearnex/neighbors/_lof.py
@@ -0,0 +1,167 @@
+# ===============================================================================
+# Copyright 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===============================================================================
+
+import warnings
+
+import numpy as np
+from sklearn.neighbors import LocalOutlierFactor as sklearn_LocalOutlierFactor
+from sklearn.utils.metaestimators import available_if
+from sklearn.utils.validation import check_is_fitted
+
+from daal4py.sklearn._n_jobs_support import control_n_jobs
+from daal4py.sklearn._utils import sklearn_check_version
+
+from .._device_offload import dispatch, wrap_output_data
+from .common import KNeighborsDispatchingBase
+from .knn_unsupervised import NearestNeighbors
+
+
+@control_n_jobs(decorated_methods=["fit", "kneighbors"])
+class LocalOutlierFactor(KNeighborsDispatchingBase, sklearn_LocalOutlierFactor):
+    __doc__ = (
+        sklearn_LocalOutlierFactor.__doc__
+        + "\n NOTE: When X=None, methods kneighbors, kneighbors_graph, and predict will"
+        + "\n only output numpy arrays. In that case, the only way to offload to gpu"
+        + "\n is to use a global queue (e.g. using config_context)"
+    )
+    if sklearn_check_version("1.2"):
+        _parameter_constraints: dict = {
+            **sklearn_LocalOutlierFactor._parameter_constraints
+        }
+
+    # Only certain methods should be taken from knn to prevent code
+    # duplication. Inheriting would yield a complicated inheritance
+    # structure and violate the sklearn inheritance path.
+    _save_attributes = NearestNeighbors._save_attributes
+    _onedal_knn_fit = NearestNeighbors._onedal_fit
+    _onedal_kneighbors = NearestNeighbors._onedal_kneighbors
+
+    def _onedal_fit(self, X, y, queue=None):
+        if sklearn_check_version("1.2"):
+            self._validate_params()
+
+        self._onedal_knn_fit(X, y, queue)
+
+        if self.contamination != "auto":
+            if not (0.0 < self.contamination <= 0.5):
+                raise ValueError(
+                    "contamination must be in (0, 0.5], " "got: %f" % self.contamination
+                )
+
+        n_samples = self.n_samples_fit_
+
+        if self.n_neighbors > n_samples:
+            warnings.warn(
+                "n_neighbors (%s) is greater than the "
+                "total number of samples (%s). n_neighbors "
+                "will be set to (n_samples - 1) for estimation."
+                % (self.n_neighbors, n_samples)
+            )
+        self.n_neighbors_ = max(1, min(self.n_neighbors, n_samples - 1))
+
+        (
+            self._distances_fit_X_,
+            _neighbors_indices_fit_X_,
+        ) = self._onedal_kneighbors(n_neighbors=self.n_neighbors_, queue=queue)
+
+        # Sklearn includes a check for float32 at this point which may not be
+        # necessary for onedal
+
+        self._lrd = self._local_reachability_density(
+            self._distances_fit_X_, _neighbors_indices_fit_X_
+        )
+
+        # Compute lof score over training samples to define offset_:
+        lrd_ratios_array = self._lrd[_neighbors_indices_fit_X_] / self._lrd[:, np.newaxis]
+
+        self.negative_outlier_factor_ = -np.mean(lrd_ratios_array, axis=1)
+
+        if self.contamination == "auto":
+            # inliers score around -1 (the higher, the less abnormal).
+            self.offset_ = -1.5
+        else:
+            self.offset_ = np.percentile(
+                self.negative_outlier_factor_, 100.0 * self.contamination
+            )
+
+        return self
+
+    def fit(self, X, y=None):
+        self._fit_validation(X, y)
+        result = dispatch(
+            self,
+            "fit",
+            {
+                "onedal": self.__class__._onedal_fit,
+                "sklearn": sklearn_LocalOutlierFactor.fit,
+            },
+            X,
+            None,
+        )
+        return result
+
+    # Subtle order change to remove check_array and preserve dpnp and
+    # dpctl conformance. decision_function will return a dpnp or dpctl
+    # instance via kneighbors and an equivalent check_array exists in
+    # that call already in sklearn so no loss of functionality occurs
+    def _predict(self, X=None):
+        check_is_fitted(self)
+
+        if X is not None:
+            output = self.decision_function(X) < 0
+            is_inlier = np.ones(output.shape[0], dtype=int)
+            is_inlier[output] = -1
+        else:
+            is_inlier = np.ones(self.n_samples_fit_, dtype=int)
+            is_inlier[self.negative_outlier_factor_ < self.offset_] = -1
+
+        return is_inlier
+
+    # This had to be done because predict loses the queue when no
+    # argument is given and it is a dpctl tensor or dpnp array.
+    # This would cause issues in fit_predict. Also, available_if
+    # is hard to unwrap, and this is the most straightforward way.
+    @available_if(sklearn_LocalOutlierFactor._check_novelty_fit_predict)
+    @wrap_output_data
+    def fit_predict(self, X, y=None):
+        return self.fit(X)._predict()
+
+    @available_if(sklearn_LocalOutlierFactor._check_novelty_predict)
+    @wrap_output_data
+    def predict(self, X=None):
+        return self._predict(X)
+
+    @wrap_output_data
+    def kneighbors(self, X=None, n_neighbors=None, return_distance=True):
+        check_is_fitted(self)
+        if sklearn_check_version("1.0") and X is not None:
+            self._check_feature_names(X, reset=False)
+        return dispatch(
+            self,
+            "kneighbors",
+            {
+                "onedal": self.__class__._onedal_kneighbors,
+                "sklearn": sklearn_LocalOutlierFactor.kneighbors,
+            },
+            X,
+            n_neighbors=n_neighbors,
+            return_distance=return_distance,
+        )
+
+    fit.__doc__ = sklearn_LocalOutlierFactor.fit.__doc__
+    fit_predict.__doc__ = sklearn_LocalOutlierFactor.fit_predict.__doc__
+    predict.__doc__ = sklearn_LocalOutlierFactor.predict.__doc__
+    kneighbors.__doc__ = sklearn_LocalOutlierFactor.kneighbors.__doc__
diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py
@@ -246,7 +246,7 @@ def predict_proba(self, X):
     @wrap_output_data
     def kneighbors(self, X=None, n_neighbors=None, return_distance=True):
         check_is_fitted(self)
-        if sklearn_check_version("1.0"):
+        if sklearn_check_version("1.0") and X is not None:
             self._check_feature_names(X, reset=False)
         return dispatch(
             self,
@@ -256,8 +256,8 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True):
                 "sklearn": sklearn_KNeighborsClassifier.kneighbors,
             },
             X,
-            n_neighbors,
-            return_distance,
+            n_neighbors=n_neighbors,
+            return_distance=return_distance,
         )
 
     @wrap_output_data
diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py
@@ -227,7 +227,7 @@ def predict(self, X):
     @wrap_output_data
     def kneighbors(self, X=None, n_neighbors=None, return_distance=True):
         check_is_fitted(self)
-        if sklearn_check_version("1.0"):
+        if sklearn_check_version("1.0") and X is not None:
             self._check_feature_names(X, reset=False)
         return dispatch(
             self,
@@ -237,8 +237,8 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True):
                 "sklearn": sklearn_KNeighborsRegressor.kneighbors,
             },
             X,
-            n_neighbors,
-            return_distance,
+            n_neighbors=n_neighbors,
+            return_distance=return_distance,
         )
 
     @wrap_output_data
diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py
@@ -151,8 +151,8 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True):
                 "sklearn": sklearn_NearestNeighbors.kneighbors,
             },
             X,
-            n_neighbors,
-            return_distance,
+            n_neighbors=n_neighbors,
+            return_distance=return_distance,
         )
 
     @wrap_output_data
diff --git a/sklearnex/neighbors/lof.py b/sklearnex/neighbors/lof.py
diff --git a/sklearnex/neighbors/tests/test_neighbors.py b/sklearnex/neighbors/tests/test_neighbors.py
diff --git a/sklearnex/tests/test_memory_usage.py b/sklearnex/tests/test_memory_usage.py