diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index e952dddebf..32989289be 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -18,6 +18,7 @@ from numbers import Integral import numpy as np +import sys from onedal._device_offload import supports_queue from onedal.common._backend import bind_default_backend @@ -76,65 +77,68 @@ def infer(self, *args, **kwargs): ... @abstractmethod def _onedal_fit(self, X, y): ... - def _validate_data( - self, X, y=None, reset=True, validate_separately=None, **check_params - ): - if y is None: - if self.requires_y: - raise ValueError( - f"This {self.__class__.__name__} estimator " - f"requires y to be passed, but the target y is None." - ) - X = _check_array(X, **check_params) - out = X, y - else: - if validate_separately: - # We need this because some estimators validate X and y - # separately, and in general, separately calling _check_array() - # on X and y isn't equivalent to just calling _check_X_y() - # :( - check_X_params, check_y_params = validate_separately - X = _check_array(X, **check_X_params) - y = _check_array(y, **check_y_params) - else: - X, y = _check_X_y(X, y, **check_params) - out = X, y - - if check_params.get("ensure_2d", True): - _check_n_features(self, X, reset=reset) - - return out - - def _get_weights(self, dist, weights): - if weights in (None, "uniform"): - return None - if weights == "distance": - # if user attempts to classify a point that was zero distance from one - # or more training points, those training points are weighted as 1.0 - # and the other points as 0.0 - if dist.dtype is np.dtype(object): - for point_dist_i, point_dist in enumerate(dist): - # check if point_dist is iterable - # (ex: RadiusNeighborClassifier.predict may set an element of - # dist to 1e-6 to represent an 'outlier') - if hasattr(point_dist, "__contains__") and 0.0 in point_dist: - dist[point_dist_i] = point_dist == 0.0 - else: - dist[point_dist_i] = 1.0 / point_dist - else: - with np.errstate(divide="ignore"): - dist = 1.0 / dist - inf_mask = np.isinf(dist) - inf_row = np.any(inf_mask, axis=1) - dist[inf_row] = inf_mask[inf_row] - return dist - elif callable(weights): - return weights(dist) - else: - raise ValueError( - "weights not recognized: should be 'uniform', " - "'distance', or a callable function" - ) + # def _validate_data( + # self, X, y=None, reset=True, validate_separately=None, **check_params + # ): + # if y is None: + # if self.requires_y: + # raise ValueError( + # f"This {self.__class__.__name__} estimator " + # f"requires y to be passed, but the target y is None." + # ) + # X = _check_array(X, **check_params) + # out = X, y + # else: + # if validate_separately: + # # We need this because some estimators validate X and y + # # separately, and in general, separately calling _check_array() + # # on X and y isn't equivalent to just calling _check_X_y() + # # :( + # check_X_params, check_y_params = validate_separately + # X = _check_array(X, **check_X_params) + # y = _check_array(y, **check_y_params) + # else: + # X, y = _check_X_y(X, y, **check_params) + # out = X, y + + # if check_params.get("ensure_2d", True): + # _check_n_features(self, X, reset=reset) + + # return out + + # REFACTOR: _get_weights moved to sklearnex/neighbors/common.py + # All prediction logic now in sklearnex layer, so this method is no longer needed in onedal + # Original code kept for reference only + # def _get_weights(self, dist, weights): + # if weights in (None, "uniform"): + # return None + # if weights == "distance": + # # if user attempts to classify a point that was zero distance from one + # # or more training points, those training points are weighted as 1.0 + # # and the other points as 0.0 + # if dist.dtype is np.dtype(object): + # for point_dist_i, point_dist in enumerate(dist): + # # check if point_dist is iterable + # # (ex: RadiusNeighborClassifier.predict may set an element of + # # dist to 1e-6 to represent an 'outlier') + # if hasattr(point_dist, "__contains__") and 0.0 in point_dist: + # dist[point_dist_i] = point_dist == 0.0 + # else: + # dist[point_dist_i] = 1.0 / point_dist + # else: + # with np.errstate(divide="ignore"): + # dist = 1.0 / dist + # inf_mask = np.isinf(dist) + # inf_row = np.any(inf_mask, axis=1) + # dist[inf_row] = inf_mask[inf_row] + # return dist + # elif callable(weights): + # return weights(dist) + # else: + # raise ValueError( + # "weights not recognized: should be 'uniform', " + # "'distance', or a callable function" + # ) def _get_onedal_params(self, X, y=None, n_neighbors=None): class_count = 0 if self.classes_ is None else len(self.classes_) @@ -176,77 +180,119 @@ def __init__( self.p = p self.metric_params = metric_params - def _validate_targets(self, y, dtype): - arr = _column_or_1d(y, warn=True) - - try: - return arr.astype(dtype, copy=False) - except ValueError: - return arr - - def _validate_n_classes(self): - length = 0 if self.classes_ is None else len(self.classes_) - if length < 2: - raise ValueError( - f"The number of classes has to be greater than one; got {length}" - ) + # REFACTOR: _validate_targets commented out - all data conversion/validation moved to sklearnex layer + # Following PCA pattern: onedal should not do any data type conversion + # The sklearnex layer prepares data in the correct format before calling onedal + # Original code kept for reference: + # def _validate_targets(self, y, dtype): + # arr = _column_or_1d(y, warn=True) + # + # try: + # return arr.astype(dtype, copy=False) + # except ValueError: + # return arr + + # REFACTOR NOTE: _validate_n_classes moved to sklearnex/neighbors/common.py + # This method is no longer used in the onedal layer - all validation happens in sklearnex + # Commented out for reference only + # def _validate_n_classes(self): + # length = 0 if self.classes_ is None else len(self.classes_) + # if length < 2: + # raise ValueError( + # f"The number of classes has to be greater than one; got {length}" + # ) def _fit(self, X, y): + print(f"DEBUG oneDAL _fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}, y type={type(y)}", file=sys.stderr) self._onedal_model = None self._tree = None - self._shape = None - self.classes_ = None + # REFACTOR: Shape processing moved to sklearnex layer + # _shape should be set by _process_classification_targets or _process_regression_targets in sklearnex + # self._shape = None + if not hasattr(self, '_shape'): + self._shape = None + # REFACTOR STEP 1: Don't reset classes_ - it may have been set by sklearnex layer + # self.classes_ = None + if not hasattr(self, 'classes_'): + self.classes_ = None self.effective_metric_ = getattr(self, "effective_metric_", self.metric) self.effective_metric_params_ = getattr( self, "effective_metric_params_", self.metric_params ) - _, xp, _ = _get_sycl_namespace(X) - use_raw_input = _get_config().get("use_raw_input", False) is True + # _, xp, _ = _get_sycl_namespace(X) + # REFACTOR: _validate_data call commented out - validation now happens in sklearnex layer + # Original code kept for reference: + # use_raw_input = _get_config().get("use_raw_input", False) is True if y is not None or self.requires_y: - shape = getattr(y, "shape", None) - if not use_raw_input: - X, y = super()._validate_data( - X, y, dtype=[np.float64, np.float32], accept_sparse="csr" - ) - self._shape = shape if shape is not None else y.shape - + # REFACTOR: Shape processing commented out - should be done in sklearnex layer + # Original code kept for reference: + # shape = getattr(y, "shape", None) + # REFACTOR: _validate_data call commented out - validation now happens in sklearnex layer + # if not use_raw_input: + # X, y = super()._validate_data( + # X, y, dtype=[np.float64, np.float32], accept_sparse="csr" + # ) + # self._shape = shape if shape is not None else y.shape + + # REFACTOR: Classification target processing moved to sklearnex layer + # This code is now commented out - processing MUST happen in sklearnex before calling fit + # Assertion: Verify that sklearnex has done the preprocessing if _is_classifier(self): - if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1: - self.outputs_2d_ = False - y = y.reshape((-1, 1)) - else: - self.outputs_2d_ = True - - _check_classification_targets(y) - self.classes_ = [] - self._y = np.empty(y.shape, dtype=int) - for k in range(self._y.shape[1]): - classes, self._y[:, k] = np.unique(y[:, k], return_inverse=True) - self.classes_.append(classes) - - if not self.outputs_2d_: - self.classes_ = self.classes_[0] - self._y = self._y.ravel() - - self._validate_n_classes() + if not hasattr(self, 'classes_') or self.classes_ is None: + raise ValueError( + "Classification target processing must be done in sklearnex layer before calling onedal fit. " + "classes_ attribute is not set. This indicates the refactoring is incomplete." + ) + if not hasattr(self, '_y') or self._y is None: + raise ValueError( + "Classification target processing must be done in sklearnex layer before calling onedal fit. " + "_y attribute is not set. This indicates the refactoring is incomplete." + ) + print(f"DEBUG oneDAL: Using pre-processed classification targets from sklearnex (classes_={self.classes_})", file=sys.stderr) + + # Original classification processing code - NOW COMMENTED OUT (moved to sklearnex) + # if _is_classifier(self): + # if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1: + # self.outputs_2d_ = False + # y = y.reshape((-1, 1)) + # else: + # self.outputs_2d_ = True + + # _check_classification_targets(y) + # self.classes_ = [] + # self._y = np.empty(y.shape, dtype=int) + # for k in range(self._y.shape[1]): + # classes, self._y[:, k] = np.unique(y[:, k], return_inverse=True) + # self.classes_.append(classes) + + # if not self.outputs_2d_: + # self.classes_ = self.classes_[0] + # self._y = self._y.ravel() + + # self._validate_n_classes() + # else: else: + # For regressors, just store y self._y = y - elif not use_raw_input: - X, _ = super()._validate_data(X, dtype=[np.float64, np.float32]) + # REFACTOR: _validate_data call commented out - validation now happens in sklearnex layer + # elif not use_raw_input: + # X, _ = super()._validate_data(X, dtype=[np.float64, np.float32]) self.n_samples_fit_ = X.shape[0] self.n_features_in_ = X.shape[1] self._fit_X = X - if self.n_neighbors is not None: - if self.n_neighbors <= 0: - raise ValueError("Expected n_neighbors > 0. Got %d" % self.n_neighbors) - if not isinstance(self.n_neighbors, Integral): - raise TypeError( - "n_neighbors does not take %s value, " - "enter integer value" % type(self.n_neighbors) - ) + # REFACTOR: n_neighbors validation commented out - should be done in sklearnex layer + # Original code kept for reference: + # if self.n_neighbors is not None: + # if self.n_neighbors <= 0: + # raise ValueError("Expected n_neighbors > 0. Got %d" % self.n_neighbors) + # if not isinstance(self.n_neighbors, Integral): + # raise TypeError( + # "n_neighbors does not take %s value, " + # "enter integer value" % type(self.n_neighbors) + # ) self._fit_method = super()._parse_auto_method( self.algorithm, self.n_samples_fit_, self.n_features_in_ @@ -256,12 +302,26 @@ def _fit(self, X, y): queue = QM.get_global_queue() gpu_device = queue is not None and queue.sycl_device.is_gpu + print(f"DEBUG oneDAL _fit: Before _onedal_fit, X type={type(X)}, _fit_y type={type(_fit_y)}", file=sys.stderr) + # REFACTOR: All data preparation including reshaping moved to sklearnex layer + # Following PCA pattern: onedal is a thin wrapper, no data manipulation + # sklearnex prepares self._y in the correct shape before calling fit() + # Original code kept for reference: + # if _is_classifier(self) or (_is_regressor(self) and gpu_device): + # _fit_y = self._validate_targets(self._y, X.dtype).reshape((-1, 1)) + # OR for refactor without _validate_targets: + # _fit_y = self._y.reshape((-1, 1)) + + # REFACTOR: Just pass self._y as-is - sklearnex should have already reshaped it if _is_classifier(self) or (_is_regressor(self) and gpu_device): - _fit_y = self._validate_targets(self._y, X.dtype).reshape((-1, 1)) + _fit_y = self._y result = self._onedal_fit(X, _fit_y) + print(f"DEBUG oneDAL _fit: After _onedal_fit, self._fit_X type={type(self._fit_X)}, shape={getattr(self._fit_X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - if y is not None and _is_regressor(self): - self._y = y if self._shape is None else xp.reshape(y, self._shape) + # REFACTOR: Shape-based y reshaping commented out - y should already be properly shaped by sklearnex + # Original code kept for reference: + # if y is not None and _is_regressor(self): + # self._y = y if self._shape is None else xp.reshape(y, self._shape) self._onedal_model = result result = self @@ -269,111 +329,157 @@ def _fit(self, X, y): return result def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): - use_raw_input = _get_config().get("use_raw_input", False) is True - n_features = getattr(self, "n_features_in_", None) - shape = getattr(X, "shape", None) - if n_features and shape and len(shape) > 1 and shape[1] != n_features: - raise ValueError( - ( - f"X has {X.shape[1]} features, " - f"but kneighbors is expecting " - f"{n_features} features as input" - ) - ) + # REFACTOR: Feature count validation commented out - should be done in sklearnex layer + # Original validation code kept for reference: + # use_raw_input = _get_config().get("use_raw_input", False) is True + # n_features = getattr(self, "n_features_in_", None) + # shape = getattr(X, "shape", None) + # if n_features and shape and len(shape) > 1 and shape[1] != n_features: + # raise ValueError( + # ( + # f"X has {X.shape[1]} features, " + # f"but kneighbors is expecting " + # f"{n_features} features as input" + # ) + # ) + + # Still need n_features for _parse_auto_method call later + # n_features = getattr(self, "n_features_in_", None) _check_is_fitted(self) if n_neighbors is None: n_neighbors = self.n_neighbors - elif n_neighbors <= 0: - raise ValueError("Expected n_neighbors > 0. Got %d" % n_neighbors) - else: - if not isinstance(n_neighbors, Integral): - raise TypeError( - "n_neighbors does not take %s value, " - "enter integer value" % type(n_neighbors) - ) - - if X is not None: - query_is_train = False - if not use_raw_input: - X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) - else: - query_is_train = True + # REFACTOR: n_neighbors validation commented out - should be done in sklearnex layer + # Original validation code kept for reference: + # elif n_neighbors <= 0: + # raise ValueError("Expected n_neighbors > 0. Got %d" % n_neighbors) + # else: + # if not isinstance(n_neighbors, Integral): + # raise TypeError( + # "n_neighbors does not take %s value, " + # "enter integer value" % type(n_neighbors) + # ) + + # REFACTOR: X array validation commented out - should be done in sklearnex layer + # Original validation code kept for reference: + # if X is not None: + # query_is_train = False + # if not use_raw_input: + # X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + # else: + # query_is_train = True + # X = self._fit_X + # # Include an extra neighbor to account for the sample itself being + # # returned, which is removed later + # n_neighbors += 1 + + # REFACTOR: query_is_train handling moved to sklearnex layer + # All post-processing now happens in sklearnex._kneighbors_post_processing() + # Original code kept for reference: + # if X is not None: + # query_is_train = False + # else: + # query_is_train = True + # X = self._fit_X + # # Include an extra neighbor to account for the sample itself being + # # returned, which is removed later + # n_neighbors += 1 + + # REFACTOR: onedal now just returns raw results, sklearnex does all processing + # Following PCA pattern: simple onedal layer + if X is None: X = self._fit_X - # Include an extra neighbor to account for the sample itself being - # returned, which is removed later - n_neighbors += 1 - - n_samples_fit = self.n_samples_fit_ - if n_neighbors > n_samples_fit: - if query_is_train: - n_neighbors -= 1 # ok to modify inplace because an error is raised - inequality_str = "n_neighbors < n_samples_fit" - else: - inequality_str = "n_neighbors <= n_samples_fit" - raise ValueError( - f"Expected {inequality_str}, but " - f"n_neighbors = {n_neighbors}, n_samples_fit = {n_samples_fit}, " - f"n_samples = {X.shape[0]}" # include n_samples for common tests - ) - - chunked_results = None - method = self._parse_auto_method( - self._fit_method, self.n_samples_fit_, n_features - ) + # n_samples_fit = self.n_samples_fit_ + # REFACTOR: n_neighbors bounds validation moved to sklearnex layer (_onedal_kneighbors) + # Original validation code kept for reference: + # if n_neighbors > n_samples_fit: + # if query_is_train: + # n_neighbors -= 1 # ok to modify inplace because an error is raised + # inequality_str = "n_neighbors < n_samples_fit" + # else: + # inequality_str = "n_neighbors <= n_samples_fit" + # raise ValueError( + # f"Expected {inequality_str}, but " + # f"n_neighbors = {n_neighbors}, n_samples_fit = {n_samples_fit}, " + # f"n_samples = {X.shape[0]}" # include n_samples for common tests + # ) + + # chunked_results = None + # method = self._parse_auto_method( + # self._fit_method, self.n_samples_fit_, n_features + # ) + + # REFACTOR: Following PCA pattern - onedal just calls backend and returns raw results + # All post-processing (kd_tree sorting, removing self, return_distance decision) moved to sklearnex params = super()._get_onedal_params(X, n_neighbors=n_neighbors) prediction_results = self._onedal_predict(self._onedal_model, X, params) distances = from_table(prediction_results.distances) indices = from_table(prediction_results.indices) - if method == "kd_tree": - for i in range(distances.shape[0]): - seq = distances[i].argsort() - indices[i] = indices[i][seq] - distances[i] = distances[i][seq] - - if return_distance: - results = distances, indices - else: - results = indices - - if chunked_results is not None: - if return_distance: - neigh_dist, neigh_ind = zip(*chunked_results) - results = np.vstack(neigh_dist), np.vstack(neigh_ind) - else: - results = np.vstack(chunked_results) - - if not query_is_train: - return results - - # If the query data is the same as the indexed data, we would like - # to ignore the first nearest neighbor of every sample, i.e - # the sample itself. - if return_distance: - neigh_dist, neigh_ind = results - else: - neigh_ind = results - - n_queries, _ = X.shape - sample_range = np.arange(n_queries)[:, None] - sample_mask = neigh_ind != sample_range - - # Corner case: When the number of duplicates are more - # than the number of neighbors, the first NN will not - # be the sample, but a duplicate. - # In that case mask the first duplicate. - dup_gr_nbrs = np.all(sample_mask, axis=1) - sample_mask[:, 0][dup_gr_nbrs] = False - - neigh_ind = np.reshape(neigh_ind[sample_mask], (n_queries, n_neighbors - 1)) - - if return_distance: - neigh_dist = np.reshape(neigh_dist[sample_mask], (n_queries, n_neighbors - 1)) - return neigh_dist, neigh_ind - return neigh_ind + # REFACTOR: kd_tree sorting moved to sklearnex._kneighbors_post_processing() + # Original code kept for reference: + # if method == "kd_tree": + # for i in range(distances.shape[0]): + # seq = distances[i].argsort() + # indices[i] = indices[i][seq] + # distances[i] = distances[i][seq] + + # REFACTOR: return_distance decision moved to sklearnex._kneighbors_post_processing() + # onedal always returns both distances and indices (backend always computes both) + # Original code kept for reference: + # if return_distance: + # results = distances, indices + # else: + # results = indices + + # Always return both - sklearnex will decide what to return to user + results = distances, indices + + # REFACTOR: chunked_results vstack moved to sklearnex (was dead code anyway) + # Original code kept for reference: + # if chunked_results is not None: + # if return_distance: + # neigh_dist, neigh_ind = zip(*chunked_results) + # results = np.vstack(neigh_dist), np.vstack(neigh_ind) + # else: + # results = np.vstack(chunked_results) + + # REFACTOR: Removing self from results moved to sklearnex._kneighbors_post_processing() + # All query_is_train post-processing now in sklearnex layer + # Original code kept for reference: + # if not query_is_train: + # return results + # + # # If the query data is the same as the indexed data, we would like + # # to ignore the first nearest neighbor of every sample, i.e + # # the sample itself. + # if return_distance: + # neigh_dist, neigh_ind = results + # else: + # neigh_ind = results + # + # n_queries, _ = X.shape + # sample_range = np.arange(n_queries)[:, None] + # sample_mask = neigh_ind != sample_range + # + # # Corner case: When the number of duplicates are more + # # than the number of neighbors, the first NN will not + # # be the sample, but a duplicate. + # # In that case mask the first duplicate. + # dup_gr_nbrs = np.all(sample_mask, axis=1) + # sample_mask[:, 0][dup_gr_nbrs] = False + # + # neigh_ind = np.reshape(neigh_ind[sample_mask], (n_queries, n_neighbors - 1)) + # + # if return_distance: + # neigh_dist = np.reshape(neigh_dist[sample_mask], (n_queries, n_neighbors - 1)) + # return neigh_dist, neigh_ind + # return neigh_ind + + # Return raw results - sklearnex will do all post-processing + return results class KNeighborsClassifier(NeighborsBase, ClassifierMixin): @@ -412,8 +518,10 @@ def infer(self, *args, **kwargs): ... def _onedal_fit(self, X, y): # global queue is set as per user configuration (`target_offload`) or from data prior to calling this internal function queue = QM.get_global_queue() - params = self._get_onedal_params(X, y) + # REFACTOR: Convert to table FIRST, then get params from table (following PCA pattern) + # This ensures dtype is normalized (array API dtype -> numpy dtype) X_table, y_table = to_table(X, y, queue=queue) + params = self._get_onedal_params(X_table, y) return self.train(params, X_table, y_table).model def _onedal_predict(self, model, X, params): @@ -429,76 +537,100 @@ def _onedal_predict(self, model, X, params): def fit(self, X, y, queue=None): return self._fit(X, y) - @supports_queue - def predict(self, X, queue=None): - use_raw_input = _get_config().get("use_raw_input", False) is True - if not use_raw_input: - X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) - onedal_model = getattr(self, "_onedal_model", None) - n_features = getattr(self, "n_features_in_", None) - n_samples_fit_ = getattr(self, "n_samples_fit_", None) - shape = getattr(X, "shape", None) - if n_features and shape and len(shape) > 1 and shape[1] != n_features: - raise ValueError( - ( - f"X has {X.shape[1]} features, " - f"but KNNClassifier is expecting " - f"{n_features} features as input" - ) - ) - - _check_is_fitted(self) - - self._fit_method = self._parse_auto_method( - self.algorithm, n_samples_fit_, n_features - ) - - self._validate_n_classes() - - params = self._get_onedal_params(X) - prediction_result = self._onedal_predict(onedal_model, X, params) - responses = from_table(prediction_result.responses) - - result = self.classes_.take(np.asarray(responses.ravel(), dtype=np.intp)) - return result - - @supports_queue - def predict_proba(self, X, queue=None): - neigh_dist, neigh_ind = self.kneighbors(X, queue=queue) - - classes_ = self.classes_ - _y = self._y - if not self.outputs_2d_: - _y = self._y.reshape((-1, 1)) - classes_ = [self.classes_] - - n_queries = _num_samples(X) - - weights = self._get_weights(neigh_dist, self.weights) - if weights is None: - weights = np.ones_like(neigh_ind) - - all_rows = np.arange(n_queries) - probabilities = [] - for k, classes_k in enumerate(classes_): - pred_labels = _y[:, k][neigh_ind] - proba_k = np.zeros((n_queries, classes_k.size)) - - # a simple ':' index doesn't work right - for i, idx in enumerate(pred_labels.T): # loop is O(n_neighbors) - proba_k[all_rows, idx] += weights[:, i] - - # normalize 'votes' into real [0,1] probabilities - normalizer = proba_k.sum(axis=1)[:, np.newaxis] - normalizer[normalizer == 0.0] = 1.0 - proba_k /= normalizer - - probabilities.append(proba_k) - - if not self.outputs_2d_: - probabilities = probabilities[0] - - return probabilities + # REFACTOR: All prediction logic moved to sklearnex layer + # predict() and predict_proba() are no longer used - sklearnex calls kneighbors() and computes predictions + # Original code kept for reference only + # @supports_queue + # def predict(self, X, queue=None): + # print(f"DEBUG KNeighborsClassifier.predict START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + # + # # REFACTOR: _check_array validation commented out - should be done in sklearnex layer + # # Original validation code kept for reference: + # # use_raw_input = _get_config().get("use_raw_input", False) is True + # # if not use_raw_input: + # # X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + # + # onedal_model = getattr(self, "_onedal_model", None) + # n_features = getattr(self, "n_features_in_", None) + # n_samples_fit_ = getattr(self, "n_samples_fit_", None) + # + # # REFACTOR: Feature count validation commented out - should be done in sklearnex layer + # # Original validation code kept for reference: + # # shape = getattr(X, "shape", None) + # # if n_features and shape and len(shape) > 1 and shape[1] != n_features: + # # raise ValueError( + # # ( + # # f"X has {X.shape[1]} features, " + # # f"but KNNClassifier is expecting " + # # f"{n_features} features as input" + # # ) + # # ) + # + # _check_is_fitted(self) + # + # self._fit_method = self._parse_auto_method( + # self.algorithm, n_samples_fit_, n_features + # ) + # + # # REFACTOR NOTE: _validate_n_classes() is now called during fit in sklearnex layer + # # No need to validate again during predict + # # self._validate_n_classes() + # + # # Handle X=None case (LOOCV pattern) - use training data + # # This is needed because _get_onedal_params expects X to have .dtype attribute + # if X is None: + # X = self._fit_X + # + # params = self._get_onedal_params(X) + # prediction_result = self._onedal_predict(onedal_model, X, params) + # responses = from_table(prediction_result.responses) + # + # result = self.classes_.take(np.asarray(responses.ravel(), dtype=np.intp)) + # print(f"DEBUG KNeighborsClassifier.predict END: result type={type(result)}", file=sys.stderr) + # return result + # + # @supports_queue + # def predict_proba(self, X, queue=None): + # print(f"DEBUG KNeighborsClassifier.predict_proba START: X type={type(X)}", file=sys.stderr) + # neigh_dist, neigh_ind = self.kneighbors(X, queue=queue) + # + # classes_ = self.classes_ + # _y = self._y + # if not self.outputs_2d_: + # _y = self._y.reshape((-1, 1)) + # classes_ = [self.classes_] + # + # n_queries = _num_samples(X) + # + # print(f"DEBUG predict_proba: Calling _get_weights", file=sys.stderr) + # weights = self._get_weights(neigh_dist, self.weights) + # if weights is None: + # print(f"DEBUG predict_proba: weights is None, using ones_like", file=sys.stderr) + # weights = np.ones_like(neigh_ind) + # else: + # print(f"DEBUG predict_proba: weights calculated, type={type(weights)}", file=sys.stderr) + # + # all_rows = np.arange(n_queries) + # probabilities = [] + # for k, classes_k in enumerate(classes_): + # pred_labels = _y[:, k][neigh_ind] + # proba_k = np.zeros((n_queries, classes_k.size)) + # + # # a simple ':' index doesn't work right + # for i, idx in enumerate(pred_labels.T): # loop is O(n_neighbors) + # proba_k[all_rows, idx] += weights[:, i] + # + # # normalize 'votes' into real [0,1] probabilities + # normalizer = proba_k.sum(axis=1)[:, np.newaxis] + # normalizer[normalizer == 0.0] = 1.0 + # proba_k /= normalizer + # + # probabilities.append(proba_k) + # + # if not self.outputs_2d_: + # probabilities = probabilities[0] + # + # return probabilities @supports_queue def kneighbors(self, X=None, n_neighbors=None, return_distance=True, queue=None): @@ -576,22 +708,14 @@ def fit(self, X, y, queue=None): def kneighbors(self, X=None, n_neighbors=None, return_distance=True, queue=None): return self._kneighbors(X, n_neighbors, return_distance) + # REFACTOR: Keep _predict_gpu for GPU backend support (called by sklearnex) + # This is the ONLY prediction method needed in onedal - it calls the backend directly + # All computation logic (weights, averaging, etc.) is in sklearnex def _predict_gpu(self, X): - use_raw_input = _get_config().get("use_raw_input", False) is True - if not use_raw_input: - X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + # REFACTOR: Validation commented out - should be done in sklearnex layer before calling this onedal_model = getattr(self, "_onedal_model", None) n_features = getattr(self, "n_features_in_", None) n_samples_fit_ = getattr(self, "n_samples_fit_", None) - shape = getattr(X, "shape", None) - if n_features and shape and len(shape) > 1 and shape[1] != n_features: - raise ValueError( - ( - f"X has {X.shape[1]} features, " - f"but KNNClassifier is expecting " - f"{n_features} features as input" - ) - ) _check_is_fitted(self) @@ -607,39 +731,6 @@ def _predict_gpu(self, X): return result - def _predict_skl(self, X): - neigh_dist, neigh_ind = self.kneighbors(X) - - weights = self._get_weights(neigh_dist, self.weights) - - _y = self._y - if _y.ndim == 1: - _y = _y.reshape((-1, 1)) - - if weights is None: - y_pred = np.mean(_y[neigh_ind], axis=1) - else: - y_pred = np.empty((X.shape[0], _y.shape[1]), dtype=np.float64) - denom = np.sum(weights, axis=1) - - for j in range(_y.shape[1]): - num = np.sum(_y[neigh_ind, j] * weights, axis=1) - y_pred[:, j] = num / denom - - if self._y.ndim == 1: - y_pred = y_pred.ravel() - - return y_pred - - @supports_queue - def predict(self, X, queue=None): - gpu_device = queue is not None and getattr(queue.sycl_device, "is_gpu", False) - is_uniform_weights = getattr(self, "weights", "uniform") == "uniform" - if gpu_device and is_uniform_weights: - return self._predict_gpu(X) - else: - return self._predict_skl(X) - class NearestNeighbors(NeighborsBase): def __init__( @@ -671,8 +762,11 @@ def infer(self, *arg, **kwargs): ... def _onedal_fit(self, X, y): # global queue is set as per user configuration (`target_offload`) or from data prior to calling this internal function queue = QM.get_global_queue() + # REFACTOR: Convert to table FIRST, then get params from table (following PCA pattern) + # This ensures dtype is normalized (array API dtype -> numpy dtype) + # Note: NearestNeighbors has no y, so only convert X to avoid y becoming a table + X = to_table(X, queue=queue) params = self._get_onedal_params(X, y) - X, y = to_table(X, y, queue=queue) return self.train(params, X).model def _onedal_predict(self, model, X, params): @@ -687,4 +781,4 @@ def fit(self, X, y=None, queue=None): @supports_queue def kneighbors(self, X=None, n_neighbors=None, return_distance=True, queue=None): - return self._kneighbors(X, n_neighbors, return_distance) + return self._kneighbors(X, n_neighbors, return_distance) \ No newline at end of file diff --git a/onedal/neighbors/tests/test_knn_classification.py b/onedal/neighbors/tests/test_knn_classification.py index d29bdab345..783d9d6e24 100755 --- a/onedal/neighbors/tests/test_knn_classification.py +++ b/onedal/neighbors/tests/test_knn_classification.py @@ -19,31 +19,58 @@ from numpy.testing import assert_array_equal from sklearn import datasets -from onedal.neighbors import KNeighborsClassifier +# REFACTOR: Import from sklearnex instead of onedal +# Classification processing now happens in sklearnex layer +from sklearnex.neighbors import KNeighborsClassifier from onedal.tests.utils._device_selection import get_queues @pytest.mark.parametrize("queue", get_queues()) def test_iris(queue): + import sys + print(f"\n=== DEBUG test_iris START: queue={queue} ===", file=sys.stderr) + # REFACTOR NOTE: queue parameter not used with sklearnex, but kept for test parametrization iris = datasets.load_iris() - clf = KNeighborsClassifier(2).fit(iris.data, iris.target, queue=queue) - assert clf.score(iris.data, iris.target, queue=queue) > 0.9 + print(f"DEBUG test: iris.data type={type(iris.data)}, shape={iris.data.shape}", file=sys.stderr) + print(f"DEBUG test: iris.target type={type(iris.target)}, shape={iris.target.shape}", file=sys.stderr) + print(f"DEBUG test: Creating KNeighborsClassifier and calling fit", file=sys.stderr) + clf = KNeighborsClassifier(2).fit(iris.data, iris.target) + print(f"DEBUG test: fit completed, clf._fit_X type={type(getattr(clf, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + print(f"DEBUG test: Calling score", file=sys.stderr) + score = clf.score(iris.data, iris.target) + print(f"DEBUG test: score completed, score={score}", file=sys.stderr) + assert score > 0.9 assert_array_equal(clf.classes_, np.sort(clf.classes_)) + print(f"=== DEBUG test_iris END ===\n", file=sys.stderr) @pytest.mark.parametrize("queue", get_queues()) def test_pickle(queue): + import sys + print(f"\n=== DEBUG test_pickle START: queue={queue} ===", file=sys.stderr) + # REFACTOR NOTE: queue parameter not used with sklearnex, but kept for test parametrization if queue and queue.sycl_device.is_gpu: pytest.skip("KNN classifier pickling for the GPU sycl_queue is buggy.") iris = datasets.load_iris() - clf = KNeighborsClassifier(2).fit(iris.data, iris.target, queue=queue) - expected = clf.predict(iris.data, queue=queue) + print(f"DEBUG test: iris.data type={type(iris.data)}, shape={iris.data.shape}", file=sys.stderr) + print(f"DEBUG test: iris.target type={type(iris.target)}, shape={iris.target.shape}", file=sys.stderr) + print(f"DEBUG test: Creating KNeighborsClassifier and calling fit", file=sys.stderr) + clf = KNeighborsClassifier(2).fit(iris.data, iris.target) + print(f"DEBUG test: fit completed, clf._fit_X type={type(getattr(clf, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + print(f"DEBUG test: Calling predict", file=sys.stderr) + expected = clf.predict(iris.data) + print(f"DEBUG test: predict completed, expected type={type(expected)}, shape={expected.shape}", file=sys.stderr) import pickle + print(f"DEBUG test: Pickling classifier", file=sys.stderr) dump = pickle.dumps(clf) + print(f"DEBUG test: Unpickling classifier", file=sys.stderr) clf2 = pickle.loads(dump) assert type(clf2) == clf.__class__ - result = clf2.predict(iris.data, queue=queue) + print(f"DEBUG test: Calling predict on unpickled classifier", file=sys.stderr) + result = clf2.predict(iris.data) + print(f"DEBUG test: predict completed, result type={type(result)}, shape={result.shape}", file=sys.stderr) assert_array_equal(expected, result) + print(f"=== DEBUG test_pickle END ===\n", file=sys.stderr) \ No newline at end of file diff --git a/sklearnex/neighbors/_lof.py b/sklearnex/neighbors/_lof.py index 63a98164e7..7a47f25ffb 100644 --- a/sklearnex/neighbors/_lof.py +++ b/sklearnex/neighbors/_lof.py @@ -29,7 +29,7 @@ from sklearnex.neighbors.knn_unsupervised import NearestNeighbors from ..utils._array_api import get_namespace -from ..utils.validation import check_feature_names +from ..utils.validation import check_feature_names, validate_data @control_n_jobs(decorated_methods=["fit", "kneighbors", "_kneighbors"]) @@ -53,9 +53,18 @@ class LocalOutlierFactor(KNeighborsDispatchingBase, _sklearn_LocalOutlierFactor) _onedal_kneighbors = NearestNeighbors._onedal_kneighbors def _onedal_fit(self, X, y, queue=None): + import sys + print(f"DEBUG LocalOutlierFactor._onedal_fit START: X type={type(X)}, y type={type(y)}", file=sys.stderr) if sklearn_check_version("1.2"): self._validate_params() + # REFACTOR: Use validate_data from sklearnex.utils.validation to convert pandas to numpy + X = validate_data( + self, X, dtype=[np.float64, np.float32], accept_sparse="csr" + ) + print(f"DEBUG: After validate_data, X type={type(X)}", file=sys.stderr) + + print(f"DEBUG LocalOutlierFactor._onedal_fit: Calling _onedal_knn_fit", file=sys.stderr) self._onedal_knn_fit(X, y, queue=queue) if self.contamination != "auto": @@ -75,6 +84,7 @@ def _onedal_fit(self, X, y, queue=None): ) self.n_neighbors_ = max(1, min(self.n_neighbors, n_samples - 1)) + print(f"DEBUG LocalOutlierFactor._onedal_fit: Calling _onedal_kneighbors", file=sys.stderr) ( self._distances_fit_X_, _neighbors_indices_fit_X_, @@ -109,9 +119,12 @@ def _onedal_fit(self, X, y, queue=None): "Increase the number of neighbors for more accurate results." ) + print(f"DEBUG LocalOutlierFactor._onedal_fit END: _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) return self def fit(self, X, y=None): + import sys + print(f"DEBUG LocalOutlierFactor.fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) result = dispatch( self, "fit", @@ -122,9 +135,12 @@ def fit(self, X, y=None): X, None, ) + print(f"DEBUG LocalOutlierFactor.fit END: result type={type(result)}", file=sys.stderr) return result def _predict(self, X=None): + import sys + print(f"DEBUG LocalOutlierFactor._predict START: X type={type(X)}", file=sys.stderr) check_is_fitted(self) if X is not None: @@ -136,6 +152,7 @@ def _predict(self, X=None): is_inlier = np.ones(self.n_samples_fit_, dtype=int) is_inlier[self.negative_outlier_factor_ < self.offset_] = -1 + print(f"DEBUG LocalOutlierFactor._predict END: is_inlier type={type(is_inlier)}", file=sys.stderr) return is_inlier # This had to be done because predict loses the queue when no @@ -146,13 +163,28 @@ def _predict(self, X=None): @wraps(_sklearn_LocalOutlierFactor.fit_predict, assigned=["__doc__"]) @wrap_output_data def fit_predict(self, X, y=None): - return self.fit(X)._predict() + import sys + print(f"DEBUG LocalOutlierFactor.fit_predict START: X type={type(X)}", file=sys.stderr) + result = self.fit(X)._predict() + print(f"DEBUG LocalOutlierFactor.fit_predict END: result type={type(result)}", file=sys.stderr) + return result def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): + import sys + print(f"DEBUG LocalOutlierFactor._kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", file=sys.stderr) + + # Validate n_neighbors parameter first (before check_is_fitted) + if n_neighbors is not None: + self._validate_n_neighbors(n_neighbors) + check_is_fitted(self) if X is not None: check_feature_names(self, X, reset=False) - return dispatch( + + # Validate kneighbors parameters (inherited from KNeighborsDispatchingBase) + self._kneighbors_validation(X, n_neighbors) + + result = dispatch( self, "kneighbors", { @@ -163,6 +195,8 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): n_neighbors=n_neighbors, return_distance=return_distance, ) + print(f"DEBUG LocalOutlierFactor._kneighbors END: result type={type(result)}", file=sys.stderr) + return result kneighbors = wrap_output_data(_kneighbors) @@ -170,7 +204,16 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): @wraps(_sklearn_LocalOutlierFactor.score_samples, assigned=["__doc__"]) @wrap_output_data def score_samples(self, X): + import sys + print(f"DEBUG LocalOutlierFactor.score_samples START: X type={type(X)}", file=sys.stderr) check_is_fitted(self) + + # Validate and convert X (pandas to numpy if needed) + X = validate_data( + self, X, dtype=[np.float64, np.float32], accept_sparse="csr", reset=False + ) + + check_feature_names(self, X, reset=False) distances_X, neighbors_indices_X = self._kneighbors( X, n_neighbors=self.n_neighbors_ @@ -183,7 +226,9 @@ def score_samples(self, X): lrd_ratios_array = self._lrd[neighbors_indices_X] / X_lrd[:, np.newaxis] - return -np.mean(lrd_ratios_array, axis=1) + result = -np.mean(lrd_ratios_array, axis=1) + print(f"DEBUG LocalOutlierFactor.score_samples END: result type={type(result)}", file=sys.stderr) + return result fit.__doc__ = _sklearn_LocalOutlierFactor.fit.__doc__ - kneighbors.__doc__ = _sklearn_LocalOutlierFactor.kneighbors.__doc__ + kneighbors.__doc__ = _sklearn_LocalOutlierFactor.kneighbors.__doc__ \ No newline at end of file diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index ed48c48e77..a2e64a1baa 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -15,6 +15,7 @@ # ============================================================================== import warnings +from numbers import Integral import numpy as np from scipy import sparse as sp @@ -24,9 +25,19 @@ from sklearn.neighbors._kd_tree import KDTree from sklearn.utils.validation import check_is_fitted +from daal4py.sklearn._n_jobs_support import control_n_jobs from daal4py.sklearn._utils import sklearn_check_version + +from ..utils.validation import validate_data from onedal._device_offload import _transfer_to_host -from onedal.utils.validation import _check_array, _num_features, _num_samples +from onedal.utils.validation import ( + _check_array, + _check_classification_targets, + _check_X_y, + _column_or_1d, + _num_features, + _num_samples, +) from .._utils import PatchingConditionsChain from ..base import oneDALEstimator @@ -35,10 +46,526 @@ class KNeighborsDispatchingBase(oneDALEstimator): + def _parse_auto_method(self, method, n_samples, n_features): + result_method = method + + if method in ["auto", "ball_tree"]: + condition = ( + self.n_neighbors is not None and self.n_neighbors >= n_samples // 2 + ) + if self.metric == "precomputed" or n_features > 15 or condition: + result_method = "brute" + else: + if self.metric == "euclidean": + result_method = "kd_tree" + else: + result_method = "brute" + + return result_method + + # def _validate_data( + # self, X, y=None, reset=True, validate_separately=None, **check_params + # ): + # if y is None: + # if getattr(self, "requires_y", False): + # raise ValueError( + # f"This {self.__class__.__name__} estimator " + # f"requires y to be passed, but the target y is None." + # ) + # X = _check_array(X, **check_params) + # out = X, y + # else: + # if validate_separately: + # # We need this because some estimators validate X and y + # # separately, and in general, separately calling _check_array() + # # on X and y isn't equivalent to just calling _check_X_y() + # # :( + # check_X_params, check_y_params = validate_separately + # X = _check_array(X, **check_X_params) + # y = _check_array(y, **check_y_params) + # else: + # X, y = _check_X_y(X, y, **check_params) + # out = X, y + + # if check_params.get("ensure_2d", True): + # from onedal.utils.validation import _check_n_features + + # _check_n_features(self, X, reset=reset) + + # return out + + def _get_weights(self, dist, weights): + if weights in (None, "uniform"): + return None + if weights == "distance": + # Array API support: get namespace from dist array + xp, _ = get_namespace(dist) + # if user attempts to classify a point that was zero distance from one + # or more training points, those training points are weighted as 1.0 + # and the other points as 0.0 + if dist.dtype is xp.asarray(object).dtype: + for point_dist_i, point_dist in enumerate(dist): + # check if point_dist is iterable + # (ex: RadiusNeighborClassifier.predict may set an element of + # dist to 1e-6 to represent an 'outlier') + if hasattr(point_dist, "__contains__") and 0.0 in point_dist: + dist[point_dist_i] = point_dist == 0.0 + else: + dist[point_dist_i] = 1.0 / point_dist + else: + with xp.errstate(divide="ignore") if hasattr(xp, 'errstate') else np.errstate(divide="ignore"): + dist = 1.0 / dist + inf_mask = xp.isinf(dist) + inf_row = xp.any(inf_mask, axis=1) + dist[inf_row] = inf_mask[inf_row] + return dist + elif callable(weights): + return weights(dist) + else: + raise ValueError( + "weights not recognized: should be 'uniform', " + "'distance', or a callable function" + ) + + def _compute_weighted_prediction(self, neigh_dist, neigh_ind, weights_param, y_train): + """Compute weighted prediction for regression. + + Args: + neigh_dist: Distances to neighbors + neigh_ind: Indices of neighbors + weights_param: Weight parameter ('uniform', 'distance', or callable) + y_train: Training target values + + Returns: + Predicted values + """ + # Array API support: get namespace from input arrays + xp, _ = get_namespace(neigh_dist, neigh_ind, y_train) + + weights = self._get_weights(neigh_dist, weights_param) + + _y = y_train + if _y.ndim == 1: + _y = xp.reshape(_y, (-1, 1)) + + if weights is None: + # Array API: Use take() per row since array API take() only supports 1-D indices + # Build result by gathering rows one at a time + gathered_list = [] + for i in range(neigh_ind.shape[0]): + # Get indices for this sample's neighbors + sample_indices = neigh_ind[i, ...] # Shape: (n_neighbors,) + # Gather those rows from _y + sample_neighbors = xp.take(_y, sample_indices, axis=0) # Shape: (n_neighbors, n_outputs) + gathered_list.append(sample_neighbors) + # Stack and compute mean + gathered = xp.stack(gathered_list, axis=0) # Shape: (n_samples, n_neighbors, n_outputs) + y_pred = xp.mean(gathered, axis=1) + else: + y_pred = xp.empty((neigh_ind.shape[0], _y.shape[1]), dtype=xp.float64) + denom = xp.sum(weights, axis=1) + + for j in range(_y.shape[1]): + # Array API: Iterate over samples to gather values + y_col_j = _y[:, j, ...] # Shape: (n_train_samples,) + gathered_vals = [] + for i in range(neigh_ind.shape[0]): + sample_indices = neigh_ind[i, ...] # Shape: (n_neighbors,) + sample_vals = xp.take(y_col_j, sample_indices, axis=0) # Shape: (n_neighbors,) + gathered_vals.append(sample_vals) + gathered_j = xp.stack(gathered_vals, axis=0) # Shape: (n_samples, n_neighbors) + num = xp.sum(gathered_j * weights, axis=1) + y_pred[:, j, ...] = num / denom + + if y_train.ndim == 1: + y_pred = xp.reshape(y_pred, (-1,)) + + return y_pred + + def _compute_class_probabilities(self, neigh_dist, neigh_ind, weights_param, y_train, classes, outputs_2d): + """Compute class probabilities for classification. + + Args: + neigh_dist: Distances to neighbors + neigh_ind: Indices of neighbors + weights_param: Weight parameter ('uniform', 'distance', or callable) + y_train: Encoded training labels + classes: Class labels + outputs_2d: Whether output is 2D (multi-output) + + Returns: + Class probabilities + """ + from ..utils.validation import _num_samples + + # Array API support: get namespace from input arrays + xp, _ = get_namespace(neigh_dist, neigh_ind, y_train) + + _y = y_train + classes_ = classes + if not outputs_2d: + _y = xp.reshape(y_train, (-1, 1)) + classes_ = [classes] + + n_queries = neigh_ind.shape[0] + + weights = self._get_weights(neigh_dist, weights_param) + if weights is None: + # REFACTOR: Ensure weights is float for array API type promotion + # neigh_ind is int, so ones_like would give int, but we need float + weights = xp.ones_like(neigh_ind, dtype=xp.float64) + + probabilities = [] + for k, classes_k in enumerate(classes_): + # Get predicted labels for each neighbor: shape (n_samples, n_neighbors) + # _y[:, k] gives training labels for output k, then gather using neigh_ind + y_col_k = _y[:, k, ...] + + # Array API: Use take() with iteration since take() only supports 1-D indices + pred_labels_list = [] + for i in range(neigh_ind.shape[0]): + sample_indices = neigh_ind[i, ...] + sample_labels = xp.take(y_col_k, sample_indices, axis=0) + pred_labels_list.append(sample_labels) + pred_labels = xp.stack(pred_labels_list, axis=0) # Shape: (n_queries, n_neighbors) + + proba_k = xp.zeros((n_queries, classes_k.size), dtype=xp.float64) + + # Array API: Cannot use fancy indexing __setitem__ like proba_k[all_rows, idx] = ... + # Instead, build probabilities sample by sample + proba_list = [] + for sample_idx in range(n_queries): + sample_proba = xp.zeros((classes_k.size,), dtype=xp.float64) + # For this sample, accumulate weights for each neighbor's predicted class + for neighbor_idx in range(pred_labels.shape[1]): + class_label = int(pred_labels[sample_idx, neighbor_idx]) + weight = weights[sample_idx, neighbor_idx] + # Update probability for this class + sample_proba = xp.asarray([ + sample_proba[i] + weight if i == class_label else sample_proba[i] + for i in range(classes_k.size) + ]) + proba_list.append(sample_proba) + proba_k = xp.stack(proba_list, axis=0) # Shape: (n_queries, n_classes) + + # normalize 'votes' into real [0,1] probabilities + normalizer = xp.sum(proba_k, axis=1)[:, xp.newaxis] + normalizer[normalizer == 0.0] = 1.0 + proba_k /= normalizer + + probabilities.append(proba_k) + + if not outputs_2d: + probabilities = probabilities[0] + + return probabilities + + def _predict_skl_regression(self, X): + """SKL prediction path for regression - calls kneighbors, computes predictions. + + This method handles X=None (LOOCV) properly by calling self.kneighbors which + has the query_is_train logic. + + Args: + X: Query samples (or None for LOOCV) + Returns: + Predicted regression values + """ + neigh_dist, neigh_ind = self.kneighbors(X) + return self._compute_weighted_prediction( + neigh_dist, neigh_ind, self.weights, self._y + ) + + def _predict_skl_classification(self, X): + """SKL prediction path for classification - calls kneighbors, computes predictions. + + This method handles X=None (LOOCV) properly by calling self.kneighbors which + has the query_is_train logic. + + Args: + X: Query samples (or None for LOOCV) + Returns: + Predicted class labels + """ + neigh_dist, neigh_ind = self.kneighbors(X) + proba = self._compute_class_probabilities( + neigh_dist, neigh_ind, self.weights, self._y, self.classes_, self.outputs_2d_ + ) + # Array API support: get namespace from probability array + xp, _ = get_namespace(proba) + + if not self.outputs_2d_: + # Single output: classes_[argmax(proba, axis=1)] + result = self.classes_[xp.argmax(proba, axis=1)] + else: + # Multi-output: apply argmax separately for each output + result = [classes_k[xp.argmax(proba_k, axis=1)] + for classes_k, proba_k in zip(self.classes_, proba.T)] + result = xp.asarray(result).T + + return result + + def _validate_targets(self, y, dtype): + arr = _column_or_1d(y, warn=True) + + try: + return arr.astype(dtype, copy=False) + except ValueError: + return arr + + def _validate_n_neighbors(self, n_neighbors): + if n_neighbors is not None: + if n_neighbors <= 0: + raise ValueError("Expected n_neighbors > 0. Got %d" % n_neighbors) + if not isinstance(n_neighbors, Integral): + raise TypeError( + "n_neighbors does not take %s value, " + "enter integer value" % type(n_neighbors) + ) + + def _validate_n_classes(self): + """Validate that the classifier has at least 2 classes.""" + length = 0 if self.classes_ is None else len(self.classes_) + if length < 2: + raise ValueError( + f"The number of classes has to be greater than one; got {length}" + ) + + def _validate_feature_count(self, X, method_name=""): + n_features = getattr(self, "n_features_in_", None) + shape = getattr(X, "shape", None) + if n_features and shape and len(shape) > 1 and shape[1] != n_features: + raise ValueError( + ( + f"X has {X.shape[1]} features, " + f"but {method_name} is expecting " + f"{n_features} features as input" + ) + ) + + def _validate_kneighbors_bounds(self, n_neighbors, query_is_train, X): + n_samples_fit = self.n_samples_fit_ + if n_neighbors > n_samples_fit: + if query_is_train: + n_neighbors -= 1 # ok to modify inplace because an error is raised + inequality_str = "n_neighbors < n_samples_fit" + else: + inequality_str = "n_neighbors <= n_samples_fit" + raise ValueError( + f"Expected {inequality_str}, but " + f"n_neighbors = {n_neighbors}, n_samples_fit = {n_samples_fit}, " + f"n_samples = {X.shape[0]}" # include n_samples for common tests + ) + + def _kneighbors_validation(self, X, n_neighbors): + """Shared validation for kneighbors method called from sklearnex layer. + + Validates: + - Feature count matches training data if X is provided + - n_neighbors is within valid bounds if provided + """ + # Validate feature count if X is provided + if X is not None: + self._validate_feature_count(X) + + # Validate n_neighbors bounds if provided + if n_neighbors is not None: + # Determine if query is the training set + query_is_train = X is None or (hasattr(self, '_fit_X') and X is self._fit_X) + self._validate_kneighbors_bounds(n_neighbors, query_is_train, X if X is not None else self._fit_X) + + def _prepare_kneighbors_inputs(self, X, n_neighbors): + """Prepare inputs for kneighbors call to onedal backend. + + Handles query_is_train case: when X=None, sets X to training data and adds +1 to n_neighbors. + Validates n_neighbors bounds AFTER adding +1 (replicates original onedal behavior). + + Args: + X: Query data or None + n_neighbors: Number of neighbors or None + + Returns: + Tuple of (X, n_neighbors, query_is_train) + - X: Processed query data (self._fit_X if original X was None) + - n_neighbors: Adjusted n_neighbors (includes +1 if query_is_train) + - query_is_train: Boolean flag indicating if original X was None + """ + query_is_train = X is None + + if X is not None: + # Get the array namespace to use correct dtypes + xp, _ = get_namespace(X) + # Use _check_array like main branch, with array API dtype support + X = _check_array( + X, dtype=[xp.float64, xp.float32], accept_sparse="csr" + ) + else: + X = self._fit_X + # Include an extra neighbor to account for the sample itself being + # returned, which is removed later + if n_neighbors is None: + n_neighbors = self.n_neighbors + n_neighbors += 1 + + # Validate bounds AFTER adding +1 (replicates original onedal behavior) + # Original code in onedal had validation after n_neighbors += 1 + n_samples_fit = self.n_samples_fit_ + if n_neighbors > n_samples_fit: + n_neighbors_for_msg = n_neighbors - 1 # for error message, show original value + raise ValueError( + f"Expected n_neighbors < n_samples_fit, but " + f"n_neighbors = {n_neighbors_for_msg}, n_samples_fit = {n_samples_fit}, " + f"n_samples = {X.shape[0]}" + ) + + return X, n_neighbors, query_is_train + + def _kneighbors_post_processing(self, X, n_neighbors, return_distance, result, query_is_train): + """Shared post-processing for kneighbors results. + + Following PCA pattern: all post-processing in sklearnex, onedal returns raw results. + Replicates exact logic from main branch onedal._kneighbors() method. + + Handles (in order, matching main branch): + 1. kd_tree sorting: sorts results by distance (BEFORE deciding what to return) + 2. query_is_train case (X=None): removes self from results + 3. return_distance decision: return distances+indices or just indices + + Args: + X: Query data (self._fit_X if query_is_train) + n_neighbors: Number of neighbors (already includes +1 if query_is_train) + return_distance: Whether to return distances to user + result: Raw result from onedal backend - always (distances, indices) + query_is_train: Boolean indicating if original X was None + + Returns: + Post-processed result: (distances, indices) if return_distance else indices + """ + # Array API support: get namespace from result arrays + # onedal always returns both distances and indices (backend computes both) + distances, indices = result + xp, _ = get_namespace(distances, indices) + + # POST-PROCESSING STEP 1: kd_tree sorting (moved from onedal) + # This happens BEFORE deciding what to return, using distances that are always available + # Matches main branch: sorting uses distances even when return_distance=False + if self._fit_method == "kd_tree": + for i in range(distances.shape[0]): + seq = xp.argsort(distances[i]) + indices[i] = indices[i][seq] + distances[i] = distances[i][seq] + + # POST-PROCESSING STEP 2: Decide what to return (moved from onedal) + # This happens AFTER kd_tree sorting + if return_distance: + results = distances, indices + else: + results = indices + + # POST-PROCESSING STEP 3: Remove self from results when query_is_train (moved from onedal) + # This happens LAST, after sorting and after deciding format + if not query_is_train: + return results + + # If the query data is the same as the indexed data, we would like + # to ignore the first nearest neighbor of every sample, i.e the sample itself. + if return_distance: + neigh_dist, neigh_ind = results + else: + neigh_ind = results + + # X is self._fit_X in query_is_train case (set by caller) + n_queries, _ = X.shape + sample_range = xp.arange(n_queries)[:, xp.newaxis] + sample_mask = neigh_ind != sample_range + + # Corner case: When the number of duplicates are more + # than the number of neighbors, the first NN will not + # be the sample, but a duplicate. + # In that case mask the first duplicate. + dup_gr_nbrs = xp.all(sample_mask, axis=1) + sample_mask[:, 0][dup_gr_nbrs] = False + + neigh_ind = xp.reshape(neigh_ind[sample_mask], (n_queries, n_neighbors - 1)) + + if return_distance: + neigh_dist = xp.reshape(neigh_dist[sample_mask], (n_queries, n_neighbors - 1)) + return neigh_dist, neigh_ind + return neigh_ind + + def _process_classification_targets(self, y): + """Process classification targets and set class-related attributes. + + Note: y should already be converted to numpy array via validate_data before calling this. + """ + import sys + print(f"DEBUG _process_classification_targets: y type={type(y)}, y shape={getattr(y, 'shape', 'NO_SHAPE')}", file=sys.stderr) + + # Array API support: get namespace from y + xp, _ = get_namespace(y) + + # y should already be numpy array from validate_data + y = xp.asarray(y) + + # Handle shape processing + shape = getattr(y, "shape", None) + self._shape = shape if shape is not None else y.shape + + if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1: + self.outputs_2d_ = False + y = xp.reshape(y, (-1, 1)) + else: + self.outputs_2d_ = True + + # Validate classification targets + _check_classification_targets(y) + + # Process classes - note: np.unique is used for class extraction + # This is acceptable as classes are typically numpy arrays in sklearn + self.classes_ = [] + self._y = xp.empty(y.shape, dtype=xp.int32) + for k in range(self._y.shape[1]): + # Use numpy unique for class extraction (standard sklearn pattern) + y_k = np.asarray(y[:, k]) + classes, indices = np.unique(y_k, return_inverse=True) + self.classes_.append(classes) + self._y[:, k] = xp.asarray(indices) + + if not self.outputs_2d_: + self.classes_ = self.classes_[0] + self._y = xp.reshape(self._y, (-1,)) + + # Validate we have at least 2 classes + self._validate_n_classes() + + return y + + def _process_regression_targets(self, y): + """Process regression targets and set shape-related attributes. + + REFACTOR: This replicates the EXACT shape processing that was in onedal _fit. + Original onedal code: + shape = getattr(y, "shape", None) + self._shape = shape if shape is not None else y.shape + # (later, after fit) + self._y = y if self._shape is None else xp.reshape(y, self._shape) + + For now, just store _shape and _y as-is. The reshape happens after onedal fit is complete. + """ + import sys + # EXACT replication of original onedal shape processing + shape = getattr(y, "shape", None) + self._shape = shape if shape is not None else y.shape + self._y = y + print(f"DEBUG _process_regression_targets: _y type={type(self._y)}, _shape={self._shape}", file=sys.stderr) + return y + def _fit_validation(self, X, y=None): if sklearn_check_version("1.2"): self._validate_params() check_feature_names(self, X, reset=True) + # Validate n_neighbors parameter + self._validate_n_neighbors(self.n_neighbors) if self.metric_params is not None and "p" in self.metric_params: if self.p is not None: warnings.warn( @@ -67,8 +594,12 @@ def _fit_validation(self, X, y=None): self.effective_metric_ = "chebyshev" if not isinstance(X, (KDTree, BallTree, _sklearn_NeighborsBase)): + # Use _check_array like main branch, but with array API dtype support + # Get array namespace for array API support + # Don't check for NaN - let oneDAL handle it (will fallback to sklearn if needed) + xp, _ = get_namespace(X) self._fit_X = _check_array( - X, dtype=[np.float64, np.float32], accept_sparse=True + X, dtype=[xp.float64, xp.float32], accept_sparse=True, force_all_finite=False ) self.n_samples_fit_ = _num_samples(self._fit_X) self.n_features_in_ = _num_features(self._fit_X) @@ -199,9 +730,13 @@ def _onedal_supported(self, device, method_name, *data): y = None # To check multioutput, might be overhead if len(data) > 1: - y = np.asarray(data[1]) + # Array API support: get namespace from y + y_input = data[1] + xp, _ = get_namespace(y_input) + y = xp.asarray(y_input) if is_classifier: - class_count = len(np.unique(y)) + # Use numpy for unique (standard sklearn pattern) + class_count = len(np.unique(np.asarray(y))) if hasattr(self, "_onedal_estimator"): y = self._onedal_estimator._y if y is not None and hasattr(y, "ndim") and hasattr(y, "shape"): @@ -284,13 +819,17 @@ def kneighbors_graph(self, X=None, n_neighbors=None, mode="connectivity"): # requires moving data to host to construct the csr_matrix if mode == "connectivity": A_ind = self.kneighbors(X, n_neighbors, return_distance=False) + # Transfer to host - after this, arrays are numpy _, (A_ind,) = _transfer_to_host(A_ind) n_queries = A_ind.shape[0] + # Use numpy after transfer to host A_data = np.ones(n_queries * n_neighbors) elif mode == "distance": A_data, A_ind = self.kneighbors(X, n_neighbors, return_distance=True) + # Transfer to host - after this, arrays are numpy _, (A_data, A_ind) = _transfer_to_host(A_data, A_ind) + # Use numpy after transfer to host A_data = np.reshape(A_data, (-1,)) else: @@ -302,6 +841,7 @@ def kneighbors_graph(self, X=None, n_neighbors=None, mode="connectivity"): n_queries = A_ind.shape[0] n_samples_fit = self.n_samples_fit_ n_nonzero = n_queries * n_neighbors + # Use numpy after transfer to host A_indptr = np.arange(0, n_nonzero + 1, n_neighbors) kneighbors_graph = sp.csr_matrix( @@ -310,4 +850,4 @@ def kneighbors_graph(self, X=None, n_neighbors=None, mode="connectivity"): return kneighbors_graph - kneighbors_graph.__doc__ = KNeighborsMixin.kneighbors_graph.__doc__ + kneighbors_graph.__doc__ = KNeighborsMixin.kneighbors_graph.__doc__ \ No newline at end of file diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index 7e25fa5ae1..050957d9e2 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -14,6 +14,7 @@ # limitations under the License. # =============================================================================== +import numpy as np from sklearn.metrics import accuracy_score from sklearn.neighbors._classification import ( KNeighborsClassifier as _sklearn_KNeighborsClassifier, @@ -26,10 +27,11 @@ from onedal.neighbors import KNeighborsClassifier as onedal_KNeighborsClassifier from .._device_offload import dispatch, wrap_output_data -from ..utils.validation import check_feature_names +from ..utils._array_api import enable_array_api, get_namespace +from ..utils.validation import check_feature_names, validate_data from .common import KNeighborsDispatchingBase - +@enable_array_api @control_n_jobs( decorated_methods=["fit", "predict", "predict_proba", "kneighbors", "score"] ) @@ -64,6 +66,8 @@ def __init__( ) def fit(self, X, y): + import sys + print(f"DEBUG KNeighborsClassifier.fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}, y type={type(y)}", file=sys.stderr) dispatch( self, "fit", @@ -74,13 +78,16 @@ def fit(self, X, y): X, y, ) + print(f"DEBUG KNeighborsClassifier.fit END: _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) return self @wrap_output_data def predict(self, X): + import sys + print(f"DEBUG KNeighborsClassifier.predict START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) check_is_fitted(self) check_feature_names(self, X, reset=False) - return dispatch( + result = dispatch( self, "predict", { @@ -89,12 +96,16 @@ def predict(self, X): }, X, ) + print(f"DEBUG KNeighborsClassifier.predict END: result type={type(result)}", file=sys.stderr) + return result @wrap_output_data def predict_proba(self, X): + import sys + print(f"DEBUG KNeighborsClassifier.predict_proba START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) check_is_fitted(self) check_feature_names(self, X, reset=False) - return dispatch( + result = dispatch( self, "predict_proba", { @@ -103,12 +114,16 @@ def predict_proba(self, X): }, X, ) + print(f"DEBUG KNeighborsClassifier.predict_proba END: result type={type(result)}", file=sys.stderr) + return result @wrap_output_data def score(self, X, y, sample_weight=None): + import sys + print(f"DEBUG KNeighborsClassifier.score START: X type={type(X)}, y type={type(y)}", file=sys.stderr) check_is_fitted(self) check_feature_names(self, X, reset=False) - return dispatch( + result = dispatch( self, "score", { @@ -119,13 +134,26 @@ def score(self, X, y, sample_weight=None): y, sample_weight=sample_weight, ) + print(f"DEBUG KNeighborsClassifier.score END: result={result}", file=sys.stderr) + return result @wrap_output_data def kneighbors(self, X=None, n_neighbors=None, return_distance=True): + import sys + print(f"DEBUG KNeighborsClassifier.kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", file=sys.stderr) + + # Validate n_neighbors parameter first (before check_is_fitted) + if n_neighbors is not None: + self._validate_n_neighbors(n_neighbors) + check_is_fitted(self) if X is not None: check_feature_names(self, X, reset=False) - return dispatch( + + # Validate kneighbors parameters (inherited from KNeighborsDispatchingBase) + self._kneighbors_validation(X, n_neighbors) + + result = dispatch( self, "kneighbors", { @@ -136,8 +164,29 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): n_neighbors=n_neighbors, return_distance=return_distance, ) + print(f"DEBUG KNeighborsClassifier.kneighbors END: result type={type(result)}", file=sys.stderr) + return result def _onedal_fit(self, X, y, queue=None): + import sys + print(f"DEBUG KNeighborsClassifier._onedal_fit START: X type={type(X)}, y type={type(y)}", file=sys.stderr) + + # Get array namespace for array API support + xp, _ = get_namespace(X) + print(f"DEBUG: Array namespace: {xp}", file=sys.stderr) + + # REFACTOR: Use validate_data to convert pandas to numpy and validate types + # force_all_finite=False to allow nan_euclidean metric to work (will fallback to sklearn) + X, y = validate_data( + self, X, y, dtype=[xp.float64, xp.float32], accept_sparse="csr" + ) + print(f"DEBUG: After validate_data, X type={type(X)}, y type={type(y)}", file=sys.stderr) + + # REFACTOR STEP 1: Process classification targets in sklearnex before passing to onedal + print(f"DEBUG: Processing classification targets in sklearnex", file=sys.stderr) + y_processed = self._process_classification_targets(y) + print(f"DEBUG: After _process_classification_targets, y_processed type={type(y_processed)}", file=sys.stderr) + onedal_params = { "n_neighbors": self.n_neighbors, "weights": self.weights, @@ -150,40 +199,111 @@ def _onedal_fit(self, X, y, queue=None): self._onedal_estimator.requires_y = get_requires_y_tag(self) self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ + + # REFACTOR: Pass both original and processed targets to onedal + # onedal needs the processed classes_ and _y attributes that we just set + self._onedal_estimator.classes_ = self.classes_ + self._onedal_estimator._y = self._y + self._onedal_estimator.outputs_2d_ = self.outputs_2d_ + self._onedal_estimator._shape = self._shape # Pass shape from sklearnex + print(f"DEBUG: Set onedal_estimator.classes_={self._onedal_estimator.classes_}", file=sys.stderr) + print(f"DEBUG: Set onedal_estimator._y shape={self._onedal_estimator._y.shape}", file=sys.stderr) + print(f"DEBUG: Set onedal_estimator._shape={self._onedal_estimator._shape}", file=sys.stderr) + + print(f"DEBUG KNeighborsClassifier._onedal_fit: Calling onedal_estimator.fit with X and original y", file=sys.stderr) + # Pass original y to onedal - it will use the pre-set classes_ and _y attributes we just assigned self._onedal_estimator.fit(X, y, queue=queue) + print(f"DEBUG KNeighborsClassifier._onedal_fit: After fit, calling _save_attributes", file=sys.stderr) self._save_attributes() + print(f"DEBUG KNeighborsClassifier._onedal_fit END: self._fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) def _onedal_predict(self, X, queue=None): - return self._onedal_estimator.predict(X, queue=queue) + import sys + print(f"DEBUG KNeighborsClassifier._onedal_predict START: X type={type(X)}", file=sys.stderr) + + # Validate X to convert array API to numpy + if X is not None: + xp, _ = get_namespace(X) + X = validate_data( + self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr", reset=False + ) + + # Use the unified helper from common.py (calls kneighbors + computes prediction) + # This properly handles X=None (LOOCV) case + result = self._predict_skl_classification(X) + + print(f"DEBUG KNeighborsClassifier._onedal_predict END: result type={type(result)}", file=sys.stderr) + return result def _onedal_predict_proba(self, X, queue=None): - return self._onedal_estimator.predict_proba(X, queue=queue) + import sys + print(f"DEBUG KNeighborsClassifier._onedal_predict_proba START: X type={type(X)}", file=sys.stderr) + + # Call kneighbors through sklearnex (self.kneighbors is the sklearnex method) + # This properly handles X=None case (LOOCV) with query_is_train logic + neigh_dist, neigh_ind = self.kneighbors(X) + + # Use the helper method to compute class probabilities + result = self._compute_class_probabilities( + neigh_dist, neigh_ind, self.weights, self._y, self.classes_, self.outputs_2d_ + ) + + print(f"DEBUG KNeighborsClassifier._onedal_predict_proba END: result type={type(result)}", file=sys.stderr) + return result def _onedal_kneighbors( self, X=None, n_neighbors=None, return_distance=True, queue=None ): - return self._onedal_estimator.kneighbors( + import sys + print(f"DEBUG KNeighborsClassifier._onedal_kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", file=sys.stderr) + + # REFACTOR: All post-processing now in sklearnex following PCA pattern + # Prepare inputs and handle query_is_train case + X, n_neighbors, query_is_train = self._prepare_kneighbors_inputs(X, n_neighbors) + + # Get raw results from onedal backend + result = self._onedal_estimator.kneighbors( X, n_neighbors, return_distance, queue=queue ) + + # Apply post-processing (kd_tree sorting, removing self from results) + result = self._kneighbors_post_processing(X, n_neighbors, return_distance, result, query_is_train) + + print(f"DEBUG KNeighborsClassifier._onedal_kneighbors END: result type={type(result)}", file=sys.stderr) + return result def _onedal_score(self, X, y, sample_weight=None, queue=None): - return accuracy_score( + import sys + print(f"DEBUG KNeighborsClassifier._onedal_score START: X type={type(X)}, y type={type(y)}", file=sys.stderr) + # Convert array API to numpy for sklearn's accuracy_score + # Note: validate_data does NOT convert array API to numpy, so we do it explicitly + y = np.asarray(y) + if sample_weight is not None: + sample_weight = np.asarray(sample_weight) + result = accuracy_score( y, self._onedal_predict(X, queue=queue), sample_weight=sample_weight ) + print(f"DEBUG KNeighborsClassifier._onedal_score END: result={result}", file=sys.stderr) + return result def _save_attributes(self): + import sys + print(f"DEBUG KNeighborsClassifier._save_attributes START", file=sys.stderr) self.classes_ = self._onedal_estimator.classes_ self.n_features_in_ = self._onedal_estimator.n_features_in_ self.n_samples_fit_ = self._onedal_estimator.n_samples_fit_ self._fit_X = self._onedal_estimator._fit_X + print(f"DEBUG KNeighborsClassifier._save_attributes: _fit_X type={type(self._fit_X)}", file=sys.stderr) self._y = self._onedal_estimator._y + print(f"DEBUG KNeighborsClassifier._save_attributes: _y type={type(self._y)}", file=sys.stderr) self._fit_method = self._onedal_estimator._fit_method self.outputs_2d_ = self._onedal_estimator.outputs_2d_ self._tree = self._onedal_estimator._tree + print(f"DEBUG KNeighborsClassifier._save_attributes END", file=sys.stderr) fit.__doc__ = _sklearn_KNeighborsClassifier.fit.__doc__ predict.__doc__ = _sklearn_KNeighborsClassifier.predict.__doc__ predict_proba.__doc__ = _sklearn_KNeighborsClassifier.predict_proba.__doc__ score.__doc__ = _sklearn_KNeighborsClassifier.score.__doc__ - kneighbors.__doc__ = _sklearn_KNeighborsClassifier.kneighbors.__doc__ + kneighbors.__doc__ = _sklearn_KNeighborsClassifier.kneighbors.__doc__ \ No newline at end of file diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index ba1626b4ff..665e22c87f 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -14,6 +14,7 @@ # limitations under the License. # ============================================================================== +import numpy as np from sklearn.metrics import r2_score from sklearn.neighbors._regression import ( KNeighborsRegressor as _sklearn_KNeighborsRegressor, @@ -26,10 +27,12 @@ from onedal.neighbors import KNeighborsRegressor as onedal_KNeighborsRegressor from .._device_offload import dispatch, wrap_output_data -from ..utils.validation import check_feature_names +from ..utils._array_api import enable_array_api, get_namespace +from ..utils.validation import check_feature_names, validate_data from .common import KNeighborsDispatchingBase +@enable_array_api @control_n_jobs(decorated_methods=["fit", "predict", "kneighbors", "score"]) class KNeighborsRegressor(KNeighborsDispatchingBase, _sklearn_KNeighborsRegressor): __doc__ = _sklearn_KNeighborsRegressor.__doc__ @@ -62,6 +65,8 @@ def __init__( ) def fit(self, X, y): + import sys + print(f"DEBUG KNeighborsRegressor.fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}, y type={type(y)}", file=sys.stderr) dispatch( self, "fit", @@ -72,13 +77,16 @@ def fit(self, X, y): X, y, ) + print(f"DEBUG KNeighborsRegressor.fit END: _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) return self @wrap_output_data def predict(self, X): + import sys + print(f"DEBUG KNeighborsRegressor.predict START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) check_is_fitted(self) check_feature_names(self, X, reset=False) - return dispatch( + result = dispatch( self, "predict", { @@ -87,12 +95,16 @@ def predict(self, X): }, X, ) + print(f"DEBUG KNeighborsRegressor.predict END: result type={type(result)}", file=sys.stderr) + return result @wrap_output_data def score(self, X, y, sample_weight=None): + import sys + print(f"DEBUG KNeighborsRegressor.score START: X type={type(X)}, y type={type(y)}", file=sys.stderr) check_is_fitted(self) check_feature_names(self, X, reset=False) - return dispatch( + result = dispatch( self, "score", { @@ -103,13 +115,26 @@ def score(self, X, y, sample_weight=None): y, sample_weight=sample_weight, ) + print(f"DEBUG KNeighborsRegressor.score END: result={result}", file=sys.stderr) + return result @wrap_output_data def kneighbors(self, X=None, n_neighbors=None, return_distance=True): + import sys + print(f"DEBUG KNeighborsRegressor.kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", file=sys.stderr) + + # Validate n_neighbors parameter first (before check_is_fitted) + if n_neighbors is not None: + self._validate_n_neighbors(n_neighbors) + check_is_fitted(self) if X is not None: check_feature_names(self, X, reset=False) - return dispatch( + + # Validate kneighbors parameters (inherited from KNeighborsDispatchingBase) + self._kneighbors_validation(X, n_neighbors) + + result = dispatch( self, "kneighbors", { @@ -120,8 +145,30 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): n_neighbors=n_neighbors, return_distance=return_distance, ) + print(f"DEBUG KNeighborsRegressor.kneighbors END: result type={type(result)}", file=sys.stderr) + return result def _onedal_fit(self, X, y, queue=None): + import sys + print(f"DEBUG KNeighborsRegressor._onedal_fit START: X type={type(X)}, y type={type(y)}", file=sys.stderr) + + # Get array namespace for array API support + xp, _ = get_namespace(X) + print(f"DEBUG: Array namespace: {xp}", file=sys.stderr) + + # REFACTOR: Use validate_data to convert pandas to numpy and validate types for X only + # force_all_finite=False to allow nan_euclidean metric to work (will fallback to sklearn) + X = validate_data( + self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr" + ) + print(f"DEBUG: After validate_data, X type={type(X)}, y type={type(y)}", file=sys.stderr) + + # REFACTOR: Process regression targets in sklearnex before passing to onedal + # This sets _shape and _y attributes + print(f"DEBUG: Processing regression targets in sklearnex", file=sys.stderr) + y_processed = self._process_regression_targets(y) + print(f"DEBUG: After _process_regression_targets, _shape={self._shape}, _y type={type(self._y)}", file=sys.stderr) + onedal_params = { "n_neighbors": self.n_neighbors, "weights": self.weights, @@ -134,34 +181,133 @@ def _onedal_fit(self, X, y, queue=None): self._onedal_estimator.requires_y = get_requires_y_tag(self) self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ + + # REFACTOR: Pass pre-processed shape and _y to onedal + # For GPU backend, reshape _y to (-1, 1) before passing to onedal + from onedal.utils import _sycl_queue_manager as QM + queue_instance = QM.get_global_queue() + gpu_device = queue_instance is not None and queue_instance.sycl_device.is_gpu + + self._onedal_estimator._shape = self._shape + # REFACTOR: Reshape _y for GPU backend (needs column vector) + # Following PCA pattern: all data preparation in sklearnex + if gpu_device: + self._onedal_estimator._y = xp.reshape(self._y, (-1, 1)) + else: + self._onedal_estimator._y = self._y + print(f"DEBUG: Set onedal_estimator._shape={self._onedal_estimator._shape}", file=sys.stderr) + print(f"DEBUG: GPU device={gpu_device}, _y shape={self._onedal_estimator._y.shape}", file=sys.stderr) + + print(f"DEBUG KNeighborsRegressor._onedal_fit: Calling onedal_estimator.fit", file=sys.stderr) self._onedal_estimator.fit(X, y, queue=queue) + print(f"DEBUG KNeighborsRegressor._onedal_fit: After fit, calling _save_attributes", file=sys.stderr) self._save_attributes() + + # REFACTOR: Replicate the EXACT post-fit reshaping from original onedal code + # Original onedal code (after fit): + # if y is not None and _is_regressor(self): + # _, xp, _ = _get_sycl_namespace(X) + # self._y = y if self._shape is None else xp.reshape(y, self._shape) + # Now doing this in sklearnex layer + if y is not None: + xp, _ = get_namespace(y) + self._y = y if self._shape is None else xp.reshape(y, self._shape) + # Also update the onedal estimator's _y since that's what gets used in predict + self._onedal_estimator._y = self._y + print(f"DEBUG: After reshape, self._y type={type(self._y)}, shape={getattr(self._y, 'shape', 'NO_SHAPE')}", file=sys.stderr) + + print(f"DEBUG KNeighborsRegressor._onedal_fit END: self._fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) def _onedal_predict(self, X, queue=None): - return self._onedal_estimator.predict(X, queue=queue) + import sys + print(f"DEBUG KNeighborsRegressor._onedal_predict START: X type={type(X)}", file=sys.stderr) + + # Dispatch between GPU and SKL prediction methods + # This logic matches onedal regressor predict() method but computation happens in sklearnex + gpu_device = queue is not None and getattr(queue.sycl_device, "is_gpu", False) + is_uniform_weights = getattr(self, "weights", "uniform") == "uniform" + + if gpu_device and is_uniform_weights: + # GPU path: call onedal backend directly + result = self._predict_gpu(X, queue=queue) + else: + # SKL path: call kneighbors (through sklearnex) then compute in sklearnex + result = self._predict_skl(X, queue=queue) + + print(f"DEBUG KNeighborsRegressor._onedal_predict END: result type={type(result)}", file=sys.stderr) + return result + + def _predict_gpu(self, X, queue=None): + """GPU prediction path - validates X and calls onedal backend.""" + import sys + print(f"DEBUG KNeighborsRegressor._predict_gpu START: X type={type(X)}", file=sys.stderr) + # Validate and convert X (pandas to numpy if needed) only if X is not None + if X is not None: + xp, _ = get_namespace(X) + X = validate_data( + self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr", reset=False, force_all_finite=False + ) + # Call onedal backend for GPU prediction + result = self._onedal_estimator._predict_gpu(X) + print(f"DEBUG KNeighborsRegressor._predict_gpu END: result type={type(result)}", file=sys.stderr) + return result + + def _predict_skl(self, X, queue=None): + """SKL prediction path - calls kneighbors through sklearnex, computes prediction here.""" + import sys + print(f"DEBUG KNeighborsRegressor._predict_skl START: X type={type(X)}", file=sys.stderr) + + # Use the unified helper from common.py (calls kneighbors + computes prediction) + result = self._predict_skl_regression(X) + + print(f"DEBUG KNeighborsRegressor._predict_skl END: result type={type(result)}", file=sys.stderr) + return result def _onedal_kneighbors( self, X=None, n_neighbors=None, return_distance=True, queue=None ): - return self._onedal_estimator.kneighbors( + import sys + print(f"DEBUG KNeighborsRegressor._onedal_kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", file=sys.stderr) + + # REFACTOR: All post-processing now in sklearnex following PCA pattern + # Prepare inputs and handle query_is_train case + X, n_neighbors, query_is_train = self._prepare_kneighbors_inputs(X, n_neighbors) + + # Get raw results from onedal backend + result = self._onedal_estimator.kneighbors( X, n_neighbors, return_distance, queue=queue ) + + # Apply post-processing (kd_tree sorting, removing self from results) + result = self._kneighbors_post_processing(X, n_neighbors, return_distance, result, query_is_train) + + print(f"DEBUG KNeighborsRegressor._onedal_kneighbors END: result type={type(result)}", file=sys.stderr) + return result def _onedal_score(self, X, y, sample_weight=None, queue=None): - return r2_score( + import sys + print(f"DEBUG KNeighborsRegressor._onedal_score START: X type={type(X)}, y type={type(y)}", file=sys.stderr) + result = r2_score( y, self._onedal_predict(X, queue=queue), sample_weight=sample_weight ) + print(f"DEBUG KNeighborsRegressor._onedal_score END: result={result}", file=sys.stderr) + return result def _save_attributes(self): + import sys + print(f"DEBUG KNeighborsRegressor._save_attributes START", file=sys.stderr) self.n_features_in_ = self._onedal_estimator.n_features_in_ self.n_samples_fit_ = self._onedal_estimator.n_samples_fit_ self._fit_X = self._onedal_estimator._fit_X + print(f"DEBUG KNeighborsRegressor._save_attributes: _fit_X type={type(self._fit_X)}", file=sys.stderr) self._y = self._onedal_estimator._y + print(f"DEBUG KNeighborsRegressor._save_attributes: _y type={type(self._y)}", file=sys.stderr) self._fit_method = self._onedal_estimator._fit_method self._tree = self._onedal_estimator._tree + print(f"DEBUG KNeighborsRegressor._save_attributes END", file=sys.stderr) fit.__doc__ = _sklearn_KNeighborsRegressor.__doc__ predict.__doc__ = _sklearn_KNeighborsRegressor.predict.__doc__ kneighbors.__doc__ = _sklearn_KNeighborsRegressor.kneighbors.__doc__ - score.__doc__ = _sklearn_KNeighborsRegressor.score.__doc__ + score.__doc__ = _sklearn_KNeighborsRegressor.score.__doc__ \ No newline at end of file diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index 80da8bb2cf..e8f6e46840 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -14,6 +14,9 @@ # limitations under the License. # =============================================================================== +import sys + +import numpy as np from sklearn.neighbors._unsupervised import NearestNeighbors as _sklearn_NearestNeighbors from sklearn.utils.validation import _deprecate_positional_args, check_is_fitted @@ -23,10 +26,12 @@ from onedal.neighbors import NearestNeighbors as onedal_NearestNeighbors from .._device_offload import dispatch, wrap_output_data -from ..utils.validation import check_feature_names +from ..utils._array_api import enable_array_api, get_namespace +from ..utils.validation import check_feature_names, validate_data from .common import KNeighborsDispatchingBase +@enable_array_api @control_n_jobs(decorated_methods=["fit", "kneighbors", "radius_neighbors"]) class NearestNeighbors(KNeighborsDispatchingBase, _sklearn_NearestNeighbors): __doc__ = _sklearn_NearestNeighbors.__doc__ @@ -59,6 +64,7 @@ def __init__( ) def fit(self, X, y=None): + print(f"DEBUG NearestNeighbors.fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}, y type={type(y)}", file=sys.stderr) dispatch( self, "fit", @@ -69,14 +75,25 @@ def fit(self, X, y=None): X, None, ) + print(f"DEBUG NearestNeighbors.fit END: _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}, _fit_X shape={getattr(getattr(self, '_fit_X', None), 'shape', 'NO_SHAPE')}", file=sys.stderr) return self @wrap_output_data def kneighbors(self, X=None, n_neighbors=None, return_distance=True): + print(f"DEBUG NearestNeighbors.kneighbors START: X type={type(X)}, _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + + # Validate n_neighbors parameter first (before check_is_fitted) + if n_neighbors is not None: + self._validate_n_neighbors(n_neighbors) + check_is_fitted(self) if X is not None: check_feature_names(self, X, reset=False) - return dispatch( + + # Validate kneighbors parameters (inherited from KNeighborsDispatchingBase) + self._kneighbors_validation(X, n_neighbors) + + result = dispatch( self, "kneighbors", { @@ -87,19 +104,25 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): n_neighbors=n_neighbors, return_distance=return_distance, ) + print(f"DEBUG NearestNeighbors.kneighbors END: result type={type(result)}", file=sys.stderr) + return result @wrap_output_data def radius_neighbors( self, X=None, radius=None, return_distance=True, sort_results=False ): + print(f"DEBUG NearestNeighbors.radius_neighbors START: X type={type(X)}, _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}, _fit_X shape={getattr(getattr(self, '_fit_X', None), 'shape', 'NO_SHAPE')}", file=sys.stderr) + print(f"DEBUG radius_neighbors: hasattr _onedal_estimator={hasattr(self, '_onedal_estimator')}, _tree={getattr(self, '_tree', 'NOT_SET')}, _fit_method={getattr(self, '_fit_method', 'NOT_SET')}", file=sys.stderr) if ( hasattr(self, "_onedal_estimator") or getattr(self, "_tree", 0) is None and self._fit_method == "kd_tree" ): + print(f"DEBUG radius_neighbors: Calling sklearn fit with _fit_X type={type(self._fit_X)}", file=sys.stderr) _sklearn_NearestNeighbors.fit(self, self._fit_X, getattr(self, "_y", None)) + print(f"DEBUG radius_neighbors: sklearn fit completed, _fit_X type now={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) check_is_fitted(self) - return dispatch( + result = dispatch( self, "radius_neighbors", { @@ -111,6 +134,8 @@ def radius_neighbors( return_distance=return_distance, sort_results=sort_results, ) + print(f"DEBUG NearestNeighbors.radius_neighbors END: result type={type(result)}", file=sys.stderr) + return result def radius_neighbors_graph( self, X=None, radius=None, mode="connectivity", sort_results=False @@ -129,6 +154,18 @@ def radius_neighbors_graph( ) def _onedal_fit(self, X, y=None, queue=None): + print(f"DEBUG NearestNeighbors._onedal_fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}, y type={type(y)}", file=sys.stderr) + + # Get array namespace for array API support + xp, _ = get_namespace(X) + + # REFACTOR: Use validate_data to convert pandas to numpy and validate types + # force_all_finite=False to allow nan_euclidean metric to work (will fallback to sklearn) + X = validate_data( + self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr" + ) + print(f"DEBUG: After validate_data, X type={type(X)}", file=sys.stderr) + onedal_params = { "n_neighbors": self.n_neighbors, "algorithm": self.algorithm, @@ -140,31 +177,63 @@ def _onedal_fit(self, X, y=None, queue=None): self._onedal_estimator.requires_y = get_requires_y_tag(self) self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ + print(f"DEBUG NearestNeighbors._onedal_fit: Calling onedal_estimator.fit", file=sys.stderr) self._onedal_estimator.fit(X, y, queue=queue) + print(f"DEBUG NearestNeighbors._onedal_fit: After fit, onedal_estimator._fit_X type={type(getattr(self._onedal_estimator, '_fit_X', 'NOT_SET'))}", file=sys.stderr) self._save_attributes() + print(f"DEBUG NearestNeighbors._onedal_fit END: self._fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) def _onedal_predict(self, X, queue=None): + # Validate and convert X (pandas to numpy if needed) only if X is not None + if X is not None: + xp, _ = get_namespace(X) + X = validate_data( + self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr", reset=False, force_all_finite=False + ) return self._onedal_estimator.predict(X, queue=queue) def _onedal_kneighbors( self, X=None, n_neighbors=None, return_distance=True, queue=None ): - return self._onedal_estimator.kneighbors( + import sys + print(f"DEBUG NearestNeighbors._onedal_kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", file=sys.stderr) + + # REFACTOR: All post-processing now in sklearnex following PCA pattern + # Prepare inputs and handle query_is_train case (includes validation AFTER +=1) + X, n_neighbors, query_is_train = self._prepare_kneighbors_inputs(X, n_neighbors) + + # Get raw results from onedal backend + result = self._onedal_estimator.kneighbors( X, n_neighbors, return_distance, queue=queue ) + + # Apply post-processing (kd_tree sorting, removing self from results) + result = self._kneighbors_post_processing(X, n_neighbors, return_distance, result, query_is_train) + + print(f"DEBUG NearestNeighbors._onedal_kneighbors END: result type={type(result)}", file=sys.stderr) + return result def _save_attributes(self): + print(f"DEBUG NearestNeighbors._save_attributes START: onedal_estimator._fit_X type={type(getattr(self._onedal_estimator, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + if hasattr(self._onedal_estimator, '_fit_X'): + fit_x_preview = str(self._onedal_estimator._fit_X)[:200] + print(f"DEBUG _save_attributes: _fit_X value preview={fit_x_preview}", file=sys.stderr) self.classes_ = self._onedal_estimator.classes_ self.n_features_in_ = self._onedal_estimator.n_features_in_ self.n_samples_fit_ = self._onedal_estimator.n_samples_fit_ + # ORIGINAL MAIN BRANCH: Direct assignment without any tuple extraction self._fit_X = self._onedal_estimator._fit_X + print(f"DEBUG _save_attributes: AFTER assignment - self._fit_X type={type(self._fit_X)}, has shape attr={hasattr(self._fit_X, 'shape')}", file=sys.stderr) + if hasattr(self._fit_X, 'shape'): + print(f"DEBUG _save_attributes: self._fit_X.shape={self._fit_X.shape}", file=sys.stderr) self._fit_method = self._onedal_estimator._fit_method self._tree = self._onedal_estimator._tree + print(f"DEBUG NearestNeighbors._save_attributes END: _fit_method={self._fit_method}, _tree={self._tree}", file=sys.stderr) fit.__doc__ = _sklearn_NearestNeighbors.__doc__ kneighbors.__doc__ = _sklearn_NearestNeighbors.kneighbors.__doc__ radius_neighbors.__doc__ = _sklearn_NearestNeighbors.radius_neighbors.__doc__ radius_neighbors_graph.__doc__ = ( _sklearn_NearestNeighbors.radius_neighbors_graph.__doc__ - ) + ) \ No newline at end of file diff --git a/sklearnex/tests/test_common.py b/sklearnex/tests/test_common.py index d8e3cb8188..a0b1d90476 100644 --- a/sklearnex/tests/test_common.py +++ b/sklearnex/tests/test_common.py @@ -601,4 +601,4 @@ def test_estimator(estimator, method, design_pattern, estimator_trace): if key in _DESIGN_RULE_VIOLATIONS: pytest.xfail(_DESIGN_RULE_VIOLATIONS[key]) else: - raise + raise \ No newline at end of file