diff --git a/src/skmatter/_selection.py b/src/skmatter/_selection.py
index 6869a2235..50e55cff3 100644
--- a/src/skmatter/_selection.py
+++ b/src/skmatter/_selection.py
@@ -83,8 +83,13 @@
 from scipy.sparse.linalg import eigsh
 from sklearn.base import BaseEstimator, MetaEstimatorMixin
 from sklearn.feature_selection._base import SelectorMixin
-from sklearn.utils import check_array, check_random_state, check_X_y, safe_mask
-from sklearn.utils.validation import FLOAT_DTYPES, as_float_array, check_is_fitted
+from sklearn.utils import check_random_state, safe_mask
+from sklearn.utils.validation import (
+    FLOAT_DTYPES,
+    as_float_array,
+    check_is_fitted,
+    validate_data,
+)
 
 from .utils import (
     X_orthogonalizer,
@@ -157,11 +162,6 @@ def __init__(
         self.n_to_select = n_to_select
         self.score_threshold = score_threshold
         self.score_threshold_type = score_threshold_type
-        if self.score_threshold_type not in ["relative", "absolute"]:
-            raise ValueError(
-                "invalid score_threshold_type, expected one of 'relative' or 'absolute'"
-            )
-
         self.full = full
         self.progress_bar = progress_bar
         self.random_state = random_state
@@ -184,6 +184,11 @@ def fit(self, X, y=None, warm_start=False):
         -------
         self : object
         """
+        if self.score_threshold_type not in ["relative", "absolute"]:
+            raise ValueError(
+                "invalid score_threshold_type, expected one of 'relative' or 'absolute'"
+            )
+
         if self.selection_type == "feature":
             self._axis = 1
         elif self.selection_type == "sample":
@@ -205,7 +210,7 @@ def fit(self, X, y=None, warm_start=False):
 
         if hasattr(self, "mixing") or y is not None:
             X, y = self._validate_data(X, y, **params)
-            X, y = check_X_y(X, y, multi_output=True)
+            X, y = validate_data(self, X, y, multi_output=True)
 
             if len(y.shape) == 1:
                 # force y to have multi_output 2D format even when it's 1D, since
@@ -214,7 +219,7 @@
                 y = y.reshape((len(y), 1))
 
         else:
-            X = check_array(X, **params)
+            X = validate_data(self, X, **params)
 
         if self.full and self.score_threshold is not None:
             raise ValueError(
@@ -308,7 +313,7 @@ def transform(self, X, y=None):
 
         mask = self.get_support()
 
-        X = check_array(X)
+        X = validate_data(self, X, reset=False)
 
         if len(X.shape) == 1:
             if self._axis == 0:
@@ -486,6 +491,11 @@ def _more_tags(self):
             "requires_y": False,
         }
 
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.target_tags.required = False
+        return tags
+
 
 class _CUR(GreedySelector):
     """Transformer that performs Greedy Selection by choosing features
@@ -560,6 +570,8 @@ def score(self, X, y=None):
         score : numpy.ndarray of (n_to_select_from_)
             :math:`\pi` importance for the given samples or features
         """
+        X, y = validate_data(self, X, y, reset=False)
+
         return self.pi_
 
     def _init_greedy_search(self, X, y, n_to_select):
@@ -734,6 +746,8 @@ def score(self, X, y=None):
         score : numpy.ndarray of (n_to_select_from_)
             :math:`\pi` importance for the given samples or features
         """
+        X, y = validate_data(self, X, y, reset=False)
+
         return self.pi_
 
     def _init_greedy_search(self, X, y, n_to_select):
@@ -927,6 +941,8 @@ def score(self, X, y=None):
         -------
         hausdorff : Hausdorff distances
         """
+        X, y = validate_data(self, X, y, reset=False)
+
         return self.hausdorff_
 
     def get_distance(self):
@@ -1048,11 +1064,6 @@ def __init__(
         full=False,
         random_state=0,
     ):
-        if mixing == 1.0:
-            raise ValueError(
-                "Mixing = 1.0 corresponds to traditional FPS."
-                "Please use the FPS class."
-            )
 
         self.mixing = mixing
         self.initialize = initialize
@@ -1067,6 +1078,16 @@
             random_state=random_state,
         )
 
+    def fit(self, X, y=None, warm_start=False):
+
+        if self.mixing == 1.0:
+            raise ValueError(
+                "Mixing = 1.0 corresponds to traditional FPS."
+                "Please use the FPS class."
+            )
+
+        return super().fit(X, y)
+
     def score(self, X, y=None):
         """Returns the Hausdorff distances of all samples to previous selections.
@@ -1083,6 +1104,8 @@ def score(self, X, y=None):
         -------
         hausdorff : Hausdorff distances
         """
+        X, y = validate_data(self, X, y, reset=False)
+
         return self.hausdorff_
 
     def get_distance(self):
@@ -1159,3 +1182,8 @@ def _more_tags(self):
         return {
             "requires_y": True,
         }
+
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.target_tags.required = True
+        return tags
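Note: every hunk in this file follows the same scikit-learn >= 1.6 migration: the removed `BaseEstimator._validate_data` method and the free-standing `check_array`/`check_X_y` helpers are replaced by the public `sklearn.utils.validation.validate_data`, which takes the estimator as its first argument; passing `reset=False` re-checks the `n_features_in_` recorded during `fit` instead of overwriting it. A minimal sketch of the pattern, assuming scikit-learn >= 1.6 (`IdentitySelector` is an illustrative name, not part of scikit-matter):

    import numpy as np
    from sklearn.base import BaseEstimator, TransformerMixin
    from sklearn.utils.validation import check_is_fitted, validate_data

    class IdentitySelector(TransformerMixin, BaseEstimator):
        def fit(self, X, y=None):
            # validate_data(self, ...) records n_features_in_ on the estimator
            X = validate_data(self, X)
            self.mask_ = np.ones(X.shape[1], dtype=bool)
            return self

        def transform(self, X):
            check_is_fitted(self, ["mask_"])
            # reset=False validates against the feature count seen during fit
            X = validate_data(self, X, reset=False)
            return X[:, self.mask_]

    IdentitySelector().fit(np.eye(3)).transform(np.eye(3))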
diff --git a/src/skmatter/decomposition/_kernel_pcovr.py b/src/skmatter/decomposition/_kernel_pcovr.py
index 84a9439e1..65fe39a3e 100644
--- a/src/skmatter/decomposition/_kernel_pcovr.py
+++ b/src/skmatter/decomposition/_kernel_pcovr.py
@@ -9,10 +9,10 @@
 from sklearn.kernel_ridge import KernelRidge
 from sklearn.linear_model._base import LinearModel
 from sklearn.metrics.pairwise import pairwise_kernels
-from sklearn.utils import check_array, check_random_state
+from sklearn.utils import check_random_state
 from sklearn.utils._arpack import _init_arpack_v0
 from sklearn.utils.extmath import randomized_svd, stable_cumsum, svd_flip
-from sklearn.utils.validation import check_is_fitted, check_X_y
+from sklearn.utils.validation import check_is_fitted, validate_data
 
 from ..preprocessing import KernelNormalizer
 from ..utils import check_krr_fit, pcovr_kernel
@@ -270,7 +270,7 @@ def fit(self, X, Y, W=None):
         ):
             raise ValueError("Regressor must be an instance of `KernelRidge`")
 
-        X, Y = check_X_y(X, Y, y_numeric=True, multi_output=True)
+        X, Y = validate_data(self, X, Y, y_numeric=True, multi_output=True)
         self.X_fit_ = X.copy()
 
         if self.n_components is None:
@@ -387,7 +387,7 @@ def predict(self, X=None):
         """Predicts the property values"""
         check_is_fitted(self, ["pky_", "pty_"])
 
-        X = check_array(X)
+        X = validate_data(self, X, reset=False)
         K = self._get_kernel(X, self.X_fit_)
         if self.center:
             K = self.centerer_.transform(K)
@@ -408,7 +408,7 @@ def transform(self, X):
         """
         check_is_fitted(self, ["pkt_", "X_fit_"])
 
-        X = check_array(X)
+        X = validate_data(self, X, reset=False)
         K = self._get_kernel(X, self.X_fit_)
 
         if self.center:
@@ -440,7 +440,7 @@ def inverse_transform(self, T):
         """
         return T @ self.ptx_
 
-    def score(self, X, Y):
+    def score(self, X, y):
         r"""Computes the (negative) loss values for KernelPCovR on the given
         predictor and response variables.
 
         The loss in :math:`\mathbf{K}`, as explained in [Helfrecht2020]_
         does not correspond to a traditional Gram loss
@@ -474,7 +474,7 @@
         """
         check_is_fitted(self, ["pkt_", "X_fit_"])
 
-        X = check_array(X)
+        X, y = validate_data(self, X, y, reset=False)
 
         K_NN = self._get_kernel(self.X_fit_, self.X_fit_)
         K_VN = self._get_kernel(X, self.X_fit_)
@@ -485,8 +485,8 @@
             K_VN = self.centerer_.transform(K_VN)
             K_VV = self.centerer_.transform(K_VV)
 
-        y = K_VN @ self.pky_
-        Lkrr = np.linalg.norm(Y - y) ** 2 / np.linalg.norm(Y) ** 2
+        ypred = K_VN @ self.pky_
+        Lkrr = np.linalg.norm(y - ypred) ** 2 / np.linalg.norm(y) ** 2
 
         t_n = K_NN @ self.pkt_
         t_v = K_VN @ self.pkt_
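Note: the `score(self, X, Y)` -> `score(self, X, y)` rename is more than style: scikit-learn's scorers and `check_estimator` invoke `estimator.score(X, y)` with a lowercase `y` keyword, so the old signature broke keyword calls. A sketch of that calling convention, assuming scikit-learn >= 1.6 (`MeanRegressor` is a made-up example, not part of the package):

    import numpy as np
    from sklearn.base import BaseEstimator, RegressorMixin
    from sklearn.utils.validation import validate_data

    class MeanRegressor(RegressorMixin, BaseEstimator):
        def fit(self, X, y):
            X, y = validate_data(self, X, y, y_numeric=True)
            self.mean_ = float(np.mean(y))
            return self

        def predict(self, X):
            # reset=False raises if X has a different feature count than at fit
            X = validate_data(self, X, reset=False)
            return np.full(X.shape[0], self.mean_)

    reg = MeanRegressor().fit(np.ones((4, 2)), np.arange(4.0))
    # RegressorMixin.score forwards to predict and accepts y by keyword:
    print(reg.score(X=np.ones((4, 2)), y=np.arange(4.0)))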
diff --git a/src/skmatter/decomposition/_pcovr.py b/src/skmatter/decomposition/_pcovr.py
index ddaf3bebd..8cdd24680 100644
--- a/src/skmatter/decomposition/_pcovr.py
+++ b/src/skmatter/decomposition/_pcovr.py
@@ -10,10 +10,10 @@
 from sklearn.decomposition._pca import _infer_dimension
 from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
 from sklearn.linear_model._base import LinearModel
-from sklearn.utils import check_array, check_random_state
+from sklearn.utils import check_random_state
 from sklearn.utils._arpack import _init_arpack_v0
 from sklearn.utils.extmath import randomized_svd, stable_cumsum, svd_flip
-from sklearn.utils.validation import check_is_fitted, check_X_y
+from sklearn.utils.validation import check_is_fitted, validate_data
 
 from ..utils import check_lr_fit, pcovr_covariance, pcovr_kernel
@@ -221,7 +221,7 @@ def fit(self, X, Y, W=None):
             Regression weights, optional when regressor=`precomputed`. If not
             passed, it is assumed that `W = np.linalg.lstsq(X, Y, self.tol)[0]`
         """
-        X, Y = check_X_y(X, Y, y_numeric=True, multi_output=True)
+        X, Y = validate_data(self, X, Y, y_numeric=True, multi_output=True)
 
         # saved for inverse transformations from the latent space,
         # should be zero in the case that the features have been properly centered
@@ -582,10 +582,10 @@ def predict(self, X=None, T=None):
             raise ValueError("Either X or T must be supplied.")
 
         if X is not None:
-            X = check_array(X)
+            X = validate_data(self, X, reset=False)
             return X @ self.pxy_
         else:
-            T = check_array(T)
+            T = validate_data(self, T, reset=False)
             return T @ self.pty_
 
     def transform(self, X=None):
@@ -604,7 +604,7 @@
 
         return super().transform(X)
 
-    def score(self, X, Y, T=None):
+    def score(self, X, y, T=None):
         r"""Return the (negative) total reconstruction error for X and Y,
         defined as:
@@ -635,13 +635,15 @@
             Negative sum of the loss in reconstructing X from the latent-space
             projection T and the loss in predicting Y from the latent-space
             projection T
         """
+        X, y = validate_data(self, X, y, reset=False)
+
         if T is None:
             T = self.transform(X)
-        x = self.inverse_transform(T)
-        y = self.predict(T=T)
+        Xrec = self.inverse_transform(T)
+        ypred = self.predict(T=T)
 
         return -(
-            np.linalg.norm(X - x) ** 2.0 / np.linalg.norm(X) ** 2.0
-            + np.linalg.norm(Y - y) ** 2.0 / np.linalg.norm(Y) ** 2.0
+            np.linalg.norm(X - Xrec) ** 2.0 / np.linalg.norm(X) ** 2.0
+            + np.linalg.norm(y - ypred) ** 2.0 / np.linalg.norm(y) ** 2.0
         )
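Note: renaming the locals in `PCovR.score` to `Xrec`/`ypred` stops the method from shadowing its own `y` argument; the returned quantity is unchanged. Restated standalone with plain numpy (`neg_pcovr_loss` is an illustrative helper, not package API):

    import numpy as np

    def neg_pcovr_loss(X, y, Xrec, ypred):
        # relative reconstruction error in X plus relative prediction error in y
        lx = np.linalg.norm(X - Xrec) ** 2.0 / np.linalg.norm(X) ** 2.0
        ly = np.linalg.norm(y - ypred) ** 2.0 / np.linalg.norm(y) ** 2.0
        return -(lx + ly)

    rng = np.random.default_rng(0)
    X, y = rng.normal(size=(10, 3)), rng.normal(size=(10, 1))
    print(neg_pcovr_loss(X, y, X, y))  # perfect reconstruction gives -0.0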
diff --git a/src/skmatter/linear_model/_ridge.py b/src/skmatter/linear_model/_ridge.py
index 6e4fcf1f3..9dd5e1678 100644
--- a/src/skmatter/linear_model/_ridge.py
+++ b/src/skmatter/linear_model/_ridge.py
@@ -3,11 +3,10 @@
 from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin
 from sklearn.metrics import check_scoring
 from sklearn.model_selection import KFold, check_cv
-from sklearn.utils import check_array
-from sklearn.utils.validation import check_is_fitted
+from sklearn.utils.validation import check_is_fitted, validate_data
 
 
-class Ridge2FoldCV(BaseEstimator, MultiOutputMixin, RegressorMixin):
+class Ridge2FoldCV(RegressorMixin, MultiOutputMixin, BaseEstimator):
     r"""Ridge regression with an efficient 2-fold cross-validation method using
     the SVD solver.
@@ -20,7 +19,7 @@
     while the alpha value is determined with a 2-fold cross-validation from a
     list of alpha values. It is more efficient version than doing 2-fold
     cross-validation naively The algorithmic trick is to reuse the matrices
     obtained by SVD for each
-    regularization paramater :param alpha: The 2-fold CV can be broken donw to
+    regularization parameter :param alpha: The 2-fold CV can be broken down to
 
     .. math::
@@ -136,6 +135,11 @@ def __init__(
         self.shuffle = shuffle
         self.n_jobs = n_jobs
 
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.target_tags.single_output = False
+        return tags
+
     def _more_tags(self):
         return {"multioutput_only": True}
@@ -195,7 +199,7 @@ def predict(self, X):
             Training data, where n_samples is the number of samples and
             n_features is the number of features.
         """
-        X = check_array(X)
+        X = validate_data(self, X, reset=False)
 
         check_is_fitted(self, ["coef_"])
diff --git a/src/skmatter/sample_selection/_base.py b/src/skmatter/sample_selection/_base.py
index f5531d897..67d5f0472 100644
--- a/src/skmatter/sample_selection/_base.py
+++ b/src/skmatter/sample_selection/_base.py
@@ -4,7 +4,7 @@
 
 import numpy as np
 from scipy.interpolate import LinearNDInterpolator, interp1d
-from scipy.interpolate.interpnd import _ndim_coords_from_arrays
+from scipy.interpolate._interpnd import _ndim_coords_from_arrays
 from scipy.spatial import ConvexHull
 from sklearn.utils.validation import check_array, check_is_fitted, check_X_y
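Note: the `_more_tags` dicts are left in place next to the new `__sklearn_tags__` overrides, presumably to keep older scikit-learn releases working; under >= 1.6 estimator tags are a dataclass, and mixins must precede `BaseEstimator` in the bases (hence the reordered `Ridge2FoldCV` class signature) so that `super().__sklearn_tags__()` composes the defaults correctly. A minimal sketch, assuming scikit-learn >= 1.6 (the class name is illustrative):

    from sklearn.base import BaseEstimator, RegressorMixin

    class MultiOutputOnlyRegressor(RegressorMixin, BaseEstimator):
        def __sklearn_tags__(self):
            tags = super().__sklearn_tags__()
            tags.target_tags.required = True        # fit() requires y
            tags.target_tags.single_output = False  # multi-output only
            return tags

    print(MultiOutputOnlyRegressor().__sklearn_tags__().target_tags)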