Skip to content

Commit 64232da

Browse files
cajchristian and Christian Jorgensen authored
Updates to fix sklearn branch (#241)
* replacing `self._validate_data` with `validate_data`
* Additional fixes
* Fixing scikit-learn warnings
* Fixing PCovR to work with 1D column vectors
* Fixing examples
* Changing shape of y in pcovr tests
* Fixing PCovR to not mess with data shape

---------

Co-authored-by: Christian Jorgensen <[email protected]>
1 parent f8c2cc8 commit 64232da

File tree

10 files changed

+74
-36
lines changed

10 files changed

+74
-36
lines changed

examples/pcovr/PCovR.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@
5050
)
5151
pcovr.fit(X_scaled, y_scaled)
5252
T = pcovr.transform(X_scaled)
53-
yp = y_scaler.inverse_transform(pcovr.predict(X_scaled))
53+
yp = y_scaler.inverse_transform(pcovr.predict(X_scaled).reshape(-1, 1))
5454

5555
fig, ((axT, axy), (caxT, caxy)) = plt.subplots(
5656
2, 2, figsize=(8, 5), gridspec_kw=dict(height_ratios=(1, 0.1))
@@ -90,7 +90,7 @@
9090
)
9191
pcovr.fit(X_scaled, y_scaled)
9292
T = pcovr.transform(X_scaled)
93-
yp = y_scaler.inverse_transform(pcovr.predict(X_scaled))
93+
yp = y_scaler.inverse_transform(pcovr.predict(X_scaled).reshape(-1, 1))
9494

9595
axes[0, i].scatter(
9696
T[:, 0], T[:, 1], s=50, alpha=0.8, c=y, cmap=cmapX, edgecolor="k"
@@ -136,7 +136,7 @@
136136
)
137137
kpcovr.fit(X_scaled, y_scaled)
138138
T = kpcovr.transform(X_scaled)
139-
yp = y_scaler.inverse_transform(kpcovr.predict(X_scaled))
139+
yp = y_scaler.inverse_transform(kpcovr.predict(X_scaled).reshape(-1, 1))
140140

141141
fig, ((axT, axy), (caxT, caxy)) = plt.subplots(
142142
2, 2, figsize=(8, 5), gridspec_kw=dict(height_ratios=(1, 0.1))

examples/pcovr/PCovR_Regressors.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
X_scaled = X_scaler.fit_transform(X)
3030

3131
y_scaler = StandardScaler()
32-
y_scaled = y_scaler.fit_transform(y.reshape(-1, 1)).ravel()
32+
y_scaled = y_scaler.fit_transform(y.reshape(-1, 1))
3333

3434

3535
# %%

pyproject.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,6 @@ known_first_party = "skmatter"
8787
[tool.pytest.ini_options]
8888
testpaths = ["tests"]
8989
addopts = ["--cov"]
90-
filterwarnings = ["error"]
9190

9291
[tool.ruff]
9392
exclude = ["docs/src/examples/"]

src/skmatter/_selection.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -209,8 +209,7 @@ def fit(self, X, y=None, warm_start=False):
209209
params = dict(ensure_min_samples=2, ensure_min_features=2, dtype=FLOAT_DTYPES)
210210

211211
if hasattr(self, "mixing") or y is not None:
212-
X, y = self._validate_data(X, y, **params)
213-
X, y = validate_data(self, X, y, multi_output=True)
212+
X, y = validate_data(self, X, y, multi_output=True, **params)
214213

215214
if len(y.shape) == 1:
216215
# force y to have multi_output 2D format even when it's 1D, since
@@ -569,7 +568,10 @@ def score(self, X, y=None):
569568
score : numpy.ndarray of (n_to_select_from_)
570569
:math:`\pi` importance for the given samples or features
571570
"""
572-
validate_data(self, X, y, reset=False) # present for API consistency
571+
if y is not None:
572+
validate_data(self, X, y.ravel(), reset=False)
573+
else:
574+
validate_data(self, X, reset=False) # present for API consistency
573575
return self.pi_
574576

575577
def _init_greedy_search(self, X, y, n_to_select):
@@ -744,7 +746,10 @@ def score(self, X, y=None):
744746
score : numpy.ndarray of (n_to_select_from_)
745747
:math:`\pi` importance for the given samples or features
746748
"""
747-
validate_data(self, X, y, reset=False) # present for API consistency
749+
if y is not None:
750+
validate_data(self, X, y.ravel(), reset=False)
751+
else:
752+
validate_data(self, X, reset=False) # present for API consistency
748753
return self.pi_
749754

750755
def _init_greedy_search(self, X, y, n_to_select):
@@ -938,7 +943,10 @@ def score(self, X, y=None):
938943
-------
939944
hausdorff : Hausdorff distances
940945
"""
941-
validate_data(self, X, y, reset=False)
946+
if y is not None:
947+
validate_data(self, X, y.ravel(), reset=False)
948+
else:
949+
validate_data(self, X, reset=False)
942950
return self.hausdorff_
943951

944952
def get_distance(self):
@@ -1101,7 +1109,11 @@ def score(self, X, y=None):
11011109
-------
11021110
hausdorff : Hausdorff distances
11031111
"""
1104-
validate_data(self, X, y, reset=False)
1112+
if y is not None:
1113+
validate_data(self, X, y.ravel(), reset=False)
1114+
else:
1115+
validate_data(self, X, reset=False)
1116+
11051117
return self.hausdorff_
11061118

11071119
def get_distance(self):

src/skmatter/decomposition/_kernel_pcovr.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from sklearn.utils import check_random_state
1313
from sklearn.utils._arpack import _init_arpack_v0
1414
from sklearn.utils.extmath import randomized_svd, stable_cumsum, svd_flip
15-
from sklearn.utils.validation import check_is_fitted, validate_data
15+
from sklearn.utils.validation import _check_n_features, check_is_fitted, validate_data
1616

1717
from ..preprocessing import KernelNormalizer
1818
from ..utils import check_krr_fit, pcovr_kernel
@@ -347,7 +347,7 @@ def fit(self, X, Y, W=None):
347347
except NotFittedError:
348348
self.regressor_.set_params(**regressor.get_params())
349349
self.regressor_.X_fit_ = self.X_fit_
350-
self.regressor_._check_n_features(self.X_fit_, reset=True)
350+
_check_n_features(self.regressor_, self.X_fit_, reset=True)
351351
else:
352352
Yhat = Y.copy()
353353
if W is None:

src/skmatter/linear_model/_base.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
import numpy as np
22
from scipy.linalg import orthogonal_procrustes
3-
from sklearn.base import MultiOutputMixin, RegressorMixin
3+
from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin
44
from sklearn.linear_model import LinearRegression
55
from sklearn.utils import check_array, check_X_y
66
from sklearn.utils.validation import check_is_fitted
77

88

9-
class OrthogonalRegression(MultiOutputMixin, RegressorMixin):
9+
class OrthogonalRegression(MultiOutputMixin, RegressorMixin, BaseEstimator):
1010
r"""Orthogonal regression by solving the Procrustes problem
1111
1212
Linear regression with the additional constraint that the weight matrix

src/skmatter/linear_model/_ridge.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,7 @@ def fit(self, X, y):
170170
"[0,1)"
171171
)
172172

173-
X, y = self._validate_data(X, y, y_numeric=True, multi_output=True)
173+
X, y = validate_data(self, X, y, y_numeric=True, multi_output=True)
174174
self.n_samples_in_, self.n_features_in_ = X.shape
175175

176176
# check_scoring uses estimators scoring function if the scorer is None, this is

src/skmatter/preprocessing/_data.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
11
import numpy as np
22
from sklearn.base import BaseEstimator, TransformerMixin
33
from sklearn.preprocessing._data import KernelCenterer
4-
from sklearn.utils.validation import FLOAT_DTYPES, _check_sample_weight, check_is_fitted
4+
from sklearn.utils.validation import (
5+
FLOAT_DTYPES,
6+
_check_sample_weight,
7+
check_is_fitted,
8+
validate_data,
9+
)
510

611

712
class StandardFlexibleScaler(TransformerMixin, BaseEstimator):
@@ -128,7 +133,8 @@ def fit(self, X, y=None, sample_weight=None):
128133
self : object
129134
Fitted scaler.
130135
"""
131-
X = self._validate_data(
136+
X = validate_data(
137+
self,
132138
X,
133139
copy=self.copy,
134140
estimator=self,
@@ -181,7 +187,8 @@ def transform(self, X, y=None, copy=None):
181187
Transformed array.
182188
"""
183189
copy = copy if copy is not None else self.copy
184-
X = self._validate_data(
190+
X = validate_data(
191+
self,
185192
X,
186193
reset=False,
187194
copy=copy,
@@ -298,7 +305,7 @@ def fit(self, K, y=None, sample_weight=None):
298305
self : object
299306
Fitted transformer.
300307
"""
301-
K = self._validate_data(K, copy=True, dtype=FLOAT_DTYPES, reset=False)
308+
K = validate_data(self, K, copy=True, dtype=FLOAT_DTYPES, reset=False)
302309

303310
if sample_weight is not None:
304311
self.sample_weight_ = _check_sample_weight(sample_weight, K, dtype=K.dtype)
@@ -350,7 +357,7 @@ def transform(self, K, copy=True):
350357
Transformed array
351358
"""
352359
check_is_fitted(self)
353-
K = self._validate_data(K, copy=copy, dtype=FLOAT_DTYPES, reset=False)
360+
K = validate_data(self, K, copy=copy, dtype=FLOAT_DTYPES, reset=False)
354361

355362
if self.with_center:
356363
K_pred_cols = np.average(K, weights=self.sample_weight_, axis=1)[
@@ -391,7 +398,7 @@ def fit_transform(self, K, y=None, sample_weight=None, copy=True, **fit_params):
391398
return self.transform(K, copy)
392399

393400

394-
class SparseKernelCenterer(TransformerMixin):
401+
class SparseKernelCenterer(TransformerMixin, BaseEstimator):
395402
r"""Kernel centering method for sparse kernels, similar to
396403
:class:`KernelFlexibleCenterer`.
397404

src/skmatter/utils/_pcovr_utils.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from sklearn.exceptions import NotFittedError
66
from sklearn.metrics.pairwise import pairwise_kernels
77
from sklearn.utils.extmath import randomized_svd
8-
from sklearn.utils.validation import check_is_fitted
8+
from sklearn.utils.validation import check_is_fitted, validate_data
99

1010

1111
def check_lr_fit(regressor, X, y):
@@ -39,10 +39,20 @@ def check_lr_fit(regressor, X, y):
3939
fitted_regressor = deepcopy(regressor)
4040

4141
# Check compatibility with X
42-
fitted_regressor._validate_data(X, y, reset=False, multi_output=True)
42+
validate_data(fitted_regressor, X, y, reset=False, multi_output=True)
4343

4444
# Check compatibility with y
45+
46+
# TO DO: This if statement is a band-aid for the case when we pass in a
47+
# prefitted Ridge() or RidgeCV(), which, as of sklearn 1.6, will create
48+
# coef_ with shape (n_features, ) even if fitted on a 2-D y with one target.
49+
# In the future, we can optimize this block if LinearRegression() also changes.
50+
4551
if fitted_regressor.coef_.ndim != y.ndim:
52+
if y.ndim == 2:
53+
if fitted_regressor.coef_.ndim == 1 and y.shape[1] == 1:
54+
return fitted_regressor
55+
4656
raise ValueError(
4757
"The regressor coefficients have a dimension incompatible with the "
4858
"supplied target space. The coefficients have dimension "
@@ -103,7 +113,7 @@ def check_krr_fit(regressor, K, X, y):
103113
fitted_regressor = deepcopy(regressor)
104114

105115
# Check compatibility with K
106-
fitted_regressor._validate_data(X, y, reset=False, multi_output=True)
116+
validate_data(fitted_regressor, X, y, reset=False, multi_output=True)
107117

108118
# Check compatibility with y
109119
if fitted_regressor.dual_coef_.ndim != y.ndim:

tests/test_pcovr.py

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def __init__(self, *args, **kwargs):
2424

2525
self.X, self.Y = get_dataset(return_X_y=True)
2626
self.X = StandardScaler().fit_transform(self.X)
27-
self.Y = StandardScaler().fit_transform(np.vstack(self.Y))
27+
self.Y = StandardScaler().fit_transform(np.vstack(self.Y)).ravel()
2828

2929
def setUp(self):
3030
pass
@@ -69,7 +69,7 @@ def test_simple_reconstruction(self):
6969
def test_simple_prediction(self):
7070
"""
7171
Check that PCovR with a full eigendecomposition at mixing=0
72-
can fully reconstruct the input properties.
72+
can reproduce a linear regression result.
7373
"""
7474
for space in ["feature", "sample", "auto"]:
7575
with self.subTest(space=space):
@@ -481,32 +481,42 @@ def test_none_regressor(self):
481481
self.assertTrue(pcovr.regressor is None)
482482
self.assertTrue(pcovr.regressor_ is not None)
483483

484-
def test_incompatible_coef_shape(self):
485-
# self.Y is 2D with one target
484+
def test_incompatible_coef_dim(self):
485+
# self.Y is 1D with one target
486486
# Don't need to test X shape, since this should
487-
# be caught by sklearn's _validate_data
487+
# be caught by sklearn's validate_data
488+
Y_2D = np.column_stack((self.Y, self.Y))
488489
regressor = Ridge(alpha=1e-8, fit_intercept=False, tol=1e-12)
489-
regressor.fit(self.X, self.Y)
490+
regressor.fit(self.X, Y_2D)
490491
pcovr = self.model(mixing=0.5, regressor=regressor)
491492

492493
# Dimension mismatch
493494
with self.assertRaises(ValueError) as cm:
494-
pcovr.fit(self.X, np.zeros((self.Y.shape[0], 2)))
495+
pcovr.fit(self.X, self.Y)
495496
self.assertEqual(
496497
str(cm.exception),
497498
"The regressor coefficients have a dimension incompatible with the "
498-
"supplied target space. The coefficients have dimension 1 and the targets "
499-
"have dimension 2",
499+
"supplied target space. The coefficients have dimension 2 and the targets "
500+
"have dimension 1",
500501
)
501502

503+
def test_incompatible_coef_shape(self):
502504
# Shape mismatch (number of targets)
505+
Y_double = np.column_stack((self.Y, self.Y))
506+
Y_triple = np.column_stack((Y_double, self.Y))
507+
508+
regressor = Ridge(alpha=1e-8, fit_intercept=False, tol=1e-12)
509+
regressor.fit(self.X, Y_double)
510+
511+
pcovr = self.model(mixing=0.5, regressor=regressor)
512+
503513
with self.assertRaises(ValueError) as cm:
504-
pcovr.fit(self.X, np.column_stack((self.Y, self.Y)))
514+
pcovr.fit(self.X, Y_triple)
505515
self.assertEqual(
506516
str(cm.exception),
507517
"The regressor coefficients have a shape incompatible with the supplied "
508518
"target space. The coefficients have shape %r and the targets have shape %r"
509-
% (regressor.coef_.shape, np.column_stack((self.Y, self.Y)).shape),
519+
% (regressor.coef_.shape, Y_triple.shape),
510520
)
511521

512522

0 commit comments

Comments
 (0)