Skip to content

Commit 11e12a6

Browse files
sofiia-chorna authored and
PicoCentauri committed
Improve docs, small refactor of measures
1 parent 006787a commit 11e12a6

File tree

1 file changed

+124
-135
lines changed

1 file changed

+124
-135
lines changed

src/skmatter/metrics/_reconstruction_measures.py

Lines changed: 124 additions & 135 deletions
Original file line numberDiff line numberDiff line change
@@ -44,46 +44,28 @@ def pointwise_global_reconstruction_error(
4444
X use input shape (samples, features). For sample reconstruction of Y using X
4545
use input shape (features, samples).
4646
train_idx : numpy.ndarray, dtype=int, default=None
47-
array of indices used for training, if None,
48-
If None, the complement of the ``test_idx`` is used. If ``train_size`` is
49-
also None, 2-fold split is taken.
47+
Array of indices used for training. If None, the complement of the ``test_idx``
48+
is used. If ``test_idx`` is also None, a 2-fold split is taken.
5049
test_idx : numpy.ndarray, dtype=int, default=None
51-
array of indices used for training, if None,
52-
If None, the complement of the ``train_idx`` is used. If ``test_size`` is
53-
also None, 2-fold split is taken.
54-
scaler : object implementing fit/transfom
55-
Scales the X and Y before computing the reconstruction measure.
56-
The default value scales the features such that the reconstruction
57-
measure on the training set is upper bounded to 1.
58-
estimator : object implementing fit/predict, default=None
59-
Sklearn estimator used to reconstruct features/samples.
50+
Array of indices used for testing. If None, the complement of the ``train_idx``
51+
is used. If ``train_idx`` is also None, a 2-fold split is taken.
52+
scaler : object implementing fit/transform, default=``StandardFlexibleScaler``
53+
Scales X and Y before computing the reconstruction measure. The default value
54+
scales the features such that the reconstruction measure on the training set is
55+
upper bounded to 1.
56+
estimator : object implementing fit/predict, default=``Ridge2FoldCV``
57+
Sklearn estimator used to reconstruct test features/samples.
6058
6159
Returns
6260
-------
6361
pointwise_global_reconstruction_error : numpy.ndarray
64-
The global reconstruction error for each sample/point
62+
The global reconstruction error for each test sample/point.
6563
"""
66-
(
67-
train_idx,
68-
test_idx,
69-
scaler,
70-
estimator,
71-
) = check_global_reconstruction_measures_input(
64+
train_idx, test_idx, scaler, estimator = check_global_reconstruction_measures_input(
7265
X, Y, train_idx, test_idx, scaler, estimator
7366
)
74-
X_train, X_test, Y_train, Y_test = (
75-
X[train_idx],
76-
X[test_idx],
77-
Y[train_idx],
78-
Y[test_idx],
79-
)
8067

81-
scaler.fit(X_train)
82-
X_train = scaler.transform(X_train)
83-
X_test = scaler.transform(X_test)
84-
scaler.fit(Y_train)
85-
Y_train = scaler.transform(Y_train)
86-
Y_test = scaler.transform(Y_test)
68+
X_train, X_test, Y_train, Y_test = _prepare_data(X, Y, train_idx, test_idx, scaler)
8769

8870
estimator.fit(X_train, Y_train)
8971

@@ -120,27 +102,25 @@ def global_reconstruction_error(
120102
Parameters
121103
----------
122104
X : numpy.ndarray of shape (n_samples, X_n_features)
123-
Source data which reconstructs target Y.
124-
For feature reconstruction of Y using X use input shape (samples, features).
125-
For sample reconstruction of Y using X use input shape (features, samples).
105+
Source data which reconstructs target Y. For feature reconstruction of Y using X
106+
use input shape (samples, features). For sample reconstruction of Y using X use
107+
input shape (features, samples).
126108
Y : numpy.ndarray of shape (n_samples, Y_n_targets)
127-
Target data which is reconstructed with X.
128-
For feature reconstruction of Y using X use input shape (samples, features).
129-
For sample reconstruction of Y using X use input shape (features, samples).
109+
Target data which is reconstructed with X. For feature reconstruction of Y using
110+
X use input shape (samples, features). For sample reconstruction of Y using X
111+
use input shape (features, samples).
130112
train_idx : numpy.ndarray, dtype=int, default=None
131-
array of indices used for training, if None,
132-
If None, the complement of the ``test_idx`` is used. If ``train_size`` is
133-
also None, 2-fold split is taken.
113+
Array of indices used for training. If None, the complement of the ``test_idx``
114+
is used. If ``test_idx`` is also None, a 2-fold split is taken.
134115
test_idx : numpy.ndarray, dtype=int, default=None
135-
array of indices used for training, if None,
136-
If None, the complement of the ``train_idx`` is used. If ``test_size`` is
137-
also None, 2-fold split is taken.
138-
scaler : object implementing fit/transfom
139-
Scales the X and Y before computing the reconstruction measure.
140-
The default value scales the features such that the reconstruction
141-
measure on the training set is upper bounded to 1.
142-
estimator : object implementing fit/predict, default=None
143-
Sklearn estimator used to reconstruct features/samples.
116+
Array of indices used for testing. If None, the complement of the ``train_idx``
117+
is used. If ``train_idx`` is also None, a 2-fold split is taken.
118+
scaler : object implementing fit/transform, default=``StandardFlexibleScaler``
119+
Scales X and Y before computing the reconstruction measure. The default value
120+
scales the features such that the reconstruction measure on the training set is
121+
upper bounded to 1.
122+
estimator : object implementing fit/predict, default=``Ridge2FoldCV``
123+
Sklearn estimator used to reconstruct test features/samples.
144124
145125
Returns
146126
-------
@@ -201,46 +181,28 @@ def pointwise_global_reconstruction_distortion(
201181
For feature reconstruction of Y using X use input shape (samples, features).
202182
For sample reconstruction of Y using X use input shape (features, samples).
203183
train_idx : numpy.ndarray, dtype=int, default=None
204-
array of indices used for training, if None,
205-
If None, the complement of the ``test_idx`` is used. If ``train_size`` is
206-
also None, 2-fold split is taken.
184+
Array of indices used for training. If None, the complement of the ``test_idx``
185+
is used. If ``test_idx`` is also None, a 2-fold split is taken.
207186
test_idx : numpy.ndarray, dtype=int, default=None
208-
array of indices used for training, if None,
209-
If None, the complement of the ``train_idx`` is used. If ``test_size`` is
210-
also None, 2-fold split is taken.
211-
scaler : object implementing fit/transfom
212-
Scales the X and Y before computing the reconstruction measure.
213-
The default value scales the features such that the reconstruction
214-
measure on the training set is upper bounded to 1.
215-
estimator : object implementing fit/predict, default=None
216-
Sklearn estimator used to reconstruct features/samples.
187+
Array of indices used for testing. If None, the complement of the ``train_idx``
188+
is used. If ``train_idx`` is also None, a 2-fold split is taken.
189+
scaler : object implementing fit/transform, default=``StandardFlexibleScaler``
190+
Scales X and Y before computing the reconstruction measure. The default value
191+
scales the features such that the reconstruction measure on the training set is
192+
upper bounded to 1.
193+
estimator : object implementing fit/predict, default=``Ridge2FoldCV``
194+
Sklearn estimator used to reconstruct test features/samples.
217195
218196
Returns
219197
-------
220198
pointwise_global_reconstruction_distortion : ndarray
221199
The global reconstruction distortion for each sample/point
222200
"""
223-
(
224-
train_idx,
225-
test_idx,
226-
scaler,
227-
estimator,
228-
) = check_global_reconstruction_measures_input(
201+
train_idx, test_idx, scaler, estimator = check_global_reconstruction_measures_input(
229202
X, Y, train_idx, test_idx, scaler, estimator
230203
)
231-
X_train, X_test, Y_train, Y_test = (
232-
X[train_idx],
233-
X[test_idx],
234-
Y[train_idx],
235-
Y[test_idx],
236-
)
237204

238-
scaler.fit(X_train)
239-
X_train = scaler.transform(X_train)
240-
X_test = scaler.transform(X_test)
241-
scaler.fit(Y_train)
242-
Y_train = scaler.transform(Y_train)
243-
Y_test = scaler.transform(Y_test)
205+
X_train, X_test, Y_train, _Y_test = _prepare_data(X, Y, train_idx, test_idx, scaler)
244206

245207
predictions_Y_test = estimator.fit(X_train, Y_train).predict(X_test)
246208
orthogonal_predictions_Y_test = (
@@ -291,19 +253,17 @@ def global_reconstruction_distortion(
291253
For feature reconstruction of Y using X use input shape (samples, features).
292254
For sample reconstruction of Y using X use input shape (features, samples).
293255
train_idx : numpy.ndarray, dtype=int, default=None
294-
array of indices used for training, if None,
295-
If None, the complement of the ``test_idx`` is used. If ``train_size`` is
296-
also None, 2-fold split is taken.
256+
Array of indices used for training. If None, the complement of the ``test_idx``
257+
is used. If ``test_idx`` is also None, a 2-fold split is taken.
297258
test_idx : numpy.ndarray, dtype=int, default=None
298-
array of indices used for training, if None,
299-
If None, the complement of the ``train_idx`` is used. If ``test_size`` is
300-
also None, 2-fold split is taken.
301-
scaler : object implementing fit/transfom
302-
Scales the X and Y before computing the reconstruction measure.
303-
The default value scales the features such that the reconstruction
304-
measure on the training set is upper bounded to 1.
305-
estimator : object implementing fit/predict, default=None
306-
Sklearn estimator used to reconstruct features/samples.
259+
Array of indices used for testing. If None, the complement of the ``train_idx``
260+
is used. If ``train_idx`` is also None, a 2-fold split is taken.
261+
scaler : object implementing fit/transform, default=``StandardFlexibleScaler``
262+
Scales X and Y before computing the reconstruction measure. The default value
263+
scales the features such that the reconstruction measure on the training set is
264+
upper bounded to 1.
265+
estimator : object implementing fit/predict, default=``Ridge2FoldCV``
266+
Sklearn estimator used to reconstruct test features/samples.
307267
308268
Returns
309269
-------
@@ -373,47 +333,35 @@ def pointwise_local_reconstruction_error(
373333
Number of neighbour points used to compute the local reconstruction weight for
374334
each sample/point.
375335
train_idx : numpy.ndarray, dtype=int, default=None
376-
array of indices used for training, if None,
377-
If None, the complement of the ``test_idx`` is used. If ``train_size`` is
378-
also None, 2-fold split is taken.
336+
Array of indices used for training. If None, the complement of the ``test_idx``
337+
is used. If ``test_idx`` is also None, a 2-fold split is taken.
379338
test_idx : numpy.ndarray, dtype=int, default=None
380-
array of indices used for training, if None,
381-
If None, the complement of the ``train_idx`` is used. If ``test_size`` is
382-
also None, 2-fold split is taken.
383-
scaler : object implementing fit/transfom
384-
Scales the X and Y before computing the reconstruction measure.
385-
The default value scales the features such that the reconstruction
386-
measure on the training set is upper bounded to 1.
387-
estimator : object implementing fit/predict, default=None
388-
Sklearn estimator used to reconstruct features/samples.
339+
Array of indices used for testing. If None, the complement of the ``train_idx``
340+
is used. If ``train_idx`` is also None, a 2-fold split is taken.
341+
scaler : object implementing fit/transform, default=``StandardFlexibleScaler``
342+
Scales X and Y before computing the reconstruction measure. The default value
343+
scales the features such that the reconstruction measure on the training set is
344+
upper bounded to 1.
345+
estimator : object implementing fit/predict, default=``Ridge2FoldCV``
346+
Sklearn estimator used to reconstruct test features/samples.
347+
n_jobs : int, default=None
348+
The number of CPUs to use to do the computation.
349+
:obj:`None` means 1 unless in a :obj:`joblib.parallel_backend` context.
350+
``-1`` means using all processors. See
351+
`n_jobs glossary from sklearn (external link) <https://scikit-learn.org/stable/glossary.html#term-n-jobs>`_
352+
for more details.
389353
390354
Returns
391355
-------
392356
pointwise_local_reconstruction_error : numpy.ndarray
393357
The local reconstruction error for each sample/point
394358
395359
"""
396-
(
397-
train_idx,
398-
test_idx,
399-
scaler,
400-
estimator,
401-
) = check_local_reconstruction_measures_input(
360+
train_idx, test_idx, scaler, estimator = check_local_reconstruction_measures_input(
402361
X, Y, n_local_points, train_idx, test_idx, scaler, estimator
403362
)
404-
X_train, X_test, Y_train, Y_test = (
405-
X[train_idx],
406-
X[test_idx],
407-
Y[train_idx],
408-
Y[test_idx],
409-
)
410363

411-
scaler.fit(X_train)
412-
X_train = scaler.transform(X_train)
413-
X_test = scaler.transform(X_test).astype(X_train.dtype)
414-
scaler.fit(Y_train)
415-
Y_train = scaler.transform(Y_train)
416-
Y_test = scaler.transform(Y_test)
364+
X_train, X_test, Y_train, Y_test = _prepare_data(X, Y, train_idx, test_idx, scaler)
417365

418366
squared_dist = (
419367
np.sum(X_train**2, axis=1)
@@ -496,19 +444,23 @@ def local_reconstruction_error(
496444
Number of neighbour points used to compute the local reconstruction weight for
497445
each sample/point.
498446
train_idx : numpy.ndarray, dtype=int, default=None
499-
array of indices used for training, if None,
500-
If None, the complement of the ``test_idx`` is used. If ``train_size`` is
501-
also None, 2-fold split is taken.
447+
Array of indices used for training. If None, the complement of the ``test_idx``
448+
is used. If ``test_idx`` is also None, a 2-fold split is taken.
502449
test_idx : numpy.ndarray, dtype=int, default=None
503-
array of indices used for training, if None,
504-
If None, the complement of the ``train_idx`` is used. If ``test_size`` is
505-
also None, 2-fold split is taken.
506-
scaler : object implementing fit/transfom
507-
Scales the X and Y before computing the reconstruction measure.
508-
The default value scales the features such that the reconstruction
509-
measure on the training set is upper bounded to 1.
510-
estimator : object implementing fit/predict, default=None
511-
Sklearn estimator used to reconstruct features/samples.
450+
Array of indices used for testing. If None, the complement of the ``train_idx``
451+
is used. If ``train_idx`` is also None, a 2-fold split is taken.
452+
scaler : object implementing fit/transform, default=``StandardFlexibleScaler``
453+
Scales X and Y before computing the reconstruction measure. The default value
454+
scales the features such that the reconstruction measure on the training set is
455+
upper bounded to 1.
456+
estimator : object implementing fit/predict, default=``Ridge2FoldCV``
457+
Sklearn estimator used to reconstruct test features/samples.
458+
n_jobs : int, default=None
459+
The number of CPUs to use to do the computation.
460+
:obj:`None` means 1 unless in a :obj:`joblib.parallel_backend` context.
461+
``-1`` means using all processors. See
462+
`n_jobs glossary from sklearn (external link) <https://scikit-learn.org/stable/glossary.html#term-n-jobs>`_
463+
for more details.
512464
513465
Returns
514466
-------
@@ -534,7 +486,11 @@ def check_global_reconstruction_measures_input(
534486
X, Y, train_idx, test_idx, scaler, estimator
535487
):
536488
"""Returns default reconstruction measure inputs for all None parameters"""
537-
assert len(X) == len(Y)
489+
if X.shape[0] != Y.shape[0]:
490+
raise ValueError(
491+
f"First dimension of X ({X.shape[0]}) and Y ({Y.shape[0]}) must match"
492+
)
493+
538494
if (train_idx is None) and (test_idx is None):
539495
train_idx, test_idx = train_test_split(
540496
np.arange(len(X)),
@@ -562,6 +518,7 @@ def check_global_reconstruction_measures_input(
562518
scoring="neg_root_mean_squared_error",
563519
n_jobs=1,
564520
)
521+
565522
return train_idx, test_idx, scaler, estimator
566523

567524

@@ -570,7 +527,39 @@ def check_local_reconstruction_measures_input(
570527
):
571528
"""Returns default reconstruction measure inputs for all None parameters"""
572529
# only needs to check one extra parameter
573-
assert len(X) >= n_local_points
530+
if len(X) < n_local_points:
531+
raise ValueError(
532+
f"X has {len(X)} samples but n_local_points={n_local_points}. "
533+
"Must have at least n_local_points samples"
534+
)
535+
574536
return check_global_reconstruction_measures_input(
575537
X, Y, train_idx, test_idx, scaler, estimator
576538
)
539+
540+
541+
def _prepare_data(X, Y, train_idx, test_idx, scaler):
542+
"""
543+
Split and scale data for reconstruction measures
544+
545+
Parameters
546+
----------
547+
X, Y : array-like
548+
Input data
549+
train_idx, test_idx : array-like
550+
Indices for train/test split
551+
scaler : object
552+
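Scaler implementing fit/transform. It does not need to be fitted beforehand: it is refit inside this function, first on the training rows of X and then on the training rows of Y.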
553+
"""
554+
X_train, X_test = X[train_idx], X[test_idx]
555+
Y_train, Y_test = Y[train_idx], Y[test_idx]
556+
557+
scaler.fit(X_train)
558+
X_train_scaled = scaler.transform(X_train)
559+
X_test_scaled = scaler.transform(X_test)
560+
561+
scaler.fit(Y_train)
562+
Y_train_scaled = scaler.transform(Y_train)
563+
Y_test_scaled = scaler.transform(Y_test)
564+
565+
return X_train_scaled, X_test_scaled, Y_train_scaled, Y_test_scaled

0 commit comments

Comments
 (0)