From 5c6e30a42c03b1c196c0575d1299ca0c72d9872f Mon Sep 17 00:00:00 2001 From: Rhushil Vasavada Date: Sun, 1 Jun 2025 12:50:40 -0500 Subject: [PATCH 1/8] Trying out multioutput PCovC --- src/skmatter/decomposition/_pcovc.py | 78 +++++++++++++++++++++++----- 1 file changed, 66 insertions(+), 12 deletions(-) diff --git a/src/skmatter/decomposition/_pcovc.py b/src/skmatter/decomposition/_pcovc.py index ec8ce3202..65d42da9a 100644 --- a/src/skmatter/decomposition/_pcovc.py +++ b/src/skmatter/decomposition/_pcovc.py @@ -10,6 +10,7 @@ SGDClassifier, ) from sklearn.linear_model._base import LinearClassifierMixin +from sklearn.multioutput import MultiOutputClassifier from sklearn.svm import LinearSVC from sklearn.utils import check_array from sklearn.utils.multiclass import check_classification_targets, type_of_target @@ -258,7 +259,7 @@ def fit(self, X, Y, W=None): not passed, it is assumed that the weights will be taken from a linear classifier fit between :math:`\mathbf{X}` and :math:`\mathbf{Y}` """ - X, Y = validate_data(self, X, Y, y_numeric=False) + X, Y = validate_data(self, X, Y, multi_output=True, y_numeric=False) check_classification_targets(Y) self.classes_ = np.unique(Y) @@ -269,6 +270,7 @@ def fit(self, X, Y, W=None): LogisticRegressionCV, LinearSVC, LinearDiscriminantAnalysis, + MultiOutputClassifier, RidgeClassifier, RidgeClassifierCV, SGDClassifier, @@ -285,23 +287,39 @@ def fit(self, X, Y, W=None): ) if self.classifier != "precomputed": - if self.classifier is None: + if self.classifier is None and Y.ndim < 2: classifier = LogisticRegression() + elif self.classifier is None and Y.ndim >= 2: + classifier = MultiOutputClassifier(estimator=LogisticRegression()) else: classifier = self.classifier self.z_classifier_ = check_cl_fit(classifier, X, Y) - W = self.z_classifier_.coef_.T.reshape(X.shape[1], -1) + + if isinstance(self.z_classifier_, MultiOutputClassifier): + W = np.hstack([est_.coef_.T for est_ in self.z_classifier_.estimators_]) + print(W.shape) + 
else: + W = self.z_classifier_.coef_.T.reshape(X.shape[1], -1) else: # If precomputed, use default classifier to predict Y from T - classifier = LogisticRegression() - if W is None: - W = LogisticRegression().fit(X, Y).coef_.T - W = W.reshape(X.shape[1], -1) + # check for the case of 2D Y -- we need to make sure that this is MultiOutputClassifier instead + if Y.ndim >= 2: + classifier = MultiOutputClassifier(estimator=LogisticRegression) + if W is None: + _ = MultiOutputClassifier(estimator=LogisticRegression).fit(X, Y) + W = np.hstack([est_.coef_.T for est_ in _.estimators_]) + else: + classifier = LogisticRegression() + if W is None: + W = LogisticRegression().fit(X, Y).coef_.T + W = W.reshape(X.shape[1], -1) + # print(f"X {X.shape}") + # print(f"W {W.shape}") Z = X @ W - + # print(f"Z {Z.shape}") if self.space_ == "feature": self._fit_feature_space(X, Y, Z) else: @@ -311,8 +329,19 @@ def fit(self, X, Y, W=None): # classifier and steal weights to get pxz and ptz self.classifier_ = clone(classifier).fit(X @ self.pxt_, Y) - self.ptz_ = self.classifier_.coef_.T - self.pxz_ = self.pxt_ @ self.ptz_ + if isinstance(self.classifier_, MultiOutputClassifier): + self.ptz_ = np.hstack( + [est_.coef_.T for est_ in self.classifier_.estimators_] + ) + # print(f"pxt {self.pxt_.shape}") + # print(f"ptz {self.ptz_.shape}") + self.pxz_ = self.pxt_ @ self.ptz_ + # print(f"pxz {self.pxz_.shape}") + + else: + self.ptz_ = self.classifier_.coef_.T + # print(self.ptz_.shape) + self.pxz_ = self.pxt_ @ self.ptz_ if len(Y.shape) == 1 and type_of_target(Y) == "binary": self.pxz_ = self.pxz_.reshape( @@ -423,10 +452,35 @@ def decision_function(self, X=None, T=None): if X is not None: X = validate_data(self, X, reset=False) # Or self.classifier_.decision_function(X @ self.pxt_) - return X @ self.pxz_ + self.classifier_.intercept_ + Z = X @ self.pxz_ else: T = check_array(T) - return T @ self.ptz_ + self.classifier_.intercept_ + Z = T @ self.ptz_ + + if isinstance(self.classifier_, 
MultiOutputClassifier): + + n_outputs = len(self.classifier_.estimators_) + n_classes = Z.shape[1] // n_outputs + print(Z.shape) + # unpack to 3d + Z = Z.reshape(Z.shape[0], n_outputs, n_classes) + print(Z.shape) + + # add the intercept for estimator in MultiOutputClassifier + for i, est_ in enumerate(self.classifier_.estimators_): + # print(Z[:, i, :][0, :]) + Z[:, i, :] += est_.intercept_ + # print(Z) + # print() + print(est_.intercept_) + # print() + # print(Z[:, i, :][0, :]) + # swap order of Z axesfrom (n_samples, n_outputs, n_classes) to (n_samples, n_classes, n_outputs) as in paper + + return Z.transpose(0, 2, 1) + + print(self.classifier_.intercept_) + return Z + self.classifier_.intercept_ def predict(self, X=None, T=None): """Predicts the property labels using classification on T.""" From caa86680bf09cbcd67fbb49ea16bc386c1400659 Mon Sep 17 00:00:00 2001 From: Rhushil Vasavada Date: Wed, 4 Jun 2025 14:46:15 -0500 Subject: [PATCH 2/8] Furthering multioutput support for decision_function --- src/skmatter/decomposition/_pcovc.py | 77 ++++++++++++---------------- src/skmatter/utils/_pcovc_utils.py | 46 ++++++++++------- tests/test_pcovc.py | 2 +- 3 files changed, 62 insertions(+), 63 deletions(-) diff --git a/src/skmatter/decomposition/_pcovc.py b/src/skmatter/decomposition/_pcovc.py index 65d42da9a..d31982085 100644 --- a/src/skmatter/decomposition/_pcovc.py +++ b/src/skmatter/decomposition/_pcovc.py @@ -287,10 +287,11 @@ def fit(self, X, Y, W=None): ) if self.classifier != "precomputed": - if self.classifier is None and Y.ndim < 2: - classifier = LogisticRegression() - elif self.classifier is None and Y.ndim >= 2: - classifier = MultiOutputClassifier(estimator=LogisticRegression()) + if self.classifier is None: + if Y.ndim < 2: + classifier = LogisticRegression() + else: + classifier = MultiOutputClassifier(estimator=LogisticRegression()) else: classifier = self.classifier @@ -298,28 +299,28 @@ def fit(self, X, Y, W=None): if isinstance(self.z_classifier_, 
MultiOutputClassifier): W = np.hstack([est_.coef_.T for est_ in self.z_classifier_.estimators_]) - print(W.shape) else: W = self.z_classifier_.coef_.T.reshape(X.shape[1], -1) else: - # If precomputed, use default classifier to predict Y from T - # check for the case of 2D Y -- we need to make sure that this is MultiOutputClassifier instead - if Y.ndim >= 2: - classifier = MultiOutputClassifier(estimator=LogisticRegression) - if W is None: - _ = MultiOutputClassifier(estimator=LogisticRegression).fit(X, Y) - W = np.hstack([est_.coef_.T for est_ in _.estimators_]) - else: + if Y.ndim < 2: + # if self.classifier = "precomputed", use default classifier to predict Y from T classifier = LogisticRegression() if W is None: W = LogisticRegression().fit(X, Y).coef_.T W = W.reshape(X.shape[1], -1) - # print(f"X {X.shape}") - # print(f"W {W.shape}") + else: + classifier = MultiOutputClassifier(estimator=LogisticRegression()) + if W is None: + _ = MultiOutputClassifier(estimator=LogisticRegression).fit(X, Y) + W = np.hstack([est_.coef_.T for est_ in _.estimators_]) + + print(f"X: {X.shape}") + print(f"W: {len(W), W[0]}") + Z = X @ W - # print(f"Z {Z.shape}") + if self.space_ == "feature": self._fit_feature_space(X, Y, Z) else: @@ -451,36 +452,26 @@ def decision_function(self, X=None, T=None): if X is not None: X = validate_data(self, X, reset=False) + + # this is similar to how MultiOutputClassifier handles predict_proba() if n_outputs > 1 + if isinstance(self.classifier_, MultiOutputClassifier): + return [ + est_.decision_function(X @ self.pxt_) + for est_ in self.classifier_.estimators_ + ] + # Or self.classifier_.decision_function(X @ self.pxt_) - Z = X @ self.pxz_ + return X @ self.pxz_ + self.classifier_.intercept_ else: T = check_array(T) - Z = T @ self.ptz_ - - if isinstance(self.classifier_, MultiOutputClassifier): - - n_outputs = len(self.classifier_.estimators_) - n_classes = Z.shape[1] // n_outputs - print(Z.shape) - # unpack to 3d - Z = Z.reshape(Z.shape[0], 
n_outputs, n_classes) - print(Z.shape) - - # add the intercept for estimator in MultiOutputClassifier - for i, est_ in enumerate(self.classifier_.estimators_): - # print(Z[:, i, :][0, :]) - Z[:, i, :] += est_.intercept_ - # print(Z) - # print() - print(est_.intercept_) - # print() - # print(Z[:, i, :][0, :]) - # swap order of Z axesfrom (n_samples, n_outputs, n_classes) to (n_samples, n_classes, n_outputs) as in paper - - return Z.transpose(0, 2, 1) - - print(self.classifier_.intercept_) - return Z + self.classifier_.intercept_ + + if isinstance(self.classifier_, MultiOutputClassifier): + return [ + est_.decision_function(T @ self.ptz_) + for est_ in self.classifier_.estimators_ + ] + + return T @ self.ptz_ + self.classifier_.intercept_ def predict(self, X=None, T=None): """Predicts the property labels using classification on T.""" diff --git a/src/skmatter/utils/_pcovc_utils.py b/src/skmatter/utils/_pcovc_utils.py index ea55dd60a..3203dca82 100644 --- a/src/skmatter/utils/_pcovc_utils.py +++ b/src/skmatter/utils/_pcovc_utils.py @@ -5,6 +5,8 @@ from sklearn.exceptions import NotFittedError from sklearn.utils.validation import check_is_fitted, validate_data +from sklearn.multioutput import MultiOutputClassifier + def check_cl_fit(classifier, X, y): """ @@ -39,29 +41,35 @@ def check_cl_fit(classifier, X, y): # Check compatibility with X validate_data(fitted_classifier, X, y, reset=False, multi_output=True) - # Check compatibility with the number of features in X and the number of - # classes in y - n_classes = len(np.unique(y)) - - if n_classes == 2: - if fitted_classifier.coef_.shape[0] != 1: - raise ValueError( - "For binary classification, expected classifier coefficients " - "to have shape (1, " - f"{X.shape[1]}) but got shape " - f"{fitted_classifier.coef_.shape}" - ) + # Check coefficent compatibility with the number of features in X and the + # number of classes in y + if isinstance(fitted_classifier, MultiOutputClassifier): + for est_ in 
fitted_classifier.estimators_: + check_cl_coef(X, est_.coef_, len(est_.classes_)) else: - if fitted_classifier.coef_.shape[0] != n_classes: - raise ValueError( - "For multiclass classification, expected classifier coefficients " - "to have shape " - f"({n_classes}, {X.shape[1]}) but got shape " - f"{fitted_classifier.coef_.shape}" - ) + check_cl_coef(X, fitted_classifier.coef_, len(np.unique(y))) except NotFittedError: fitted_classifier = clone(classifier) fitted_classifier.fit(X, y) return fitted_classifier + + +def check_cl_coef(X, classifier_coef_, n_classes): + if n_classes == 2: + if classifier_coef_.shape[0] != 1: + raise ValueError( + "For binary classification, expected classifier coefficients " + "to have shape (1, " + f"{X.shape[1]}) but got shape " + f"{classifier_coef_.shape}" + ) + else: + if classifier_coef_.shape[0] != n_classes: + raise ValueError( + "For multiclass classification, expected classifier coefficients " + "to have shape " + f"({n_classes}, {X.shape[1]}) but got shape " + f"{classifier_coef_.shape}" + ) diff --git a/tests/test_pcovc.py b/tests/test_pcovc.py index 5746c610f..7b70036ca 100644 --- a/tests/test_pcovc.py +++ b/tests/test_pcovc.py @@ -531,7 +531,7 @@ def test_incompatible_classifier(self): str(cm.exception), "Classifier must be an instance of " "`LogisticRegression`, `LogisticRegressionCV`, `LinearSVC`, " - "`LinearDiscriminantAnalysis`, `RidgeClassifier`, " + "`LinearDiscriminantAnalysis`, `MultiOutputClassifier`, `RidgeClassifier`, " "`RidgeClassifierCV`, `SGDClassifier`, `Perceptron`, " "or `precomputed`", ) From da3f728f613d7f03aab8843940b5c28c6177def3 Mon Sep 17 00:00:00 2001 From: Rhushil Vasavada Date: Tue, 10 Jun 2025 12:08:53 -0500 Subject: [PATCH 3/8] Starting on docstrings --- src/skmatter/_version.py | 21 +++++++++++++++ src/skmatter/decomposition/_pcovc.py | 38 ++++++++++++++++++---------- src/skmatter/decomposition/_pcovr.py | 2 +- 3 files changed, 46 insertions(+), 15 deletions(-) create mode 100644 
src/skmatter/_version.py diff --git a/src/skmatter/_version.py b/src/skmatter/_version.py new file mode 100644 index 000000000..e0c87fad9 --- /dev/null +++ b/src/skmatter/_version.py @@ -0,0 +1,21 @@ +# file generated by setuptools-scm +# don't change, don't track in version control + +__all__ = ["__version__", "__version_tuple__", "version", "version_tuple"] + +TYPE_CHECKING = False +if TYPE_CHECKING: + from typing import Tuple + from typing import Union + + VERSION_TUPLE = Tuple[Union[int, str], ...] +else: + VERSION_TUPLE = object + +version: str +__version__: str +__version_tuple__: VERSION_TUPLE +version_tuple: VERSION_TUPLE + +__version__ = version = '0.2.1.dev62+g7cef97c.d20250618' +__version_tuple__ = version_tuple = (0, 2, 1, 'dev62', 'g7cef97c.d20250618') diff --git a/src/skmatter/decomposition/_pcovc.py b/src/skmatter/decomposition/_pcovc.py index d31982085..f7fd9ce24 100644 --- a/src/skmatter/decomposition/_pcovc.py +++ b/src/skmatter/decomposition/_pcovc.py @@ -121,8 +121,8 @@ class PCovC(LinearClassifierMixin, _BasePCov): `sklearn.pipeline.Pipeline` with model caching. In such cases, the classifier will be re-fitted on the same training data as the composite estimator. - If None, ``sklearn.linear_model.LogisticRegression()`` - is used as the classifier. + If None and ``Y.ndim < 2``, ``sklearn.linear_model.LogisticRegression()`` is used. + If None and ``Y.ndim == 2``, ``sklearn.multioutput.MultiOutputClassifier()`` is used. 
iterated_power : int or 'auto', default='auto' Number of iterations for the power method computed by @@ -167,11 +167,13 @@ class PCovC(LinearClassifierMixin, _BasePCov): the projector, or weights, from the input space :math:`\mathbf{X}` to the latent-space projection :math:`\mathbf{T}` - pxz_ : ndarray of size :math:`({n_{features}, })` or :math:`({n_{features}, n_{classes}})` + pxz_ : ndarray of size :math:`({n_{features}, })`, :math:`({n_{features}, n_{classes}})`, \ + or :math:`({n_{components}, n_{classes}*n_{outputs}})` the projector, or weights, from the input space :math:`\mathbf{X}` to the class confidence scores :math:`\mathbf{Z}` - ptz_ : ndarray of size :math:`({n_{components}, })` or :math:`({n_{components}, n_{classes}})` + ptz_ : ndarray of size :math:`({n_{components}, })`, :math:`({n_{components}, n_{classes}})` \ + or :math:`({n_{components}, n_{classes}*n_{outputs}})` the projector, or weights, from the latent-space projection :math:`\mathbf{T}` to the class confidence scores :math:`\mathbf{Z}` @@ -251,13 +253,18 @@ def fit(self, X, Y, W=None): scaled to have unit variance, otherwise :math:`\mathbf{X}` should be scaled so that each feature has a variance of 1 / n_features. - Y : numpy.ndarray, shape (n_samples,) - Training data, where n_samples is the number of samples. + Y : numpy.ndarray, shape (n_samples,) or (n_samples, n_outputs) + Training data, where n_samples is the number of samples and + n_outputs is the number of outputs. If classifier parameter is an instance + of ``sklearn.multioutput.MultiOutputClassifier()``, Y can be of shape + (n_samples, n_outputs). - W : numpy.ndarray, shape (n_features, n_classes) + W : numpy.ndarray, shape (n_features, n_classes) or (n_features, n_classes*n_outputs) Classification weights, optional when classifier = `precomputed`. 
If not passed, it is assumed that the weights will be taken from a - linear classifier fit between :math:`\mathbf{X}` and :math:`\mathbf{Y}` + linear classifier fit between :math:`\mathbf{X}` and :math:`\mathbf{Y}`. + In the case of a multioutput classifier ``classifier``, + `` W = np.hstack([est_.coef_.T for est_ in classifier.estimators_])``. """ X, Y = validate_data(self, X, Y, multi_output=True, y_numeric=False) check_classification_targets(Y) @@ -317,7 +324,7 @@ def fit(self, X, Y, W=None): W = np.hstack([est_.coef_.T for est_ in _.estimators_]) print(f"X: {X.shape}") - print(f"W: {len(W), W[0]}") + print(f"W: {W.shape}") Z = X @ W @@ -344,6 +351,7 @@ def fit(self, X, Y, W=None): # print(self.ptz_.shape) self.pxz_ = self.pxt_ @ self.ptz_ + print(self.ptz_.shape) if len(Y.shape) == 1 and type_of_target(Y) == "binary": self.pxz_ = self.pxz_.reshape( X.shape[1], @@ -441,9 +449,12 @@ def decision_function(self, X=None, T=None): Returns ------- - Z : numpy.ndarray, shape (n_samples,) or (n_samples, n_classes) + Z : numpy.ndarray, shape (n_samples,) or (n_samples, n_classes), or a list of \ + n_outputs such arrays if n_outputs > 1 Confidence scores. For binary classification, has shape `(n_samples,)`, - for multiclass classification, has shape `(n_samples, n_classes)` + for multiclass classification, has shape `(n_samples, n_classes)`. If n_outputs > 1, + the list returned can contain such arrays with differing shapes depending on the + number of classes in each output of Y. 
""" check_is_fitted(self, attributes=["pxz_", "ptz_"]) @@ -464,11 +475,10 @@ def decision_function(self, X=None, T=None): return X @ self.pxz_ + self.classifier_.intercept_ else: T = check_array(T) - + if isinstance(self.classifier_, MultiOutputClassifier): return [ - est_.decision_function(T @ self.ptz_) - for est_ in self.classifier_.estimators_ + est_.decision_function(T) for est_ in self.classifier_.estimators_ ] return T @ self.ptz_ + self.classifier_.intercept_ diff --git a/src/skmatter/decomposition/_pcovr.py b/src/skmatter/decomposition/_pcovr.py index 9a038c6ea..72c3fd96c 100644 --- a/src/skmatter/decomposition/_pcovr.py +++ b/src/skmatter/decomposition/_pcovr.py @@ -227,7 +227,7 @@ def fit(self, X, Y, W=None): regressed form of the properties, :math:`{\mathbf{\hat{Y}}}`. W : numpy.ndarray, shape (n_features, n_properties) - Regression weights, optional when regressor= `precomputed`. If not + Regression weights, optional when regressor = `precomputed`. If not passed, it is assumed that `W = np.linalg.lstsq(X, Y, self.tol)[0]` """ X, Y = validate_data(self, X, Y, y_numeric=True, multi_output=True) From cfd8385fa9babc2ffcf4d013c756eeee386f9669 Mon Sep 17 00:00:00 2001 From: Rhushil Vasavada Date: Sun, 22 Jun 2025 11:34:18 -0500 Subject: [PATCH 4/8] Score function and tests --- src/skmatter/_version.py | 4 +- src/skmatter/decomposition/_pcovc.py | 40 +++++++++++ src/skmatter/decomposition/_pcovr.py | 2 +- tests/test_pcovc.py | 102 +++++++++++++++++---------- 4 files changed, 107 insertions(+), 41 deletions(-) diff --git a/src/skmatter/_version.py b/src/skmatter/_version.py index e0c87fad9..db8bdda40 100644 --- a/src/skmatter/_version.py +++ b/src/skmatter/_version.py @@ -17,5 +17,5 @@ __version_tuple__: VERSION_TUPLE version_tuple: VERSION_TUPLE -__version__ = version = '0.2.1.dev62+g7cef97c.d20250618' -__version_tuple__ = version_tuple = (0, 2, 1, 'dev62', 'g7cef97c.d20250618') +__version__ = version = '0.2.1.dev58+gead41e2.d20250623' +__version_tuple__ = 
version_tuple = (0, 2, 1, 'dev58', 'gead41e2.d20250623') diff --git a/src/skmatter/decomposition/_pcovc.py b/src/skmatter/decomposition/_pcovc.py index f7fd9ce24..dcb03fc94 100644 --- a/src/skmatter/decomposition/_pcovc.py +++ b/src/skmatter/decomposition/_pcovc.py @@ -10,6 +10,8 @@ SGDClassifier, ) from sklearn.linear_model._base import LinearClassifierMixin + +from sklearn.base import MultiOutputMixin from sklearn.multioutput import MultiOutputClassifier from sklearn.svm import LinearSVC from sklearn.utils import check_array @@ -19,6 +21,11 @@ from skmatter.utils import check_cl_fit +# No inheritance from MultiOutputMixin because decision_function would fail +# test_check_estimator.py 'check_classifier_multioutput' (line 2479 of estimator_checks.py) +# - this is the only test for MultiOutputClassifiers, so is it OK to exclude this tag? + + class PCovC(LinearClassifierMixin, _BasePCov): r"""Principal Covariates Classification (PCovC). @@ -510,3 +517,36 @@ def transform(self, X=None): and n_features is the number of features. """ return super().transform(X) + + def score(self, X, Y, sample_weight=None): + """Return the accuracy on the given test data and labels. + + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Test samples. + + Y : array-like of shape (n_samples,) or (n_samples, n_outputs) + True labels for `X`. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. Can only be used if the PCovC instance + has been trained on multitarget data. + + Returns + ------- + score : float + Accuracy scores. If the PCovC instance was trained on a 1D Y, + this will call the ``score()`` function defined by + ``sklearn.base.ClassifierMixin``. 
If trained on a 2D Y, this will + call the ``score()`` function defined by + ``sklearn.multioutput.MultiOutputClassifier``, to ensure multi + """ + X, Y = validate_data(self, X, Y, reset=False) + + if isinstance(self.classifier_, MultiOutputClassifier): + # LinearClassifierMixin.score fails with multioutput-multiclass Y + return self.classifier_.score(X @ self.pxt_, Y) + else: + return self.classifier_.score(X @ self.pxt_, Y, sample_weight=sample_weight) diff --git a/src/skmatter/decomposition/_pcovr.py b/src/skmatter/decomposition/_pcovr.py index 72c3fd96c..1c8835f12 100644 --- a/src/skmatter/decomposition/_pcovr.py +++ b/src/skmatter/decomposition/_pcovr.py @@ -414,7 +414,7 @@ def score(self, X, y, T=None): Negative sum of the loss in reconstructing X from the latent-space projection T and the loss in predicting Y from the latent-space projection T """ - X, y = validate_data(self, X, y, reset=False) + X, y = validate_data(self, X, y, reset=False, multi_output=True) if T is None: T = self.transform(X) diff --git a/tests/test_pcovc.py b/tests/test_pcovc.py index 7b70036ca..e62ac561e 100644 --- a/tests/test_pcovc.py +++ b/tests/test_pcovc.py @@ -3,10 +3,10 @@ import numpy as np from sklearn import exceptions -from sklearn.calibration import LinearSVC -from sklearn.datasets import load_breast_cancer as get_dataset +from sklearn.datasets import load_iris as get_dataset from sklearn.decomposition import PCA -from sklearn.linear_model import LogisticRegression +from sklearn.linear_model import LogisticRegression, RidgeClassifier +from sklearn.multioutput import MultiOutputClassifier from sklearn.naive_bayes import GaussianNB from sklearn.preprocessing import StandardScaler from sklearn.utils.validation import check_X_y @@ -75,19 +75,25 @@ def test_simple_reconstruction(self): def test_simple_prediction(self): """ Check that PCovC with a full eigendecomposition at mixing=0 - can fully reconstruct the input properties. + can reproduce a linear classification result. 
""" for space in ["feature", "sample", "auto"]: with self.subTest(space=space): - pcovc = self.model(mixing=0.0, n_components=2, space=space) + pcovc = self.model( + mixing=0.0, + classifier=RidgeClassifier(), + n_components=2, + space=space, + ) pcovc.classifier.fit(self.X, self.Y) Yhat = pcovc.classifier.predict(self.X) pcovc.fit(self.X, self.Y) Yp = pcovc.predict(self.X) + self.assertLessEqual( - np.linalg.norm(Yp - Yhat) ** 2.0 / np.linalg.norm(Yhat) ** 2.0, + np.linalg.norm(Yp - Yhat) ** 2.0 / np.linalg.norm(Yp) ** 2.0, self.error_tol, ) @@ -172,8 +178,8 @@ def test_select_sample_space(self): """ pcovc = self.model(n_components=2, tol=1e-12) - n_samples = self.X.shape[1] - 1 - pcovc.fit(self.X[:n_samples], self.Y[:n_samples]) + n_samples = 2 + pcovc.fit(self.X[49 : 49 + n_samples], self.Y[49 : 49 + n_samples]) self.assertTrue(pcovc.space_ == "sample") @@ -289,7 +295,7 @@ def test_bad_n_components(self): pcovc = self.model( n_components="mle", classifier=LinearSVC(), svd_solver="full" ) - pcovc.fit(self.X[:20], self.Y[:20]) + pcovc.fit(self.X[49:51], self.Y[49:51]) self.assertEqual( str(cm.exception), "n_components='mle' is only supported if n_samples >= n_features", @@ -395,7 +401,7 @@ def test_T_shape(self): """Check that PCovC returns a latent space projection consistent with the shape of the input matrix. """ - n_components = 5 + n_components = 4 pcovc = self.model(n_components=n_components, tol=1e-12) pcovc.fit(self.X, self.Y) T = pcovc.transform(self.X) @@ -414,27 +420,27 @@ def test_Z_shape(self): """Check that PCovC returns an evidence matrix consistent with the number of samples and the number of classes. 
""" - n_components = 5 + n_components = 2 pcovc = self.model(n_components=n_components, tol=1e-12) - pcovc.fit(self.X, self.Y) + pcovc.fit(self.X, np.random.randint(0, 2, size=self.X.shape[0])) # Shape (n_samples, ) for binary classifcation Z = pcovc.decision_function(self.X) - - self.assertTrue(Z.ndim == 1) - self.assertTrue(Z.shape[0] == self.X.shape[0]) - - # Modify Y so that it now contains three classes - Y_multiclass = self.Y.copy() - Y_multiclass[0] = 2 - pcovc.fit(self.X, Y_multiclass) - n_classes = len(np.unique(Y_multiclass)) + self.assertEqual(Z.ndim, 1) + self.assertEqual(Z.shape[0], self.X.shape[0]) # Shape (n_samples, n_classes) for multiclass classification + pcovc.fit(self.X, self.Y) Z = pcovc.decision_function(self.X) - self.assertTrue(Z.ndim == 2) - self.assertTrue((Z.shape[0], Z.shape[1]) == (self.X.shape[0], n_classes)) + self.assertEqual(Z.ndim, 2) + self.assertEqual( + (Z.shape[0], Z.shape[1]), + ( + self.X.shape[0], + len(np.unique(self.Y)), + ), + ) def test_decision_function(self): """Check that PCovC's decision_function works when only T is @@ -544,39 +550,59 @@ def test_none_classifier(self): self.assertTrue(pcovc.classifier_ is not None) def test_incompatible_coef_shape(self): - classifier1 = LogisticRegression() - - # Modify Y to be multiclass - Y_multiclass = self.Y.copy() - Y_multiclass[0] = 2 - - classifier1.fit(self.X, Y_multiclass) - pcovc1 = self.model(mixing=0.5, classifier=classifier1) + cl_multiclass = LogisticRegression() + cl_multiclass.fit(self.X, self.Y) + pcovc_bi = self.model(mixing=0.5, classifier=cl_multiclass) # Binary classification shape mismatch with self.assertRaises(ValueError) as cm: - pcovc1.fit(self.X, self.Y) + pcovc_bi.fit(self.X, np.random.randint(0, 2, size=self.X.shape[0])) self.assertEqual( str(cm.exception), "For binary classification, expected classifier coefficients " "to have shape (1, %d) but got shape %r" - % (self.X.shape[1], classifier1.coef_.shape), + % (self.X.shape[1], 
cl_multiclass.coef_.shape), ) - classifier2 = LogisticRegression() - classifier2.fit(self.X, self.Y) - pcovc2 = self.model(mixing=0.5, classifier=classifier2) + cl_binary = LogisticRegression() + cl_binary.fit(self.X, np.random.randint(0, 2, size=self.X.shape[0])) + pcovc_multiclass = self.model(mixing=0.5, classifier=cl_binary) # Multiclass classification shape mismatch with self.assertRaises(ValueError) as cm: - pcovc2.fit(self.X, Y_multiclass) + pcovc_multiclass.fit(self.X, self.Y) self.assertEqual( str(cm.exception), "For multiclass classification, expected classifier coefficients " "to have shape (%d, %d) but got shape %r" - % (len(np.unique(Y_multiclass)), self.X.shape[1], classifier2.coef_.shape), + % (len(np.unique(self.Y)), self.X.shape[1], cl_binary.coef_.shape), + ) + + +class PCovCMultiOutputTest(PCovCBaseTest): + + def test_projector_shapes(self): + pass + + def test_decision_function(self): + pcovc = PCovC( + classifier=MultiOutputClassifier(LogisticRegression()), n_components=2 ) + Y_double = np.column_stack((self.Y, self.Y[::-1])) + pcovc.fit(self.X, Y_double) + + Z = pcovc.decision_function(self.X) + + # list of (n_samples, n_classes) arrays + self.assertEqual(len(Z), Y_double.shape[1]) + + for est, z_slice in zip(pcovc.z_classifier_.estimators_, Z): + with self.subTest(type="z_arrays"): + # each array is shape (n_samples, n_classes) + self.assertEqual(self.X.shape[0], z_slice.shape[0]) + self.assertEqual(est.coef_.shape[0], z_slice.shape[1]) + if __name__ == "__main__": unittest.main(verbosity=2) From 87a504fe2813e652e69b935943f5ecb6ef5ae350 Mon Sep 17 00:00:00 2001 From: Rhushil Vasavada Date: Mon, 23 Jun 2025 13:28:25 -0500 Subject: [PATCH 5/8] Fixing _version.py tracking --- src/skmatter/_version.py | 21 ----- src/skmatter/decomposition/_pcov.py | 7 +- src/skmatter/decomposition/_pcovc.py | 112 +++++++++++++-------------- src/skmatter/decomposition/_pcovr.py | 4 +- tests/test_pcovr.py | 2 +- 5 files changed, 59 insertions(+), 87 deletions(-) 
delete mode 100644 src/skmatter/_version.py diff --git a/src/skmatter/_version.py b/src/skmatter/_version.py deleted file mode 100644 index db8bdda40..000000000 --- a/src/skmatter/_version.py +++ /dev/null @@ -1,21 +0,0 @@ -# file generated by setuptools-scm -# don't change, don't track in version control - -__all__ = ["__version__", "__version_tuple__", "version", "version_tuple"] - -TYPE_CHECKING = False -if TYPE_CHECKING: - from typing import Tuple - from typing import Union - - VERSION_TUPLE = Tuple[Union[int, str], ...] -else: - VERSION_TUPLE = object - -version: str -__version__: str -__version_tuple__: VERSION_TUPLE -version_tuple: VERSION_TUPLE - -__version__ = version = '0.2.1.dev58+gead41e2.d20250623' -__version_tuple__ = version_tuple = (0, 2, 1, 'dev58', 'gead41e2.d20250623') diff --git a/src/skmatter/decomposition/_pcov.py b/src/skmatter/decomposition/_pcov.py index 04dc93b4e..3b039c5cc 100644 --- a/src/skmatter/decomposition/_pcov.py +++ b/src/skmatter/decomposition/_pcov.py @@ -48,10 +48,9 @@ def __init__( self.random_state = random_state self.whiten = whiten - def fit(self, X): - """Contains the common functionality for the PCovR and PCovC fit methods, - but leaves the rest of the functionality to the subclass. 
- """ + def _initialize_params(self, X): + """Initializes common fit parameters for PCovR and PCovC.""" + # saved for inverse transformations from the latent space, # should be zero in the case that the features have been properly centered self.mean_ = np.mean(X, axis=0) diff --git a/src/skmatter/decomposition/_pcovc.py b/src/skmatter/decomposition/_pcovc.py index dcb03fc94..b1c9e456d 100644 --- a/src/skmatter/decomposition/_pcovc.py +++ b/src/skmatter/decomposition/_pcovc.py @@ -174,15 +174,18 @@ class PCovC(LinearClassifierMixin, _BasePCov): the projector, or weights, from the input space :math:`\mathbf{X}` to the latent-space projection :math:`\mathbf{T}` - pxz_ : ndarray of size :math:`({n_{features}, })`, :math:`({n_{features}, n_{classes}})`, \ - or :math:`({n_{components}, n_{classes}*n_{outputs}})` + pxz_ : ndarray of size :math:`({n_{features}, })`, :math:`({n_{features}, n_{classes}})` the projector, or weights, from the input space :math:`\mathbf{X}` - to the class confidence scores :math:`\mathbf{Z}` + to the class confidence scores :math:`\mathbf{Z}`. In the multioutput case, + has shape , :math:`({n_{features}, n_{classes}*n_{outputs}})`, a flattened form + of a 3D tensor. ptz_ : ndarray of size :math:`({n_{components}, })`, :math:`({n_{components}, n_{classes}})` \ or :math:`({n_{components}, n_{classes}*n_{outputs}})` the projector, or weights, from the latent-space projection - :math:`\mathbf{T}` to the class confidence scores :math:`\mathbf{Z}` + :math:`\mathbf{T}` to the class confidence scores :math:`\mathbf{Z}`. + In the multioutput case, has shape , :math:`({n_{components}, n_{classes}*n_{outputs}})`, + a flattened form of a 3D tensor. explained_variance_ : numpy.ndarray of shape (n_components,) The amount of variance explained by each of the selected components. 
@@ -262,7 +265,7 @@ def fit(self, X, Y, W=None): Y : numpy.ndarray, shape (n_samples,) or (n_samples, n_outputs) Training data, where n_samples is the number of samples and - n_outputs is the number of outputs. If classifier parameter is an instance + n_outputs is the number of outputs. If ``self.classifier`` is an instance of ``sklearn.multioutput.MultiOutputClassifier()``, Y can be of shape (n_samples, n_outputs). @@ -276,8 +279,9 @@ def fit(self, X, Y, W=None): X, Y = validate_data(self, X, Y, multi_output=True, y_numeric=False) check_classification_targets(Y) self.classes_ = np.unique(Y) + self.n_outputs = Y.shape[1] - super().fit(X) + super()._initialize_params(X) compatible_classifiers = ( LogisticRegression, @@ -300,35 +304,23 @@ def fit(self, X, Y, W=None): ", or `precomputed`" ) + # if type_of_target(Y) == "binary" + if self.classifier != "precomputed": if self.classifier is None: - if Y.ndim < 2: - classifier = LogisticRegression() - else: - classifier = MultiOutputClassifier(estimator=LogisticRegression()) + classifier = LogisticRegression() else: classifier = self.classifier self.z_classifier_ = check_cl_fit(classifier, X, Y) - - if isinstance(self.z_classifier_, MultiOutputClassifier): - W = np.hstack([est_.coef_.T for est_ in self.z_classifier_.estimators_]) - else: - W = self.z_classifier_.coef_.T.reshape(X.shape[1], -1) + W = self.z_classifier_.coef_.T.reshape(X.shape[1], -1) else: - if Y.ndim < 2: - # if self.classifier = "precomputed", use default classifier to predict Y from T - classifier = LogisticRegression() - if W is None: - W = LogisticRegression().fit(X, Y).coef_.T - W = W.reshape(X.shape[1], -1) - - else: - classifier = MultiOutputClassifier(estimator=LogisticRegression()) - if W is None: - _ = MultiOutputClassifier(estimator=LogisticRegression).fit(X, Y) - W = np.hstack([est_.coef_.T for est_ in _.estimators_]) + # If precomputed, use default classifier to predict Y from T + classifier = LogisticRegression() + if W is None: + W = 
LogisticRegression().fit(X, Y).coef_.T + W = W.reshape(X.shape[1], -1) print(f"X: {X.shape}") print(f"W: {W.shape}") @@ -460,7 +452,7 @@ def decision_function(self, X=None, T=None): n_outputs such arrays if n_outputs > 1 Confidence scores. For binary classification, has shape `(n_samples,)`, for multiclass classification, has shape `(n_samples, n_classes)`. If n_outputs > 1, - the list returned can contain such arrays with differing shapes depending on the + the list returned can contain arrays with differing shapes depending on the number of classes in each output of Y. """ check_is_fitted(self, attributes=["pxz_", "ptz_"]) @@ -518,35 +510,35 @@ def transform(self, X=None): """ return super().transform(X) - def score(self, X, Y, sample_weight=None): - """Return the accuracy on the given test data and labels. - - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - Test samples. - - Y : array-like of shape (n_samples,) or (n_samples, n_outputs) - True labels for `X`. - - sample_weight : array-like of shape (n_samples,), default=None - Sample weights. Can only be used if the PCovC instance - has been trained on multitarget data. - - Returns - ------- - score : float - Accuracy scores. If the PCovC instance was trained on a 1D Y, - this will call the ``score()`` function defined by - ``sklearn.base.ClassifierMixin``. If trained on a 2D Y, this will - call the ``score()`` function defined by - ``sklearn.multioutput.MultiOutputClassifier``, to ensure multi - """ - X, Y = validate_data(self, X, Y, reset=False) - - if isinstance(self.classifier_, MultiOutputClassifier): - # LinearClassifierMixin.score fails with multioutput-multiclass Y - return self.classifier_.score(X @ self.pxt_, Y) - else: - return self.classifier_.score(X @ self.pxt_, Y, sample_weight=sample_weight) + # def score(self, X, Y, sample_weight=None): + # """Return the accuracy on the given test data and labels. Contains support + # for multiclass-multioutput data. 
+ + # Parameters + # ---------- + # X : array-like of shape (n_samples, n_features) + # Test samples. + + # Y : array-like of shape (n_samples,) or (n_samples, n_outputs) + # True labels for `X`. + + # sample_weight : array-like of shape (n_samples,), default=None + # Sample weights. Can only be used if the PCovC instance + # has been trained on single-target data. + + # Returns + # ------- + # score : float + # Accuracy scores. If the PCovC instance was trained on a 1D Y, + # this will call the ``score()`` function defined by + # ``sklearn.base.ClassifierMixin``. If trained on a 2D Y, this will + # call the ``score()`` function defined by + # ``sklearn.multioutput.MultiOutputClassifier``. + # """ + # X, Y = validate_data(self, X, Y, reset=False) + + # if isinstance(self.classifier_, MultiOutputClassifier): + # # LinearClassifierMixin.score fails with multioutput-multiclass Y + # return self.classifier_.score(X @ self.pxt_, Y) + # else: + # return self.classifier_.score(X @ self.pxt_, Y, sample_weight=sample_weight) diff --git a/src/skmatter/decomposition/_pcovr.py b/src/skmatter/decomposition/_pcovr.py index 1c8835f12..0dda9df7b 100644 --- a/src/skmatter/decomposition/_pcovr.py +++ b/src/skmatter/decomposition/_pcovr.py @@ -231,7 +231,9 @@ def fit(self, X, Y, W=None): passed, it is assumed that `W = np.linalg.lstsq(X, Y, self.tol)[0]` """ X, Y = validate_data(self, X, Y, y_numeric=True, multi_output=True) - super().fit(X) + self.n_outputs = Y.shape[1] + + super()._initialize_params(X) compatible_regressors = (LinearRegression, Ridge, RidgeCV) diff --git a/tests/test_pcovr.py b/tests/test_pcovr.py index 597dcc2ba..0b5dfcb1d 100644 --- a/tests/test_pcovr.py +++ b/tests/test_pcovr.py @@ -401,7 +401,7 @@ def test_default_ncomponents(self): self.assertEqual(pcovr.n_components_, min(self.X.shape)) - def test_Y_Shape(self): + def test_Y_shape(self): pcovr = self.model() self.Y = np.vstack(self.Y) pcovr.fit(self.X, self.Y) From cf991cf73fd9ee69512bfc29400baed008870730 Mon 
Sep 17 00:00:00 2001 From: Rhushil Vasavada Date: Wed, 25 Jun 2025 15:42:09 -0500 Subject: [PATCH 6/8] Continuing multiouput work --- src/skmatter/decomposition/_pcov.py | 2 +- src/skmatter/decomposition/_pcovc.py | 18 +++++++++++++----- tests/test_pcovc.py | 15 +++++++-------- 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/src/skmatter/decomposition/_pcov.py b/src/skmatter/decomposition/_pcov.py index 3b039c5cc..53e447225 100644 --- a/src/skmatter/decomposition/_pcov.py +++ b/src/skmatter/decomposition/_pcov.py @@ -49,7 +49,7 @@ def __init__( self.whiten = whiten def _initialize_params(self, X): - """Initializes common fit parameters for PCovR and PCovC.""" + """Initializes common fit parameters for the PCovR and PCovC.""" # saved for inverse transformations from the latent space, # should be zero in the case that the features have been properly centered diff --git a/src/skmatter/decomposition/_pcovc.py b/src/skmatter/decomposition/_pcovc.py index b1c9e456d..91574b451 100644 --- a/src/skmatter/decomposition/_pcovc.py +++ b/src/skmatter/decomposition/_pcovc.py @@ -25,6 +25,8 @@ # test_check_estimator.py 'check_classifier_multioutput' (line 2479 of estimator_checks.py) # - this is the only test for MultiOutputClassifiers, so is it OK to exclude this tag? +# did a search of all classifiers that inherit from MultiOutputMixin - none of them implement +# decision function, so I don't think we need to inherit class PCovC(LinearClassifierMixin, _BasePCov): r"""Principal Covariates Classification (PCovC). @@ -277,9 +279,10 @@ def fit(self, X, Y, W=None): `` W = np.hstack([est_.coef_.T for est_ in classifier.estimators_])``.
""" X, Y = validate_data(self, X, Y, multi_output=True, y_numeric=False) + check_classification_targets(Y) self.classes_ = np.unique(Y) - self.n_outputs = Y.shape[1] + self.n_outputs = 1 if Y.ndim == 1 else Y.shape[1] super()._initialize_params(X) @@ -302,9 +305,15 @@ def fit(self, X, Y, W=None): "Classifier must be an instance of `" f"{'`, `'.join(c.__name__ for c in compatible_classifiers)}`" ", or `precomputed`" - ) + ) + + # if self.n_outputs == 1: + # classifier = LogisticRegression() + # else: + # classifier = MultiOutputClassifier(estimator=LogisticRegression()) - # if type_of_target(Y) == "binary" + # if self.classifier == "precomputed": + if self.classifier != "precomputed": if self.classifier is None: @@ -313,14 +322,13 @@ def fit(self, X, Y, W=None): classifier = self.classifier self.z_classifier_ = check_cl_fit(classifier, X, Y) - W = self.z_classifier_.coef_.T.reshape(X.shape[1], -1) + W = self.z_classifier_.coef_.T else: # If precomputed, use default classifier to predict Y from T classifier = LogisticRegression() if W is None: W = LogisticRegression().fit(X, Y).coef_.T - W = W.reshape(X.shape[1], -1) print(f"X: {X.shape}") print(f"W: {W.shape}") diff --git a/tests/test_pcovc.py b/tests/test_pcovc.py index e62ac561e..31b2199a2 100644 --- a/tests/test_pcovc.py +++ b/tests/test_pcovc.py @@ -3,7 +3,7 @@ import numpy as np from sklearn import exceptions -from sklearn.datasets import load_iris as get_dataset +from sklearn.datasets import load_breast_cancer as get_dataset from sklearn.decomposition import PCA from sklearn.linear_model import LogisticRegression, RidgeClassifier from sklearn.multioutput import MultiOutputClassifier @@ -426,6 +426,7 @@ def test_Z_shape(self): # Shape (n_samples, ) for binary classifcation Z = pcovc.decision_function(self.X) + print(Z.shape) self.assertEqual(Z.ndim, 1) self.assertEqual(Z.shape[0], self.X.shape[0]) @@ -470,13 +471,11 @@ def test_prefit_classifier(self): pcovc = self.model(mixing=0.5, classifier=classifier) 
pcovc.fit(self.X, self.Y) - Z_classifier = classifier.decision_function(self.X).reshape(self.X.shape[0], -1) - W_classifier = classifier.coef_.T.reshape(self.X.shape[1], -1) + Z_classifier = classifier.decision_function(self.X) + W_classifier = classifier.coef_.T - Z_pcovc = pcovc.z_classifier_.decision_function(self.X).reshape( - self.X.shape[0], -1 - ) - W_pcovc = pcovc.z_classifier_.coef_.T.reshape(self.X.shape[1], -1) + Z_pcovc = pcovc.z_classifier_.decision_function(self.X) + W_pcovc = pcovc.z_classifier_.coef_.T self.assertTrue(np.allclose(Z_classifier, Z_pcovc)) self.assertTrue(np.allclose(W_classifier, W_pcovc)) @@ -485,7 +484,7 @@ def test_precomputed_classification(self): classifier = LogisticRegression() classifier.fit(self.X, self.Y) - W = classifier.coef_.T.reshape(self.X.shape[1], -1) + W = classifier.coef_.T pcovc1 = self.model(mixing=0.5, classifier="precomputed", n_components=1) pcovc1.fit(self.X, self.Y, W) t1 = pcovc1.transform(self.X) From bb7147c963beacbf4f7288a43f5496f9cb4a0841 Mon Sep 17 00:00:00 2001 From: Rhushil Vasavada Date: Fri, 27 Jun 2025 16:30:48 -0500 Subject: [PATCH 7/8] Cleaning things up and adding more tests --- src/skmatter/decomposition/_pcovc.py | 104 +++++++++++++++------------ tests/test_pcovc.py | 81 ++++++++++++++++----- 2 files changed, 123 insertions(+), 62 deletions(-) diff --git a/src/skmatter/decomposition/_pcovc.py b/src/skmatter/decomposition/_pcovc.py index 91574b451..dfe76ab97 100644 --- a/src/skmatter/decomposition/_pcovc.py +++ b/src/skmatter/decomposition/_pcovc.py @@ -28,6 +28,7 @@ # did a search of all classifiers that inherit from MultiOutputMixin - none of them implement # decision function, so I don't think we need to inherit + class PCovC(LinearClassifierMixin, _BasePCov): r"""Principal Covariates Classification (PCovC). 
@@ -178,16 +179,11 @@ class PCovC(LinearClassifierMixin, _BasePCov): pxz_ : ndarray of size :math:`({n_{features}, })`, :math:`({n_{features}, n_{classes}})` the projector, or weights, from the input space :math:`\mathbf{X}` - to the class confidence scores :math:`\mathbf{Z}`. In the multioutput case, - has shape , :math:`({n_{features}, n_{classes}*n_{outputs}})`, a flattened form - of a 3D tensor. + to the class confidence scores :math:`\mathbf{Z}`. - ptz_ : ndarray of size :math:`({n_{components}, })`, :math:`({n_{components}, n_{classes}})` \ - or :math:`({n_{components}, n_{classes}*n_{outputs}})` - the projector, or weights, from the latent-space projection - :math:`\mathbf{T}` to the class confidence scores :math:`\mathbf{Z}`. - In the multioutput case, has shape , :math:`({n_{components}, n_{classes}*n_{outputs}})`, - a flattened form of a 3D tensor. + ptz_ : ndarray of size :math:`({n_{components}, })`, :math:`({n_{components}, n_{classes}})` + the projector, or weights, from the latent-space projection + :math:`\mathbf{T}` to the class confidence scores :math:`\mathbf{Z}`. explained_variance_ : numpy.ndarray of shape (n_components,) The amount of variance explained by each of the selected components. @@ -279,7 +275,7 @@ def fit(self, X, Y, W=None): `` W = np.hstack([est_.coef_.T for est_ in classifier.estimators_])``.
""" X, Y = validate_data(self, X, Y, multi_output=True, y_numeric=False) - + check_classification_targets(Y) self.classes_ = np.unique(Y) self.n_outputs = 1 if Y.ndim == 1 else Y.shape[1] @@ -305,33 +301,51 @@ def fit(self, X, Y, W=None): "Classifier must be an instance of `" f"{'`, `'.join(c.__name__ for c in compatible_classifiers)}`" ", or `precomputed`" - ) + ) - # if self.n_outputs == 1: - # classifier = LogisticRegression() - # else: - # classifier = MultiOutputClassifier(estimator=LogisticRegression()) + if self.n_outputs == 1 and isinstance(self.classifier, MultiOutputClassifier): + raise ValueError( + "Classifier cannot be an instance of `MultiOutputClassifier` when Y is 1D" + ) + + if ( + self.n_outputs != 1 + and self.classifier not in ["precomputed", None] + and not ( + isinstance(self.classifier, MultiOutputClassifier) + or self.classifier == "precomputed" + ) + ): + raise ValueError( + "Classifier must be an instance of `MultiOutputClassifier` when Y is 2D" + ) - # if self.classifier == "precomputed": - + if self.n_outputs == 1: + if self.classifier != "precomputed": + classifier = self.classifier or LogisticRegression() + self.z_classifier_ = check_cl_fit(classifier, X, Y) + W = self.z_classifier_.coef_.T - if self.classifier != "precomputed": - if self.classifier is None: - classifier = LogisticRegression() else: - classifier = self.classifier - - self.z_classifier_ = check_cl_fit(classifier, X, Y) - W = self.z_classifier_.coef_.T + # to be used later on as the classifier fit between T and Y + classifier = LogisticRegression() + if W is None: + W = clone(classifier).fit(X, Y).coef_.T else: - # If precomputed, use default classifier to predict Y from T - classifier = LogisticRegression() - if W is None: - W = LogisticRegression().fit(X, Y).coef_.T + if self.classifier != "precomputed": + classifier = self.classifier or MultiOutputClassifier( + estimator=LogisticRegression() + ) + self.z_classifier_ = check_cl_fit(classifier, X, Y) + W = 
np.hstack([est_.coef_.T for est_ in self.z_classifier_.estimators_]) - print(f"X: {X.shape}") - print(f"W: {W.shape}") + else: + # to be used later on as the classifier fit between T and Y + classifier = MultiOutputClassifier(estimator=LogisticRegression()) + if W is None: + _ = clone(classifier).fit(X, Y) + W = np.hstack([_.coef_.T for _ in _.estimators_]) Z = X @ W @@ -344,7 +358,11 @@ def fit(self, X, Y, W=None): # classifier and steal weights to get pxz and ptz self.classifier_ = clone(classifier).fit(X @ self.pxt_, Y) - if isinstance(self.classifier_, MultiOutputClassifier): + if self.n_outputs == 1: + self.ptz_ = self.classifier_.coef_.T + # print(self.ptz_.shape) + self.pxz_ = self.pxt_ @ self.ptz_ + else: self.ptz_ = np.hstack( [est_.coef_.T for est_ in self.classifier_.estimators_] ) @@ -353,12 +371,7 @@ def fit(self, X, Y, W=None): self.pxz_ = self.pxt_ @ self.ptz_ # print(f"pxz {self.pxz_.shape}") - else: - self.ptz_ = self.classifier_.coef_.T - # print(self.ptz_.shape) - self.pxz_ = self.pxt_ @ self.ptz_ - - print(self.ptz_.shape) + # print(self.ptz_.shape) if len(Y.shape) == 1 and type_of_target(Y) == "binary": self.pxz_ = self.pxz_.reshape( X.shape[1], @@ -460,7 +473,7 @@ def decision_function(self, X=None, T=None): n_outputs such arrays if n_outputs > 1 Confidence scores. For binary classification, has shape `(n_samples,)`, for multiclass classification, has shape `(n_samples, n_classes)`. If n_outputs > 1, - the list returned can contain arrays with differing shapes depending on the + the list can contain arrays with differing shapes depending on the number of classes in each output of Y. 
""" check_is_fitted(self, attributes=["pxz_", "ptz_"]) @@ -471,25 +484,24 @@ def decision_function(self, X=None, T=None): if X is not None: X = validate_data(self, X, reset=False) - # this is similar to how MultiOutputClassifier handles predict_proba() if n_outputs > 1 - if isinstance(self.classifier_, MultiOutputClassifier): + if self.n_outputs == 1: + # Or self.classifier_.decision_function(X @ self.pxt_) + return X @ self.pxz_ + self.classifier_.intercept_ + else: return [ est_.decision_function(X @ self.pxt_) for est_ in self.classifier_.estimators_ ] - - # Or self.classifier_.decision_function(X @ self.pxt_) - return X @ self.pxz_ + self.classifier_.intercept_ else: T = check_array(T) - if isinstance(self.classifier_, MultiOutputClassifier): + if self.n_outputs == 1: + return T @ self.ptz_ + self.classifier_.intercept_ + else: return [ est_.decision_function(T) for est_ in self.classifier_.estimators_ ] - return T @ self.ptz_ + self.classifier_.intercept_ - def predict(self, X=None, T=None): """Predicts the property labels using classification on T.""" check_is_fitted(self, attributes=["pxz_", "ptz_"]) diff --git a/tests/test_pcovc.py b/tests/test_pcovc.py index 31b2199a2..3fa54d307 100644 --- a/tests/test_pcovc.py +++ b/tests/test_pcovc.py @@ -3,7 +3,8 @@ import numpy as np from sklearn import exceptions -from sklearn.datasets import load_breast_cancer as get_dataset +from sklearn.calibration import LinearSVC +from sklearn.datasets import load_iris as get_dataset from sklearn.decomposition import PCA from sklearn.linear_model import LogisticRegression, RidgeClassifier from sklearn.multioutput import MultiOutputClassifier @@ -93,7 +94,7 @@ def test_simple_prediction(self): Yp = pcovc.predict(self.X) self.assertLessEqual( - np.linalg.norm(Yp - Yhat) ** 2.0 / np.linalg.norm(Yp) ** 2.0, + np.linalg.norm(Yp - Yhat) ** 2.0 / np.linalg.norm(Yhat) ** 2.0, self.error_tol, ) @@ -426,7 +427,6 @@ def test_Z_shape(self): # Shape (n_samples, ) for binary classifcation Z = 
pcovc.decision_function(self.X) - print(Z.shape) self.assertEqual(Z.ndim, 1) self.assertEqual(Z.shape[0], self.X.shape[0]) @@ -435,13 +435,7 @@ def test_Z_shape(self): Z = pcovc.decision_function(self.X) self.assertEqual(Z.ndim, 2) - self.assertEqual( - (Z.shape[0], Z.shape[1]), - ( - self.X.shape[0], - len(np.unique(self.Y)), - ), - ) + self.assertEqual(Z.shape, (self.X.shape[0], len(np.unique(self.Y)))) def test_decision_function(self): """Check that PCovC's decision_function works when only T is @@ -551,11 +545,11 @@ def test_none_classifier(self): def test_incompatible_coef_shape(self): cl_multiclass = LogisticRegression() cl_multiclass.fit(self.X, self.Y) - pcovc_bi = self.model(mixing=0.5, classifier=cl_multiclass) + pcovc_binary = self.model(mixing=0.5, classifier=cl_multiclass) # Binary classification shape mismatch with self.assertRaises(ValueError) as cm: - pcovc_bi.fit(self.X, np.random.randint(0, 2, size=self.X.shape[0])) + pcovc_binary.fit(self.X, np.random.randint(0, 2, size=self.X.shape[0])) self.assertEqual( str(cm.exception), "For binary classification, expected classifier coefficients " @@ -580,15 +574,56 @@ def test_incompatible_coef_shape(self): class PCovCMultiOutputTest(PCovCBaseTest): - def test_projector_shapes(self): - pass + def test_prefit_multioutput(self): + """Check that PCovC works if a prefit classifier is passed when `n_ouputs > 1`.""" + classifier = MultiOutputClassifier(estimator=LogisticRegression()) + Y_double = np.column_stack((self.Y, self.Y)) - def test_decision_function(self): + classifier.fit(self.X, Y_double) + pcovc = self.model(mixing=0.25, classifier=classifier) + pcovc.fit(self.X, Y_double) + + W_classifier = np.hstack([est_.coef_.T for est_ in classifier.estimators_]) + Z_classifier = self.X @ W_classifier + + W_pcovc = np.hstack([est_.coef_.T for est_ in pcovc.z_classifier_.estimators_]) + Z_pcovc = self.X @ W_pcovc + + self.assertTrue(np.allclose(Z_classifier, Z_pcovc)) + self.assertTrue(np.allclose(W_classifier, 
W_pcovc)) + + def test_precomputed_multioutput(self): + """Check that PCovC works if classifier=`precomputed` and `n_ouputs > 1`.""" + classifier = MultiOutputClassifier(estimator=LogisticRegression()) + Y_double = np.column_stack((self.Y, self.Y)) + + classifier.fit(self.X, Y_double) + W = np.hstack([est_.coef_.T for est_ in classifier.estimators_]) + pcovc1 = self.model(mixing=0.5, classifier="precomputed", n_components=1) + pcovc1.fit(self.X, Y_double, W) + t1 = pcovc1.transform(self.X) + + pcovc2 = self.model(mixing=0.5, classifier=classifier, n_components=1) + pcovc2.fit(self.X, Y_double) + t2 = pcovc2.transform(self.X) + + self.assertTrue(np.linalg.norm(t1 - t2) < self.error_tol) + + # Now check for match when W is not passed: + pcovc3 = self.model(mixing=0.5, classifier="precomputed", n_components=1) + pcovc3.fit(self.X, Y_double) + t3 = pcovc3.transform(self.X) + + self.assertTrue(np.linalg.norm(t3 - t2) < self.error_tol) + self.assertTrue(np.linalg.norm(t3 - t1) < self.error_tol) + + def test_Z_shape_multioutput(self): + """Check that PCovC returns the evidence Z in the desired form when `n_ouputs > 1`.""" pcovc = PCovC( classifier=MultiOutputClassifier(LogisticRegression()), n_components=2 ) - Y_double = np.column_stack((self.Y, self.Y[::-1])) + Y_double = np.column_stack((self.Y, self.Y)) pcovc.fit(self.X, Y_double) Z = pcovc.decision_function(self.X) @@ -602,6 +637,20 @@ def test_decision_function(self): self.assertEqual(self.X.shape[0], z_slice.shape[0]) self.assertEqual(est.coef_.shape[0], z_slice.shape[1]) + def test_decision_function_multioutput(self): + """Check that PCovC's decision_function works in edge cases when `n_ouputs > 1`.""" + pcovc = self.model(classifier=MultiOutputClassifier(estimator=LinearSVC())) + pcovc.fit(self.X, np.column_stack((self.Y, self.Y))) + with self.assertRaises(ValueError) as cm: + _ = pcovc.decision_function() + self.assertEqual( + str(cm.exception), + "Either X or T must be supplied.", + ) + + T = 
pcovc.transform(self.X) + _ = pcovc.decision_function(T=T) + if __name__ == "__main__": unittest.main(verbosity=2) From 838c116a348e27e106072d036fa12a756017ffe2 Mon Sep 17 00:00:00 2001 From: Rhushil Vasavada Date: Mon, 30 Jun 2025 11:09:12 -0500 Subject: [PATCH 8/8] Adding multioutput support for KPCovC --- src/skmatter/decomposition/_kernel_pcovc.py | 102 ++++++++++----- src/skmatter/decomposition/_pcovc.py | 129 ++++++------------- src/skmatter/decomposition/_pcovr.py | 1 - src/skmatter/utils/_pcovc_utils.py | 6 +- tests/test_kernel_pcovc.py | 130 +++++++++++++++++--- tests/test_pcovc.py | 17 ++- 6 files changed, 237 insertions(+), 148 deletions(-) diff --git a/src/skmatter/decomposition/_kernel_pcovc.py b/src/skmatter/decomposition/_kernel_pcovc.py index 63cc5f9fb..92ed4cb05 100644 --- a/src/skmatter/decomposition/_kernel_pcovc.py +++ b/src/skmatter/decomposition/_kernel_pcovc.py @@ -1,6 +1,7 @@ import numpy as np from sklearn import clone +from sklearn.multioutput import MultiOutputClassifier from sklearn.svm import LinearSVC from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.linear_model import ( @@ -24,7 +25,7 @@ class KernelPCovC(LinearClassifierMixin, _BaseKPCov): r"""Kernel Principal Covariates Classification (KPCovC). - KPCovC is a modification on the PrincipalCovariates Classification + KPCovC is a modification on the Principal Covariates Classification proposed in [Jorgensen2025]_. It determines a latent-space projection :math:`\mathbf{T}` which minimizes a combined loss in supervised and unsupervised tasks in the reproducing kernel Hilbert space (RKHS). @@ -52,6 +53,9 @@ class KernelPCovC(LinearClassifierMixin, _BaseKPCov): n_components == n_samples + n_outputs : int + The number of outputs when ``fit`` is performed. 
+ svd_solver : {'auto', 'full', 'arpack', 'randomized'}, default='auto' If auto : The solver is selected by a default policy based on `X.shape` and @@ -78,13 +82,14 @@ class KernelPCovC(LinearClassifierMixin, _BaseKPCov): - ``sklearn.linear_model.LogisticRegressionCV()`` - ``sklearn.svm.LinearSVC()`` - ``sklearn.discriminant_analysis.LinearDiscriminantAnalysis()`` + - ``sklearn.multioutput.MultiOutputClassifier()`` - ``sklearn.linear_model.RidgeClassifier()`` - ``sklearn.linear_model.RidgeClassifierCV()`` - ``sklearn.linear_model.Perceptron()`` If a pre-fitted classifier is provided, it is used to compute :math:`{\mathbf{Z}}`. - If None, ``sklearn.linear_model.LogisticRegression()`` - is used as the classifier. + If None and ``n_outputs < 2``, ``sklearn.linear_model.LogisticRegression()`` is used. + If None and ``n_outputs >= 2``, ``sklearn.multioutput.MultiOutputClassifier()`` is used. kernel : {"linear", "poly", "rbf", "sigmoid", "precomputed"} or callable, default="linear" Kernel. @@ -132,6 +137,9 @@ class KernelPCovC(LinearClassifierMixin, _BaseKPCov): Attributes ---------- + n_outputs : int + The number of outputs when ``fit`` is performed. + classifier : estimator object The linear classifier passed for fitting. If pre-fitted, it is assummed to be fit on a precomputed kernel :math:`\mathbf{K}` and :math:`\mathbf{Y}`. @@ -268,9 +276,11 @@ def fit(self, X, Y, W=None): self: object Returns the instance itself.
""" - X, Y = validate_data(self, X, Y, y_numeric=False) + X, Y = validate_data(self, X, Y, multi_output=True, y_numeric=False) + check_classification_targets(Y) self.classes_ = np.unique(Y) + self.n_outputs = 1 if Y.ndim == 1 else Y.shape[1] super().fit(X) @@ -285,6 +295,7 @@ def fit(self, X, Y, W=None): LogisticRegressionCV, LinearSVC, LinearDiscriminantAnalysis, + MultiOutputClassifier, RidgeClassifier, RidgeClassifierCV, SGDClassifier, @@ -300,28 +311,37 @@ def fit(self, X, Y, W=None): ", or `precomputed`" ) - if self.classifier != "precomputed": - if self.classifier is None: - classifier = LogisticRegression() - else: - classifier = self.classifier + multioutput = self.n_outputs != 1 + precomputed = self.classifier == "precomputed" - # for convergence warnings - if hasattr(classifier, "max_iter") and ( - classifier.max_iter is None or classifier.max_iter < 500 - ): - classifier.max_iter = 500 + if self.classifier is None or precomputed: + # used as the default classifier for subsequent computations + classifier = ( + MultiOutputClassifier(LogisticRegression()) + if multioutput + else LogisticRegression() + ) + else: + classifier = self.classifier - # Check if classifier is fitted; if not, fit with precomputed K - self.z_classifier_ = check_cl_fit(classifier, K, Y) - W = self.z_classifier_.coef_.T.reshape(K.shape[1], -1) + if hasattr(classifier, "max_iter") and ( + classifier.max_iter is None or classifier.max_iter < 500 + ): + classifier.max_iter = 500 + + if precomputed and W is None: + _ = clone(classifier).fit(K, Y) + if multioutput: + W = np.hstack([_.coef_.T for _ in _.estimators_]) + else: + W = _.coef_.T else: - # If precomputed, use default classifier to predict Y from T - classifier = LogisticRegression(max_iter=500) - if W is None: - W = LogisticRegression().fit(K, Y).coef_.T - W = W.reshape(K.shape[1], -1) + self.z_classifier_ = check_cl_fit(classifier, K, Y) + if multioutput: + W = np.hstack([est_.coef_.T for est_ in self.z_classifier_.estimators_]) 
+ else: + W = self.z_classifier_.coef_.T Z = K @ W @@ -334,10 +354,16 @@ def fit(self, X, Y, W=None): self.classifier_ = clone(classifier).fit(K @ self.pkt_, Y) - self.ptz_ = self.classifier_.coef_.T - self.pkz_ = self.pkt_ @ self.ptz_ + if multioutput: + self.ptz_ = np.hstack( + [est_.coef_.T for est_ in self.classifier_.estimators_] + ) + self.pkz_ = self.pkt_ @ self.ptz_ + else: + self.ptz_ = self.classifier_.coef_.T + self.pkz_ = self.pkt_ @ self.ptz_ - if len(Y.shape) == 1 and type_of_target(Y) == "binary": + if not multioutput and type_of_target(Y) == "binary": self.pkz_ = self.pkz_.reshape( K.shape[1], ) @@ -346,6 +372,7 @@ def fit(self, X, Y, W=None): ) self.components_ = self.pkt_.T # for sklearn compatibility + return self def predict(self, X=None, T=None): @@ -425,9 +452,12 @@ def decision_function(self, X=None, T=None): Returns ------- - Z : numpy.ndarray, shape (n_samples,) or (n_samples, n_classes) + Z : numpy.ndarray, shape (n_samples,) or (n_samples, n_classes), or a list of \ + n_outputs such arrays if n_outputs > 1 Confidence scores. For binary classification, has shape `(n_samples,)`, - for multiclass classification, has shape `(n_samples, n_classes)` + for multiclass classification, has shape `(n_samples, n_classes)`. + If n_outputs > 1, the list can contain arrays with differing shapes + depending on the number of classes in each output of Y. 
""" check_is_fitted(self, attributes=["pkz_", "ptz_"]) @@ -440,9 +470,21 @@ def decision_function(self, X=None, T=None): if self.center: K = self.centerer_.transform(K) - # Or self.classifier_.decision_function(K @ self.pxt_) - return K @ self.pkz_ + self.classifier_.intercept_ + if self.n_outputs == 1: + # Or self.classifier_.decision_function(K @ self.pkt_) + return K @ self.pkz_ + self.classifier_.intercept_ + else: + return [ + est_.decision_function(K @ self.pkt_) + for est_ in self.classifier_.estimators_ + ] else: T = check_array(T) - return T @ self.ptz_ + self.classifier_.intercept_ + + if self.n_outputs == 1: + return T @ self.ptz_ + self.classifier_.intercept_ + else: + return [ + est_.decision_function(T) for est_ in self.classifier_.estimators_ + ] diff --git a/src/skmatter/decomposition/_pcovc.py b/src/skmatter/decomposition/_pcovc.py index dfe76ab97..8ddeb0e0c 100644 --- a/src/skmatter/decomposition/_pcovc.py +++ b/src/skmatter/decomposition/_pcovc.py @@ -22,11 +22,11 @@ # No inheritance from MultiOutputMixin because decision_function would fail -# test_check_estimator.py 'check_classifier_multioutput' (line 2479 of estimator_checks.py) -# - this is the only test for MultiOutputClassifiers, so is it OK to exclude this tag? +# test_check_estimator.py 'check_classifier_multioutput' (line 2479 of estimator_checks.py). +# This is the only test for multioutput classifiers, so is it OK to exclude this tag?
# did a search of all classifiers that inherit from MultiOutputMixin - none of them implement -# decision function, so I don't think we need to inherit +# decision function class PCovC(LinearClassifierMixin, _BasePCov): @@ -112,7 +112,7 @@ class PCovC(LinearClassifierMixin, _BasePCov): default=`sample` when :math:`{n_{samples} < n_{features}}` and `feature` when :math:`{n_{features} < n_{samples}}` - classifier: `estimator object` or `precomputed`, default=None + classifier: `estimator object` or `precomputed`, default=None classifier for computing :math:`{\mathbf{Z}}`. The classifier should be one of the following: @@ -120,6 +120,7 @@ class PCovC(LinearClassifierMixin, _BasePCov): - ``sklearn.linear_model.LogisticRegressionCV()`` - ``sklearn.svm.LinearSVC()`` - ``sklearn.discriminant_analysis.LinearDiscriminantAnalysis()`` + - ``sklearn.multioutput.MultiOutputClassifier()`` - ``sklearn.linear_model.RidgeClassifier()`` - ``sklearn.linear_model.RidgeClassifierCV()`` - ``sklearn.linear_model.Perceptron()`` @@ -131,8 +132,8 @@ class PCovC(LinearClassifierMixin, _BasePCov): `sklearn.pipeline.Pipeline` with model caching. In such cases, the classifier will be re-fitted on the same training data as the composite estimator. - If None and ``Y.ndim < 2``, ``sklearn.linear_model.LogisticRegression()`` is used. - If None and ``Y.ndim == 2``, ``sklearn.multioutput.MultiOutputClassifier()`` is used. + If None and ``n_outputs < 2``, ``sklearn.linear_model.LogisticRegression()`` is used. + If None and ``n_outputs >= 2``, ``sklearn.multioutput.MultiOutputClassifier()`` is used. iterated_power : int or 'auto', default='auto' Number of iterations for the power method computed by @@ -164,6 +165,9 @@ class PCovC(LinearClassifierMixin, _BasePCov): n_components, or the lesser value of n_features and n_samples if n_components is None. + n_outputs : int + The number of outputs when ``fit`` is performed. + classifier : estimator object The linear classifier passed for fitting.
@@ -263,16 +267,14 @@ def fit(self, X, Y, W=None): Y : numpy.ndarray, shape (n_samples,) or (n_samples, n_outputs) Training data, where n_samples is the number of samples and - n_outputs is the number of outputs. If ``self.classifier`` is an instance - of ``sklearn.multioutput.MultiOutputClassifier()``, Y can be of shape - (n_samples, n_outputs). + n_outputs is the number of outputs. W : numpy.ndarray, shape (n_features, n_classes) or (n_features, n_classes*n_outputs) Classification weights, optional when classifier = `precomputed`. If not passed, it is assumed that the weights will be taken from a linear classifier fit between :math:`\mathbf{X}` and :math:`\mathbf{Y}`. - In the case of a multioutput classifier ``classifier``, - `` W = np.hstack([est_.coef_.T for est_ in classifier.estimators_])``. + In the multioutput case, + ``W = np.hstack([est_.coef_.T for est_ in classifier.estimators_])``. """ X, Y = validate_data(self, X, Y, multi_output=True, y_numeric=False) @@ -303,49 +305,31 @@ def fit(self, X, Y, W=None): ", or `precomputed`" ) - if self.n_outputs == 1 and isinstance(self.classifier, MultiOutputClassifier): - raise ValueError( - "Classifier cannot be an instance of `MultiOutputClassifier` when Y is 1D" - ) + multioutput = self.n_outputs != 1 + precomputed = self.classifier == "precomputed" - if ( - self.n_outputs != 1 - and self.classifier not in ["precomputed", None] - and not ( - isinstance(self.classifier, MultiOutputClassifier) - or self.classifier == "precomputed" - ) - ): - raise ValueError( - "Classifier must be an instance of `MultiOutputClassifier` when Y is 2D" + if self.classifier is None or precomputed: + # used as the default classifier for subsequent computations + classifier = ( + MultiOutputClassifier(LogisticRegression()) + if multioutput + else LogisticRegression() ) + else: + classifier = self.classifier - if self.n_outputs == 1: - if self.classifier != "precomputed": - classifier = self.classifier or LogisticRegression() -
self.z_classifier_ = check_cl_fit(classifier, X, Y) - W = self.z_classifier_.coef_.T - + if precomputed and W is None: + _ = clone(classifier).fit(X, Y) + if multioutput: + W = np.hstack([_.coef_.T for _ in _.estimators_]) else: - # to be used later on as the classifier fit between T and Y - classifier = LogisticRegression() - if W is None: - W = clone(classifier).fit(X, Y).coef_.T - + W = _.coef_.T else: - if self.classifier != "precomputed": - classifier = self.classifier or MultiOutputClassifier( - estimator=LogisticRegression() - ) - self.z_classifier_ = check_cl_fit(classifier, X, Y) + self.z_classifier_ = check_cl_fit(classifier, X, Y) + if multioutput: W = np.hstack([est_.coef_.T for est_ in self.z_classifier_.estimators_]) - else: - # to be used later on as the classifier fit between T and Y - classifier = MultiOutputClassifier(estimator=LogisticRegression()) - if W is None: - _ = clone(classifier).fit(X, Y) - W = np.hstack([_.coef_.T for _ in _.estimators_]) + W = self.z_classifier_.coef_.T Z = X @ W @@ -358,11 +342,7 @@ def fit(self, X, Y, W=None): # classifier and steal weights to get pxz and ptz self.classifier_ = clone(classifier).fit(X @ self.pxt_, Y) - if self.n_outputs == 1: - self.ptz_ = self.classifier_.coef_.T - # print(self.ptz_.shape) - self.pxz_ = self.pxt_ @ self.ptz_ - else: + if multioutput: self.ptz_ = np.hstack( [est_.coef_.T for est_ in self.classifier_.estimators_] ) @@ -370,9 +350,13 @@ def fit(self, X, Y, W=None): # print(f"ptz {self.ptz_.shape}") self.pxz_ = self.pxt_ @ self.ptz_ # print(f"pxz {self.pxz_.shape}") + else: + self.ptz_ = self.classifier_.coef_.T + # print(self.ptz_.shape) + self.pxz_ = self.pxt_ @ self.ptz_ # print(self.ptz_.shape) - if len(Y.shape) == 1 and type_of_target(Y) == "binary": + if not multioutput and type_of_target(Y) == "binary": self.pxz_ = self.pxz_.reshape( X.shape[1], ) @@ -472,9 +456,9 @@ def decision_function(self, X=None, T=None): Z : numpy.ndarray, shape (n_samples,) or (n_samples, n_classes), or a 
list of \ n_outputs such arrays if n_outputs > 1 Confidence scores. For binary classification, has shape `(n_samples,)`, - for multiclass classification, has shape `(n_samples, n_classes)`. If n_outputs > 1, - the list can contain arrays with differing shapes depending on the - number of classes in each output of Y. + for multiclass classification, has shape `(n_samples, n_classes)`. + If n_outputs > 1, the list can contain arrays with differing shapes + depending on the number of classes in each output of Y. """ check_is_fitted(self, attributes=["pxz_", "ptz_"]) @@ -529,36 +513,3 @@ def transform(self, X=None): and n_features is the number of features. """ return super().transform(X) - - # def score(self, X, Y, sample_weight=None): - # """Return the accuracy on the given test data and labels. Contains support - # for multiclass-multioutput data. - - # Parameters - # ---------- - # X : array-like of shape (n_samples, n_features) - # Test samples. - - # Y : array-like of shape (n_samples,) or (n_samples, n_outputs) - # True labels for `X`. - - # sample_weight : array-like of shape (n_samples,), default=None - # Sample weights. Can only be used if the PCovC instance - # has been trained on single-target data. - - # Returns - # ------- - # score : float - # Accuracy scores. If the PCovC instance was trained on a 1D Y, - # this will call the ``score()`` function defined by - # ``sklearn.base.ClassifierMixin``. If trained on a 2D Y, this will - # call the ``score()`` function defined by - # ``sklearn.multioutput.MultiOutputClassifier``. 
- # """ - # X, Y = validate_data(self, X, Y, reset=False) - - # if isinstance(self.classifier_, MultiOutputClassifier): - # # LinearClassifierMixin.score fails with multioutput-multiclass Y - # return self.classifier_.score(X @ self.pxt_, Y) - # else: - # return self.classifier_.score(X @ self.pxt_, Y, sample_weight=sample_weight) diff --git a/src/skmatter/decomposition/_pcovr.py b/src/skmatter/decomposition/_pcovr.py index 0dda9df7b..45326fe40 100644 --- a/src/skmatter/decomposition/_pcovr.py +++ b/src/skmatter/decomposition/_pcovr.py @@ -231,7 +231,6 @@ def fit(self, X, Y, W=None): passed, it is assumed that `W = np.linalg.lstsq(X, Y, self.tol)[0]` """ X, Y = validate_data(self, X, Y, y_numeric=True, multi_output=True) - self.n_outputs = Y.shape[1] super()._initialize_params(X) diff --git a/src/skmatter/utils/_pcovc_utils.py b/src/skmatter/utils/_pcovc_utils.py index 3203dca82..e1f346b85 100644 --- a/src/skmatter/utils/_pcovc_utils.py +++ b/src/skmatter/utils/_pcovc_utils.py @@ -45,9 +45,9 @@ def check_cl_fit(classifier, X, y): # number of classes in y if isinstance(fitted_classifier, MultiOutputClassifier): for est_ in fitted_classifier.estimators_: - check_cl_coef(X, est_.coef_, len(est_.classes_)) + _check_cl_coef(X, est_.coef_, len(est_.classes_)) else: - check_cl_coef(X, fitted_classifier.coef_, len(np.unique(y))) + _check_cl_coef(X, fitted_classifier.coef_, len(np.unique(y))) except NotFittedError: fitted_classifier = clone(classifier) @@ -56,7 +56,7 @@ def check_cl_fit(classifier, X, y): return fitted_classifier -def check_cl_coef(X, classifier_coef_, n_classes): +def _check_cl_coef(X, classifier_coef_, n_classes): if n_classes == 2: if classifier_coef_.shape[0] != 1: raise ValueError( diff --git a/tests/test_kernel_pcovc.py b/tests/test_kernel_pcovc.py index 10ef589af..40872e10d 100644 --- a/tests/test_kernel_pcovc.py +++ b/tests/test_kernel_pcovc.py @@ -4,10 +4,11 @@ from sklearn import exceptions from sklearn.calibration import LinearSVC from 
sklearn.datasets import load_breast_cancer as get_dataset +from sklearn.multioutput import MultiOutputClassifier from sklearn.naive_bayes import GaussianNB from sklearn.utils.validation import check_X_y from sklearn.preprocessing import StandardScaler -from sklearn.linear_model import LogisticRegression, RidgeClassifier +from sklearn.linear_model import LogisticRegression, Perceptron, RidgeClassifier from sklearn.metrics.pairwise import pairwise_kernels from skmatter.decomposition import KernelPCovC @@ -30,17 +31,12 @@ def __init__(self, *args, **kwargs): scaler = StandardScaler() self.X = scaler.fit_transform(self.X) - self.model = ( - lambda mixing=0.5, - classifier=LogisticRegression(), - n_components=4, - **kwargs: KernelPCovC( - mixing=mixing, - classifier=classifier, - n_components=n_components, - svd_solver=kwargs.pop("svd_solver", "full"), - **kwargs, - ) + self.model = lambda mixing=0.5, classifier=LogisticRegression(), n_components=4, **kwargs: KernelPCovC( + mixing=mixing, + classifier=classifier, + n_components=n_components, + svd_solver=kwargs.pop("svd_solver", "full"), + **kwargs, ) def setUp(self): @@ -222,7 +218,10 @@ def test_prefit_classifier(self): classifier = LinearSVC() classifier.fit(K, self.Y) - kpcovc = KernelPCovC(mixing=0.5, classifier=classifier, **kernel_params) + kpcovc = KernelPCovC( + mixing=0.5, + classifier=classifier, + ) kpcovc.fit(self.X, self.Y) Z_classifier = classifier.decision_function(K).reshape(K.shape[0], -1) @@ -261,7 +260,7 @@ def test_incompatible_classifier(self): str(cm.exception), "Classifier must be an instance of " "`LogisticRegression`, `LogisticRegressionCV`, `LinearSVC`, " - "`LinearDiscriminantAnalysis`, `RidgeClassifier`, " + "`LinearDiscriminantAnalysis`, `MultiOutputClassifier`, `RidgeClassifier`, " "`RidgeClassifierCV`, `SGDClassifier`, `Perceptron`, " "or `precomputed`", ) @@ -283,7 +282,10 @@ def test_incompatible_coef_shape(self): classifier1 = LinearSVC() classifier1.fit(K, Y_multiclass) - kpcovc1 = 
self.model(mixing=0.5, classifier=classifier1, **kernel_params) + kpcovc1 = self.model( + mixing=0.5, + classifier=classifier1, + ) # Binary classification shape mismatch with self.assertRaises(ValueError) as cm: @@ -492,5 +494,103 @@ def test_bad_n_components(self): ) +class KernelPCovCMultiOutputTest(KernelPCovCBaseTest): + + def test_prefit_multioutput(self): + """Check that KPCovC works if a prefit classifier is passed when `n_outputs > 1`.""" + kernel_params = {"kernel": "sigmoid", "gamma": 1, "degree": 3, "coef0": 0} + K = pairwise_kernels( + self.X, metric="sigmoid", filter_params=True, **kernel_params + ) + + classifier = MultiOutputClassifier(estimator=LogisticRegression()) + Y_double = np.column_stack((self.Y, self.Y)) + + classifier.fit(K, Y_double) + kpcovc = self.model( + mixing=0.10, + classifier=classifier, + ) + kpcovc.fit(self.X, Y_double) + + W_classifier = np.hstack([est_.coef_.T for est_ in classifier.estimators_]) + Z_classifier = K @ W_classifier + + W_kpcovc = np.hstack( + [est_.coef_.T for est_ in kpcovc.z_classifier_.estimators_] + ) + Z_kpcovc = K @ W_kpcovc + + self.assertTrue(np.allclose(Z_classifier, Z_kpcovc)) + self.assertTrue(np.allclose(W_classifier, W_kpcovc)) + + def test_precomputed_multioutput(self): + """Check that KPCovC works if classifier=`precomputed` and `n_outputs > 1`.""" + kernel_params = {"kernel": "linear", "gamma": 5, "degree": 3, "coef0": 2} + K = pairwise_kernels( + self.X, metric="linear", filter_params=True, **kernel_params + ) + + classifier = MultiOutputClassifier(estimator=LogisticRegression()) + Y_double = np.column_stack((self.Y, self.Y)) + + classifier.fit(K, Y_double) + W = np.hstack([est_.coef_.T for est_ in classifier.estimators_]) + + kpcovc1 = self.model(mixing=0.5, classifier="precomputed", **kernel_params) + kpcovc1.fit(self.X, Y_double, W) + t1 = kpcovc1.transform(self.X) + + kpcovc2 = self.model(mixing=0.5, classifier=classifier, **kernel_params) + kpcovc2.fit(self.X, Y_double) + t2 = 
kpcovc2.transform(self.X) + + self.assertTrue(np.linalg.norm(t1 - t2) < self.error_tol) + + # Now check for match when W is not passed: + kpcovc3 = self.model(mixing=0.5, classifier="precomputed", **kernel_params) + kpcovc3.fit(self.X, Y_double) + t3 = kpcovc3.transform(self.X) + + self.assertTrue(np.linalg.norm(t3 - t2) < self.error_tol) + self.assertTrue(np.linalg.norm(t3 - t1) < self.error_tol) + + def test_Z_shape_multioutput(self): + """Check that KPCovC returns the evidence Z in the desired form when `n_outputs > 1`.""" + kpcovc = KernelPCovC(classifier=MultiOutputClassifier(estimator=Perceptron())) + + Y_double = np.column_stack((self.Y, self.Y)) + kpcovc.fit(self.X, Y_double) + + Z = kpcovc.decision_function(self.X) + + # list of (n_samples, ) arrays when each column of Y is binary + self.assertEqual(len(Z), Y_double.shape[1]) + + for est, z_slice in zip(kpcovc.z_classifier_.estimators_, Z): + with self.subTest(type="z_arrays"): + # each array is shape (n_samples, ): + self.assertEqual(self.X.shape[0], z_slice.shape[0]) + self.assertEqual(z_slice.ndim, 1) + + def test_decision_function_multioutput(self): + """Check that KPCovC's decision_function works in edge cases when `n_outputs > 1`.""" + kpcovc = self.model( + classifier=MultiOutputClassifier(estimator=LinearSVC()), center=True + ) + kpcovc.fit(self.X, np.column_stack((self.Y, self.Y))) + + with self.assertRaises(ValueError) as cm: + _ = kpcovc.decision_function() + self.assertEqual( + str(cm.exception), + "Either X or T must be supplied.", + ) + + _ = kpcovc.decision_function(self.X) + T = kpcovc.transform(self.X) + _ = kpcovc.decision_function(T=T) + + if __name__ == "__main__": unittest.main(verbosity=2) diff --git a/tests/test_pcovc.py b/tests/test_pcovc.py index 3fa54d307..c7a950904 100644 --- a/tests/test_pcovc.py +++ b/tests/test_pcovc.py @@ -26,7 +26,6 @@ def __init__(self, *args, **kwargs): ) self.error_tol = 1e-5 - self.X, self.Y = get_dataset(return_X_y=True) scaler = StandardScaler() @@ 
-575,7 +574,7 @@ def test_incompatible_coef_shape(self): class PCovCMultiOutputTest(PCovCBaseTest): def test_prefit_multioutput(self): - """Check that PCovC works if a prefit classifier is passed when `n_ouputs > 1`.""" + """Check that PCovC works if a prefit classifier is passed when `n_outputs > 1`.""" classifier = MultiOutputClassifier(estimator=LogisticRegression()) Y_double = np.column_stack((self.Y, self.Y)) @@ -593,7 +592,7 @@ def test_prefit_multioutput(self): self.assertTrue(np.allclose(W_classifier, W_pcovc)) def test_precomputed_multioutput(self): - """Check that PCovC works if classifier=`precomputed` and `n_ouputs > 1`.""" + """Check that PCovC works if classifier=`precomputed` and `n_outputs > 1`.""" classifier = MultiOutputClassifier(estimator=LogisticRegression()) Y_double = np.column_stack((self.Y, self.Y)) @@ -618,27 +617,25 @@ def test_precomputed_multioutput(self): self.assertTrue(np.linalg.norm(t3 - t1) < self.error_tol) def test_Z_shape_multioutput(self): - """Check that PCovC returns the evidence Z in the desired form when `n_ouputs > 1`.""" - pcovc = PCovC( - classifier=MultiOutputClassifier(LogisticRegression()), n_components=2 - ) + """Check that PCovC returns the evidence Z in the desired form when `n_outputs > 1`.""" + pcovc = PCovC() Y_double = np.column_stack((self.Y, self.Y)) pcovc.fit(self.X, Y_double) Z = pcovc.decision_function(self.X) - # list of (n_samples, n_classes) arrays + # list of (n_samples, n_classes) arrays when each column of Y is multiclass self.assertEqual(len(Z), Y_double.shape[1]) for est, z_slice in zip(pcovc.z_classifier_.estimators_, Z): with self.subTest(type="z_arrays"): - # each array is shape (n_samples, n_classes) + # each array is shape (n_samples, n_classes): self.assertEqual(self.X.shape[0], z_slice.shape[0]) self.assertEqual(est.coef_.shape[0], z_slice.shape[1]) def test_decision_function_multioutput(self): - """Check that PCovC's decision_function works in edge cases when `n_ouputs > 1`.""" + """Check 
that PCovC's decision_function works in edge cases when `n_outputs > 1`.""" pcovc = self.model(classifier=MultiOutputClassifier(estimator=LinearSVC())) pcovc.fit(self.X, np.column_stack((self.Y, self.Y))) with self.assertRaises(ValueError) as cm: