
Commit c534a68

Fixing errors with PCovC self.classifier_
1 parent 81ae690 commit c534a68

File tree

9 files changed (+128, -54 lines)

examples/pcovc/PCovC-BreastCancerDataset.ipynb

Lines changed: 3 additions & 3 deletions
@@ -25,7 +25,7 @@
     "\n",
     "import sys\n",
     "sys.path.append('../../')\n",
-    "from src.skmatter.decomposition._pcovc import PCovC\n",
+    "from src.skmatter.decomposition.pcovc_new import PCovC\n",
     "\n",
     "plt.rcParams[\"image.cmap\"] = \"tab10\"\n",
     "plt.rcParams['scatter.edgecolors'] = \"k\"\n",
@@ -216,7 +216,7 @@
    {
     "data": {
      "text/plain": [
-      "<matplotlib.legend.Legend at 0x110136e40>"
+      "<matplotlib.legend.Legend at 0x10fc7b620>"
      ]
     },
     "execution_count": 4,
@@ -264,7 +264,7 @@
    {
     "data": {
      "text/plain": [
-      "<matplotlib.collections.PathCollection at 0x1103cead0>"
+      "<matplotlib.collections.PathCollection at 0x10ff1afd0>"
      ]
     },
     "execution_count": 5,

src/skmatter/decomposition/_kernel_pcovc.py

Lines changed: 19 additions & 5 deletions
@@ -1,6 +1,7 @@
 import numbers
 
 import numpy as np
+import scipy.sparse as sp
 from scipy import linalg
 from scipy.sparse.linalg import svds
 from sklearn.decomposition._base import _BasePCA
@@ -249,7 +250,7 @@ def __init__(
         gamma="scale",
         degree=3,
         coef0=0.0,
-        kernel_params=None,
+        # kernel_params=None,
         center=False,
         fit_inverse_transform=False,
         tol=1e-12,
@@ -270,7 +271,7 @@ def __init__(
         self.gamma = gamma
         self.degree = degree
         self.coef0 = coef0
-        self.kernel_params = kernel_params
+        # self.kernel_params = kernel_params
 
         self.n_jobs = n_jobs
 
@@ -279,10 +280,23 @@ def __init__(
         self.classifier = classifier
 
     def _get_kernel(self, X, Y=None):
+        sparse = sp.issparse(X)
+
         if callable(self.kernel):
-            params = self.kernel_params or {}
+            params = {}  # self.kernel_params or {}
         else:
-            params = {"gamma": self.gamma, "degree": self.degree, "coef0": self.coef0}
+            if self.gamma == "scale":
+                X_var = (X.multiply(X)).mean() - (X.mean()) ** 2 if sparse else X.var()
+                self._gamma = 1.0 / (X.shape[1] * X_var) if X_var != 0 else 1.0
+            elif self.gamma == "auto":
+                self._gamma = 1.0 / X.shape[1]
+            else:
+                self._gamma = self.gamma
+            params = {"gamma": self._gamma, "degree": self.degree, "coef0": self.coef0}
+        print("Params")
+        print(params)
+
+
         return pairwise_kernels(
             X, Y, metric=self.kernel, filter_params=True, n_jobs=self.n_jobs, **params
         )
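For reference, the gamma resolution added above follows the sklearn.svm convention: "scale" maps to 1 / (n_features * X.var()) and "auto" to 1 / n_features. A minimal standalone sketch of that logic (the helper name resolve_gamma and the toy data are illustrative, not part of this commit):

import numpy as np
import scipy.sparse as sp

def resolve_gamma(gamma, X):
    # Resolve "scale" / "auto" / explicit-float gamma as in the diff above.
    if gamma == "scale":
        # sparse matrices have no .var(), so compute E[X^2] - E[X]^2 instead
        X_var = (X.multiply(X)).mean() - X.mean() ** 2 if sp.issparse(X) else X.var()
        return 1.0 / (X.shape[1] * X_var) if X_var != 0 else 1.0
    if gamma == "auto":
        return 1.0 / X.shape[1]
    return gamma  # an explicit float passes through unchanged

X = np.arange(12.0).reshape(4, 3)
print(resolve_gamma("scale", X), resolve_gamma("auto", X), resolve_gamma(0.5, X))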
@@ -435,7 +449,7 @@ def fit(self, X, y, W=None):
         z_classifier_ = check_krr_fit(classifier, K, X, y)
         '''
         z_classifier_ = check_krr_fit(classifier, K, X, y)  # Pkz as weights
-
+        print(z_classifier_)
         W = z_classifier_.dual_coef_.reshape(self.n_samples_in_, -1)  # Pkz
 
         # Use this instead of `self.classifier_.predict(K)`

src/skmatter/decomposition/_pcov.py

Lines changed: 30 additions & 7 deletions
@@ -1,3 +1,4 @@
+from copy import deepcopy
 import numbers
 import numpy as np
 import warnings
@@ -8,6 +9,7 @@
 from scipy.linalg import sqrtm as MatrixSqrt
 from scipy.sparse.linalg import svds
 
+from sklearn import clone
 from sklearn.base import check_X_y
 from sklearn.calibration import column_or_1d
 from sklearn.decomposition._base import _BasePCA
@@ -151,15 +153,15 @@ def fit(self, X, y, W=None):
            else:
                classifier = self.classifier
 
-            z_classifier_ = check_cl_fit(classifier, X, y=y)  # change to z classifier; fits a linear classifier on X and y to get Pxz
+            self.z_classifier_ = check_cl_fit(classifier, X, y=y)  # change to z classifier; fits a linear classifier on X and y to get Pxz
 
-            if isinstance(z_classifier_, MultiOutputClassifier):
-                W = np.hstack([est_.coef_.T for est_ in z_classifier_.estimators_])
+            if isinstance(self.z_classifier_, MultiOutputClassifier):
+                W = np.hstack([est_.coef_.T for est_ in self.z_classifier_.estimators_])
                Z = X @ W  # computes Z; basically Z = X @ Pxz
 
            else:
-                W = z_classifier_.coef_.T.reshape(X.shape[1], -1)
-                Z = z_classifier_.decision_function(X).reshape(X.shape[0], -1)  # computes Z
+                W = self.z_classifier_.coef_.T.reshape(X.shape[1], -1)
+                Z = self.z_classifier_.decision_function(X).reshape(X.shape[0], -1)  # computes Z; this will throw an error since pxz and ptz aren't defined yet
 
        else:
            Z = y.copy()
@@ -171,13 +173,34 @@ def fit(self, X, y, W=None):
        if not self._label_binarizer.y_type_.startswith("multilabel"):
            y = column_or_1d(y, warn=True)
 
-
        if self.space_ == "feature":
            self._fit_feature_space(X, Y.reshape(Z.shape), Z)
        else:
            self._fit_sample_space(X, Y.reshape(Z.shape), Z, W)
 
-        self.classifier_ = check_cl_fit(classifier, X @ self.pxt_, y=y)
+        # Instead of using the linear regression solution, refit with the classifier
+        # and steal its weights to get Ptz.
+        # This was failing because self.classifier is never changed from None if None is passed as the classifier.
+        # Change self.classifier to classifier and see what happens; if the classifier is precomputed there might be more errors, so be careful.
+        # If the classifier is precomputed, I don't think we need to check whether the classifier is fit or not?
+
+        # Cases:
+        # 1. If the classifier has been fit with X and Y already, we need to use a classifier that hasn't been fitted and refit it on T, y.
+        # 2. If the classifier has not been fit with X and Y, we call check_cl_fit.
+
+        # if (fitted(X, y)):
+        #
+        # else:
+        #     check_cl_fit
+
+        # self.classifier_ = check_cl_fit(classifier, X @ self.pxt_, y=y)
+        # We don't want to copy ALL parameters of the classifier, such as n_features_in_, since we are re-fitting it on T, y.
+        if self.classifier != "precomputed":
+            self.classifier_ = clone(classifier).fit(X @ self.pxt_, y)
+        else:
+            self.classifier_ = LogisticRegression().fit(X @ self.pxt_, y)
+
+        self.classifier_._validate_data(X @ self.pxt_, y, reset=False)
 
        # self.classifier_ = LogisticRegression().fit(X @ self.pxt_, y)
        # check_cl_fit(classifier, X @ self.pxt_, y=y)  # Has Ptz as weights
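The clone-and-refit line above relies on sklearn.clone copying hyperparameters while discarding fitted state (coef_, n_features_in_), which is exactly why refitting on the latent projection T is safe. A minimal sketch of the pattern (the random projector pxt_ below is a stand-in for the fitted X -> T projector, not the commit's actual one):

import numpy as np
from sklearn import clone
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 6))
y = (X[:, 0] + X[:, 1] > 0).astype(int)

clf = LogisticRegression().fit(X, y)  # fitted on X: coef_ has 6 columns
pxt_ = rng.normal(size=(6, 2))        # stand-in X -> T projector

# clone() keeps hyperparameters but drops coef_ / n_features_in_, so the
# refit sees the 2-dimensional latent space instead of the 6 original features
classifier_ = clone(clf).fit(X @ pxt_, y)
print(clf.coef_.shape, classifier_.coef_.shape)  # (1, 6) (1, 2)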

src/skmatter/decomposition/_pcovc.py

Lines changed: 1 addition & 1 deletion
@@ -438,7 +438,7 @@ def fit(self, X, y, W=None):
         #cases:
         #1. if classifier has been fit with X and Y already, we dont need to perform a check_cl_fit
         #2. if classifier has not been fit with X or Y, we dont need to
-        #3. if classifier has been fit with T and Y, we need to perform check_cl_fit (doesn't make sense actually, why would we fit with T and y)
+        #3. if classifier has been fit with T and Y, we need to perform check_cl_fit
 
         # old: self.classifier_ = check_cl_fit(self.classifier, X @ self.pxt_, y=y) #Has Ptz as weights
 
src/skmatter/decomposition/pcovc_new.py

Lines changed: 4 additions & 4 deletions
@@ -260,7 +260,7 @@ class likelihoods, :math:`{\mathbf{Z}}`.
         )
         return super().fit(X, Y, W)
 
-    def _fit_feature_space(self, X, Y, Yhat):
+    def _fit_feature_space(self, X, Y, Z):
         r"""In feature-space PCovC, the projectors are determined by:
 
         .. math::
@@ -282,9 +282,9 @@ def _fit_feature_space(self, X, Y, Yhat):
             \mathbf{U}_\mathbf{\tilde{C}}^T
             (\mathbf{X}^T \mathbf{X})^{\frac{1}{2}}
         """
-        return super()._fit_feature_space(X, Y, Yhat)
+        return super()._fit_feature_space(X, Y, Z)
 
-    def _fit_sample_space(self, X, Y, Yhat, W):
+    def _fit_sample_space(self, X, Y, Z, W):
         r"""In sample-space PCovC, the projectors are determined by:
 
         .. math::
@@ -303,7 +303,7 @@ def _fit_sample_space(self, X, Y, Yhat, W):
             \mathbf{P}_{TX} = \mathbf{\Lambda}_\mathbf{\tilde{K}}^{-\frac{1}{2}}
             \mathbf{U}_\mathbf{\tilde{K}}^T \mathbf{X}
         """
-        return super()._fit_sample_space(X, Y, Yhat, W)
+        return super()._fit_sample_space(X, Y, Z, W)
 
     def _decompose_truncated(self, mat):
         return super()._decompose_truncated(mat)

src/skmatter/decomposition/pcovr_new.py

Lines changed: 18 additions & 2 deletions
@@ -57,11 +57,12 @@ class PCovR(_BasePCov):
     mixing: float, default=0.5
         mixing parameter, as described in PCovR as :math:`{\alpha}`, here named to avoid
         confusion with regularization parameter `alpha`
+
     n_components : int, float or str, default=None
         Number of components to keep.
         if n_components is not set all components are kept::
-
             n_components == min(n_samples, n_features)
+
     svd_solver : {'auto', 'full', 'arpack', 'randomized'}, default='auto'
         If auto :
             The solver is selected by a default policy based on `X.shape` and
@@ -78,13 +79,16 @@ class PCovR(_BasePCov):
             min(X.shape)
         If randomized :
             run randomized SVD by the method of Halko et al.
+
     tol : float, default=1e-12
         Tolerance for singular values computed by svd_solver == 'arpack'. Must be of
         range [0.0, infinity).
+
     space: {'feature', 'sample', 'auto'}, default='auto'
         whether to compute the PCovR in `sample` or `feature` space default=`sample`
         when :math:`{n_{samples} < n_{features}}` and `feature` when
         :math:`{n_{features} < n_{samples}}`
+
     regressor: {`Ridge`, `RidgeCV`, `LinearRegression`, `precomputed`}, default=None
         regressor for computing approximated :math:`{\mathbf{\hat{Y}}}`. The regressor
         should be one `sklearn.linear_model.Ridge`, `sklearn.linear_model.RidgeCV`, or
@@ -98,42 +102,52 @@ class PCovR(_BasePCov):
         regressed form of the targets :math:`{\mathbf{\hat{Y}}}`. If None,
         ``sklearn.linear_model.Ridge('alpha':1e-6, 'fit_intercept':False, 'tol':1e-12)``
         is used as the regressor.
+
     iterated_power : int or 'auto', default='auto'
         Number of iterations for the power method computed by svd_solver ==
         'randomized'. Must be of range [0, infinity).
+
     random_state : int, :class:`numpy.random.RandomState` instance or None, default=None
         Used when the 'arpack' or 'randomized' solvers are used. Pass an int for
         reproducible results across multiple function calls.
+
     whiten : bool, deprecated
 
     Attributes
     ----------
     mixing: float, default=0.5
         mixing parameter, as described in PCovR as :math:`{\alpha}`
+
     tol: float, default=1e-12
         Tolerance for singular values computed by svd_solver == 'arpack'.
         Must be of range [0.0, infinity).
+
     space: {'feature', 'sample', 'auto'}, default='auto'
         whether to compute the PCovR in `sample` or `feature` space default=`sample`
         when :math:`{n_{samples} < n_{features}}` and `feature` when
        :math:`{n_{features} < n_{samples}}`
+
     n_components_ : int
         The estimated number of components, which equals the parameter n_components, or
         the lesser value of n_features and n_samples if n_components is None.
+
     pxt_ : numpy.ndarray of size :math:`({n_{samples}, n_{components}})`
         the projector, or weights, from the input space :math:`\mathbf{X}` to the
         latent-space projection :math:`\mathbf{T}`
+
     pty_ : numpy.ndarray of size :math:`({n_{components}, n_{properties}})`
         the projector, or weights, from the latent-space projection :math:`\mathbf{T}`
         to the properties :math:`\mathbf{Y}`
+
     pxy_ : numpy.ndarray of size :math:`({n_{samples}, n_{properties}})`
         the projector, or weights, from the input space :math:`\mathbf{X}` to the
         properties :math:`\mathbf{Y}`
+
     explained_variance_ : numpy.ndarray of shape (n_components,)
         The amount of variance explained by each of the selected components.
-
         Equal to n_components largest eigenvalues
         of the PCovR-modified covariance matrix of :math:`\mathbf{X}`.
+
     singular_values_ : numpy.ndarray of shape (n_components,)
         The singular values corresponding to each of the selected components.
 
@@ -195,6 +209,7 @@ def fit(self, X, Y, W=None):
             means and scaled. If features are related, the matrix should be scaled
             to have unit variance, otherwise :math:`\mathbf{X}` should be
             scaled so that each feature has a variance of 1 / n_features.
+
         Y : numpy.ndarray, shape (n_samples, n_properties)
             Training data, where n_samples is the number of samples and n_properties is
             the number of properties
@@ -206,6 +221,7 @@ def fit(self, X, Y, W=None):
 
             If the passed regressor = `precomputed`, it is assumed that Y is the
             regressed form of the properties, :math:`{\mathbf{\hat{Y}}}`.
+
         W : numpy.ndarray, shape (n_features, n_properties)
             Regression weights, optional when regressor=`precomputed`. If not
             passed, it is assumed that `W = np.linalg.lstsq(X, Y, self.tol)[0]`

src/skmatter/decomposition/playground.py

Lines changed: 32 additions & 12 deletions
@@ -1,5 +1,7 @@
 
+from sklearn.base import check_is_fitted
 from sklearn.discriminant_analysis import StandardScaler
+from sklearn.exceptions import NotFittedError
 from sklearn.kernel_ridge import KernelRidge
 from sklearn.linear_model import LogisticRegression, LinearRegression
 from sklearn.svm import SVC
@@ -9,32 +11,50 @@
 from sklearn.datasets import load_breast_cancer as get_dataset
 from sklearn.datasets import load_diabetes as get_dataset2
 from sklearn.metrics import accuracy_score
-from pcovr_new import PCovR
+from _kernel_pcovr import KernelPCovR
 
-X, Y = get_dataset2(return_X_y=True)
+X, Y = get_dataset(return_X_y=True)
 
+X_or = X
 scaler = StandardScaler()
 X = scaler.fit_transform(X)
 
+
+pcovc = PCovC(mixing=0.0, classifier=LogisticRegression(), n_components=2)
+pcovc.fit(X,Y)
+T = pcovc.transform(X)
+
+pcovc2 = PCovC(mixing=0.0, classifier=LogisticRegression(), n_components=2)
+pcovc2.classifier.fit(X, Y)
+print(pcovc2.classifier.coef_.shape)
+pcovc2.classifier.fit(T, Y)
+print(pcovc2.classifier.coef_.shape)
+
+
+
+
+
 # model = PCovR(mixing=0.5, regressor=LinearRegression())
 # model.fit(X,Y)
 # print(isinstance(model, PCovR))
 
-import numpy as np
+# import numpy as np
 
-X = np.array([[-1, 0, -2, 3], [3, -2, 0, 1], [-3, 0, -1, -1], [1, 3, 0, -2]])
-Y = np.array([[0], [1], [2], [0]])
-
-pcovc = PCovC(mixing=0.1, n_components=2)
-pcovc.fit(X, Y)
-T= pcovc.transform(X)
-print(T)
+# X = np.array([[-1, 0, -2, 3], [3, -2, 0, 1], [-3, 0, -1, -1], [1, 3, 0, -2]])
+# Y = np.array([[0], [1], [2], [0]])
+
+# print("AA23")
+# print(Y.shape)
+# pcovc = PCovC(mixing=0.1, n_components=2)
+# pcovc.fit(X, Y)
+# T= pcovc.transform(X)
+# print(T)
 # array([[ 3.2630561 , 0.06663787],
 #        [-2.69395511, -0.41582771],
 #        [ 3.48683147, -0.83164387],
 #        [-4.05593245, 1.18083371]])
-Y = pcovc.predict(X)
-print(Y.shape)
+# Y = pcovc.predict(X)
+# print(Y.shape)
 # array([[ 0.01371776, -5.00945512],
 #        [-1.02805338, 1.06736871],
 #        [ 0.98166504, -4.98307078],
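The check_is_fitted / NotFittedError imports added above suggest a fitted-state check for the cases listed in _pcovc.py. One possible sketch (the helper is_fitted is hypothetical, and note that check_is_fitted canonically lives in sklearn.utils.validation):

from sklearn.exceptions import NotFittedError
from sklearn.linear_model import LogisticRegression
from sklearn.utils.validation import check_is_fitted

def is_fitted(estimator):
    try:
        check_is_fitted(estimator)  # raises NotFittedError if nothing is fitted
        return True
    except NotFittedError:
        return False

clf = LogisticRegression()
print(is_fitted(clf))            # False: no coef_ yet
clf.fit([[0.0], [1.0]], [0, 1])
print(is_fitted(clf))            # True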

src/skmatter/utils/_pcovc_utils.py

Lines changed: 5 additions & 5 deletions
@@ -11,8 +11,7 @@ def check_cl_fit(classifier, X, y):
 
     # Check compatibility with X
     fitted_classifier._validate_data(X, y, reset=False, multi_output=True)
-    print("X shape "+str(X.shape))
-    print("y shape " + str(y.shape))
+
     # Check compatibility with y
 
     # changed from if fitted_classifier.coef_.ndim != y.ndim:
@@ -22,10 +21,11 @@ def check_cl_fit(classifier, X, y):
         raise ValueError(
             "The classifier coefficients have a shape incompatible "
             "with the supplied feature space. "
-            "The coefficients have shape %d and the features "
-            "have shape %d" % (fitted_classifier.coef_.shape, X.shape)
+            "The coefficients have shape %r and the features "
+            "have shape %r" % (fitted_classifier.coef_.shape, X.shape)
         )
-    # LogisticRegression does not support multioutput, but RidgeClassifier does
+    # LogisticRegression does not support multioutput, but RidgeClassifier does.
+    # We need to check this...
     elif y.ndim == 2:
         if fitted_classifier.coef_.shape[0] != y.shape[1]:
             raise ValueError(
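The %d to %r change above matters because %d expects a number and raises TypeError when handed a shape tuple, whereas %r renders the tuple's repr. A quick illustration with made-up shapes:

# %d cannot format a tuple; %r prints its repr, which is what the error message needs
coef_shape, X_shape = (1, 30), (569, 30)

print("coefficients %r, features %r" % (coef_shape, X_shape))
# -> coefficients (1, 30), features (569, 30)

try:
    print("coefficients %d" % (coef_shape,))
except TypeError as err:
    print(err)  # e.g. "%d format: a real number is required, not tuple"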

0 commit comments