|
| 1 | +from __future__ import annotations |
| 2 | + |
| 3 | +from typing import Optional, Union |
| 4 | + |
| 5 | +import numpy as np |
| 6 | +from numpy.typing import NDArray |
| 7 | +from sklearn import utils as sku |
| 8 | +from sklearn.base import BaseEstimator, TransformerMixin |
| 9 | + |
| 10 | +from qolmat.utils import utils |
| 11 | +from qolmat.imputations.rpca import rpca_utils |
| 12 | + |
| 13 | + |
class SoftImpute(BaseEstimator, TransformerMixin):
    """
    This class implements the SoftImpute ALS algorithm presented in
    Hastie, Trevor, et al. "Matrix completion and low-rank SVD
    via fast alternating least squares." The Journal of Machine Learning
    Research 16.1 (2015): 3367-3402.
    min_A,B || Proj(X - AB')||_F^2 + tau * (|| A ||_F^2 + || B ||_F^2)

    Parameters
    ----------
    period : int
        Number of rows of the array if the array is 1D and
        reshaped into a 2D array. Corresponds to the period of the time series,
        if 1D time series is passed.
    rank : Optional[int]
        Estimated rank of the matrix. If None, the rank is estimated at fit
        time from the (mean-filled) data via ``rpca_utils.approx_rank``.
    tolerance : float
        Tolerance for the convergence criterion
    tau : float
        regularisation parameter
    max_iterations : int
        Maximum number of iterations
    random_state : int, optional
        The seed of the pseudo random number generator to use, for reproducibility
    verbose : bool
        flag for verbosity
    projected : bool
        If True, only imputed values are changed.
        If False, the matrix obtained via the algorithm is returned, by default True

    Examples
    --------
    >>> import numpy as np
    >>> from qolmat.imputations.softimpute import SoftImpute
    >>> X = np.array([[1, 2, np.nan, 4], [1, 5, 3, np.nan], [4, 2, 3, 2], [1, 1, 5, 4]])
    >>> X_imputed = SoftImpute().fit_transform(X)
    >>> print(X_imputed)
    """

    def __init__(
        self,
        period: int = 1,
        rank: Optional[int] = 2,
        tolerance: float = 1e-05,
        tau: float = 0,
        max_iterations: int = 100,
        random_state: Union[None, int, np.random.RandomState] = None,
        verbose: bool = False,
        projected: bool = True,
    ):
        self.period = period
        self.rank = rank
        self.tolerance = tolerance
        self.tau = tau
        self.max_iterations = max_iterations
        # Normalised once here so `fit` can draw from it directly.
        self.random_state = sku.check_random_state(random_state)
        self.verbose = verbose
        self.projected = projected
        # Factors of the fitted low-rank model X ≈ u @ diag(d) @ v.T,
        # populated by `fit`.
        self.u: NDArray = np.empty(0)
        self.d: NDArray = np.empty(0)
        self.v: NDArray = np.empty(0)

    def fit(self, X: NDArray, y=None) -> SoftImpute:
        """Fit the imputer on X.

        Runs the alternating least squares iterations of SoftImpute-ALS:
        missing entries are initialised with column means, then the low-rank
        factors (U, Dsq, V) are refined until the relative change between
        iterates falls below `tolerance` or `max_iterations` is reached.

        Parameters
        ----------
        X : NDArray
            Input data

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            The fitted `SoftImpute` class instance.
        """
        X_imputed = X.copy()
        X_imputed = utils.prepare_data(X_imputed, self.period)

        if not isinstance(X_imputed, np.ndarray):
            raise AssertionError("Invalid type. X must be a NDArray.")

        n, m = X_imputed.shape
        mask = np.isnan(X_imputed)
        # Warm start: fill missing entries with column means before iterating.
        col_means = np.nanmean(X_imputed, axis=0)
        np.copyto(X_imputed, col_means, where=mask)
        # BUGFIX: the rank fallback must run before the factors are sized with
        # it; previously `np.zeros((m, None))` crashed when rank was None.
        if self.rank is None:
            self.rank = rpca_utils.approx_rank(X_imputed)
        V = np.zeros((m, self.rank))
        U = self.random_state.normal(0.0, 1.0, (n, self.rank))
        # Orthonormalise the random initial U.
        U, _, _ = np.linalg.svd(U, full_matrices=False)
        Dsq = np.ones((self.rank, 1))
        for iter_ in range(self.max_iterations):
            U_old = U
            V_old = V
            Dsq_old = Dsq

            # --- B-step: solve for V (ridge-regularised when tau > 0) ---
            B = U.T @ X_imputed
            if self.tau > 0:
                tmp = Dsq / (Dsq + self.tau)
                B = B * tmp
            Bsvd = np.linalg.svd(B.T, full_matrices=False)
            V = Bsvd[0]
            Dsq = Bsvd[1][:, np.newaxis]
            U = U @ Bsvd[2]
            tmp = Dsq * V.T
            X_hat = U @ tmp
            # Refresh only the originally-missing entries.
            X_imputed[mask] = X_hat[mask]

            # --- A-step: solve for U symmetrically ---
            A = (X_imputed @ V).T
            if self.tau > 0:
                tmp = Dsq / (Dsq + self.tau)
                A = A * tmp
            Asvd = np.linalg.svd(A.T, full_matrices=False)
            U = Asvd[0]
            Dsq = Asvd[1][:, np.newaxis]
            V = V @ Asvd[2]
            tmp = Dsq * V.T
            X_hat = U @ tmp
            X_imputed[mask] = X_hat[mask]

            ratio = self._check_convergence(U_old, Dsq_old, V_old, U, Dsq, V)
            if self.verbose:
                print(f"iter {iter_}: ratio = {round(ratio, 4)}")
            if ratio < self.tolerance:
                break

        self.u = U[:, : self.rank]
        self.d = Dsq[: self.rank]
        self.v = V[:, : self.rank]

        return self

    def _check_convergence(
        self,
        U_old: NDArray,
        Ds_qold: NDArray,
        V_old: NDArray,
        U: NDArray,
        Dsq: NDArray,
        V: NDArray,
    ) -> float:
        """Given a pair of iterates (U_old, Ds_qold, V_old) and (U, Dsq, V),
        it computes the relative change in Frobenius norm given by
        || U_old @ Dsq_old @ V_old.T - U @ Dsq @ V.T ||_F^2
        / || U_old @ Ds_qold @ V_old.T ||_F^2

        Parameters
        ----------
        U_old : NDArray
            previous matrix U
        Ds_qold : NDArray
            previous matrix Dsq
        V_old : NDArray
            previous matrix V
        U : NDArray
            current matrix U
        Dsq : NDArray
            current matrix Dsq
        V : NDArray
            current matrix V

        Returns
        -------
        float
            relative change
        """
        if any(arg is None for arg in (U_old, Ds_qold, V_old, U, Dsq, V)):
            raise ValueError("One or more arguments are None.")

        # ||A_old - A||_F^2 = ||A_old||^2 + ||A||^2 - 2 <A_old, A>,
        # evaluated via the singular values since U, V are orthonormal.
        denom = (Ds_qold**2).sum()
        utu = Dsq * (U.T @ U_old)
        vtv = Ds_qold * (V_old.T @ V)
        uvprod = (utu @ vtv).diagonal().sum()
        # BUGFIX: the second term is the squared norm of the *current* iterate
        # (Dsq), not the old one; the old code summed Ds_qold twice.
        num = denom + (Dsq**2).sum() - 2 * uvprod
        # Guard against a zero denominator (all-zero previous iterate).
        return num / max(denom, 1e-9)

    def transform(self, X: NDArray) -> NDArray:
        """Impute all missing values in X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input data to complete.

        Returns
        -------
        X : NDArray
            The imputed dataset.

        Raises
        ------
        AssertionError
            If the reconstructed array still contains NaN values.
        """
        # Reconstruct the low-rank approximation from the fitted factors.
        X_transformed = self.u @ np.diag(self.d.T[0]) @ (self.v).T
        if self.projected:
            # Keep the observed entries untouched; only fill the NaN holes.
            X_ = utils.prepare_data(X, self.period)
            mask = np.isnan(X_)
            X_transformed[~mask] = X_[~mask]

        X_transformed = utils.get_shape_original(X_transformed, X.shape)

        # BUGFIX: `np.all` only fired when *every* entry was NaN, contradicting
        # the error message; any remaining NaN indicates a bug.
        if np.any(np.isnan(X_transformed)):
            raise AssertionError("Result contains NaN. This is a bug.")

        return X_transformed
0 commit comments