Merge pull request #76 from Quantmetry/softimpute

JulienRoussel77 · web-flow · commit 8c227dcccb88 · 2023-10-09T18:46:10.000+02:00
INIT: softimpute
diff --git a/qolmat/imputations/imputers.py b/qolmat/imputations/imputers.py
@@ -18,6 +18,7 @@
 
 from qolmat.imputations import em_sampler
 from qolmat.imputations.rpca import rpca, rpca_noisy, rpca_pcp
+from qolmat.imputations import softimpute
 from qolmat.utils.exceptions import NotDataFrame
 from qolmat.utils.utils import HyperValue
 
@@ -1772,6 +1773,120 @@ def _transform_element(
         return df_imputed
 
 
+class ImputerSoftImpute(_Imputer):
+    """
+    This class implements an imputation method based on the SoftImpute algorithm. It wraps
+    `softimpute.SoftImpute`: the model is fitted on the values of the dataframe and then used
+    to impute them.
+
+    Parameters
+    ----------
+    groups : Tuple[str, ...]
+        Forwarded to `_Imputer`. The model is fitted and applied at the group level depending
+        on this value, by default ()
+    columnwise : bool
+        Forwarded to `_Imputer`, by default False.
+        NOTE(review): `_fit_element` and `_transform_element` both assert `col == "__all__"`,
+        so column-by-column imputation is presumably not supported - to be confirmed.
+    random_state : Union[None, int, np.random.RandomState], optional
+        Forwarded to `_Imputer`. The underlying model receives `self._rng` as random state
+        (presumably derived from this value by `_Imputer` - to be confirmed), by default None
+    period : int
+        Period of the time series, used when a 1D array is reshaped into a 2D array,
+        by default 1
+    rank : int
+        Estimated rank of the matrix, by default 2
+    tolerance : float
+        Tolerance for the convergence criterion, by default 1e-05
+    tau : float
+        Regularisation parameter, by default 0
+    max_iterations : int
+        Maximum number of iterations, by default 100
+    verbose : bool
+        Flag for verbosity, by default False
+    projected : bool
+        If True, only imputed values are changed.
+        If False, the matrix obtained via the algorithm is returned, by default True
+    """
+
+    def __init__(
+        self,
+        groups: Tuple[str, ...] = (),
+        columnwise: bool = False,
+        random_state: Union[None, int, np.random.RandomState] = None,
+        period: int = 1,
+        rank: int = 2,
+        tolerance: float = 1e-05,
+        tau: float = 0,
+        max_iterations: int = 100,
+        verbose: bool = False,
+        projected: bool = True,
+    ):
+        # The names listed in `imputer_params` are the model hyperparameters; they are
+        # presumably the ones returned by `get_hyperparams()` in `_fit_element`.
+        super().__init__(
+            imputer_params=(
+                "period",
+                "rank",
+                "tolerance",
+                "tau",
+                "max_iterations",
+                "verbose",
+                "projected",
+            ),
+            groups=groups,
+            columnwise=columnwise,
+            random_state=random_state,
+        )
+        # Every hyperparameter is stored unchanged under the name of its argument.
+        self.period = period
+        self.rank = rank
+        self.tolerance = tolerance
+        self.tau = tau
+        self.max_iterations = max_iterations
+        self.verbose = verbose
+        self.projected = projected
+
+    def _fit_element(
+        self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0
+    ) -> softimpute.SoftImpute:
+        """
+        Fits the imputer on `df`, at the group level depending on self.groups.
+        Only the case `col == "__all__"` (all the columns at once) is accepted.
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            Dataframe on which the imputer is fitted
+        col : str, optional
+            Column on which the imputer is fitted, by default "__all__"
+        ngroup : int, optional
+            Id of the group on which the method is applied
+
+        Returns
+        -------
+        softimpute.SoftImpute
+            Return fitted SoftImpute model
+
+        Raises
+        ------
+        NotDataFrame
+            Input has to be a pandas.DataFrame.
+        AssertionError
+            If `col` is not "__all__".
+        """
+        self._check_dataframe(df)
+        assert col == "__all__"
+        hyperparams = self.get_hyperparams()
+        # The model draws its random numbers from the generator held by the imputer.
+        model = softimpute.SoftImpute(random_state=self._rng, **hyperparams)
+        model = model.fit(df.values)
+        return model
+
+    def _transform_element(
+        self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0
+    ) -> pd.DataFrame:
+        """
+        Transforms the dataframe `df`, at the group level depending on
+        self.groups, with the model previously fitted for the group `ngroup`.
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            Dataframe or column to impute
+        col : str, optional
+            Column transformed by the imputer, by default "__all__"
+        ngroup : int, optional
+            Id of the group on which the method is applied
+
+        Returns
+        -------
+        pd.DataFrame
+            Imputed dataframe, with the same index and columns as `df`
+
+        Raises
+        ------
+        NotDataFrame
+            Input has to be a pandas.DataFrame.
+        AssertionError
+            If `col` is not "__all__".
+        """
+        self._check_dataframe(df)
+        assert col == "__all__"
+        # Fitted models are looked up under the "__all__" key, indexed by group id.
+        model = self._dict_fitting["__all__"][ngroup]
+        X_imputed = model.transform(df.values)
+        return pd.DataFrame(X_imputed, index=df.index, columns=df.columns)
+
+    def _more_tags(self):
+        """
+        Returns extra estimator tags: `check_fit2d_1sample` and `check_fit2d_1feature`
+        are listed under "_xfail_checks" (presumably read by the scikit-learn estimator
+        checks to mark them as expected failures - to be confirmed).
+        """
+        return {
+            "_xfail_checks": {
+                "check_fit2d_1sample": "This test shouldn't be running at all!",
+                "check_fit2d_1feature": "This test shouldn't be running at all!",
+            },
+        }
+
+
 class ImputerEM(_Imputer):
     """
     This class implements an imputation method based on joint modelling and an inference using a
@@ -1873,7 +1988,7 @@ def get_model(self, **hyperparams) -> em_sampler.EM:
 
     def _fit_element(
         self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0
-    ) -> IterativeImputer:
+    ) -> em_sampler.EM:
         """
         Fits the imputer on `df`, at the group and/or column level depending onself.groups and
         self.columnwise.
@@ -1890,7 +2005,7 @@ def _fit_element(
         Returns
         -------
         Any
-            Return fitted KNN model
+            Return fitted EM model
 
         Raises
         ------
diff --git a/qolmat/imputations/softimpute.py b/qolmat/imputations/softimpute.py
@@ -0,0 +1,217 @@
+from __future__ import annotations
+
+from typing import Optional, Union
+
+import numpy as np
+from numpy.typing import NDArray
+from sklearn import utils as sku
+from sklearn.base import BaseEstimator, TransformerMixin
+
+from qolmat.utils import utils
+from qolmat.imputations.rpca import rpca_utils
+
+
+class SoftImpute(BaseEstimator, TransformerMixin):
+    """
+    This class implements the SoftImpute ALS algorithm presented in
+    Hastie, Trevor, et al. "Matrix completion and low-rank SVD
+    via fast alternating least squares." The Journal of Machine Learning
+    Research 16.1 (2015): 3367-3402.
+    min_A,B || Proj(X - AB')||_F^2 + tau * (|| A ||_F^2 + || B ||_F^2)
+
+    Parameters
+    ----------
+    period : int
+        Number of rows of the array if the array is 1D and
+        reshaped into a 2D array. Corresponds to the period of the time series,
+        if 1D time series is passed.
+    rank : int, optional
+        Estimated rank of the matrix. If None, the rank is estimated in `fit` with
+        `rpca_utils.approx_rank` on the data filled with the column means.
+    tolerance : float
+        Tolerance for the convergence criterion
+    tau : float
+        Regularisation parameter
+    max_iterations : int
+        Maximum number of iterations
+    random_state : int, RandomState instance or None, optional
+        The seed of the pseudo random number generator to use, for reproducibility
+    verbose : bool
+        Flag for verbosity
+    projected : bool
+        If True, only imputed values are changed.
+        If False, the matrix obtained via the algorithm is returned, by default True
+
+    Attributes
+    ----------
+    u : NDArray
+        Left factor computed by `fit` (empty array before fitting)
+    d : NDArray
+        Column vector of singular values computed by `fit` (empty array before fitting)
+    v : NDArray
+        Right factor computed by `fit` (empty array before fitting)
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from qolmat.imputations.softimpute import SoftImpute
+    >>> X = np.array([[1, 2, np.nan, 4], [1, 5, 3, np.nan], [4, 2, 3, 2], [1, 1, 5, 4]])
+    >>> X_imputed = SoftImpute().fit_transform(X)
+    >>> print(X_imputed)  # doctest: +SKIP
+    """
+
+    def __init__(
+        self,
+        period: int = 1,
+        rank: Optional[int] = 2,
+        tolerance: float = 1e-05,
+        tau: float = 0,
+        max_iterations: int = 100,
+        random_state: Union[None, int, np.random.RandomState] = None,
+        verbose: bool = False,
+        projected: bool = True,
+    ):
+        self.period = period
+        self.rank = rank
+        self.tolerance = tolerance
+        self.tau = tau
+        self.max_iterations = max_iterations
+        # The seed is converted into a RandomState instance as soon as the object is built.
+        self.random_state = sku.check_random_state(random_state)
+        self.verbose = verbose
+        self.projected = projected
+        # Factors of the low-rank approximation, filled in by `fit`.
+        self.u: NDArray = np.empty(0)
+        self.d: NDArray = np.empty(0)
+        self.v: NDArray = np.empty(0)
+
+    def fit(self, X: NDArray, y=None) -> SoftImpute:
+        """Fit the imputer on X.
+
+        The missing entries are first filled with the column means, then the factors
+        U, Dsq and V are updated alternately until the relative change returned by
+        `_check_convergence` falls below `tolerance`, or `max_iterations` is reached.
+        The input array is not modified: the computations are made on a copy.
+
+        Parameters
+        ----------
+        X : NDArray
+            Input data
+
+        y : Ignored
+            Not used, present here for API consistency by convention.
+
+        Returns
+        -------
+        self : object
+            The fitted `SoftImpute` class instance.
+
+        Raises
+        ------
+        AssertionError
+            If the prepared data is not a numpy array.
+        """
+        X_imputed = X.copy()
+        X_imputed = utils.prepare_data(X_imputed, self.period)
+
+        if not isinstance(X_imputed, np.ndarray):
+            raise AssertionError("Invalid type. X must be a NDArray.")
+
+        n, m = X_imputed.shape
+        mask = np.isnan(X_imputed)
+
+        # Initial guess: each missing entry is replaced by the mean of its column.
+        col_means = np.nanmean(X_imputed, axis=0)
+        np.copyto(X_imputed, col_means, where=mask)
+
+        # The rank has to be known before the factors are allocated. It is kept in a
+        # local variable so that the hyperparameter `self.rank` is left untouched.
+        rank = self.rank
+        if rank is None:
+            rank = rpca_utils.approx_rank(X_imputed)
+
+        V = np.zeros((m, rank))
+        # U is initialised with orthonormal columns spanning a random subspace.
+        U = self.random_state.normal(0.0, 1.0, (n, rank))
+        U, _, _ = np.linalg.svd(U, full_matrices=False)
+        Dsq = np.ones((rank, 1))
+
+        for iter_ in range(self.max_iterations):
+            U_old = U
+            V_old = V
+            Dsq_old = Dsq
+
+            # B step: U is fixed, V and Dsq are updated.
+            B = U.T @ X_imputed
+            if self.tau > 0:
+                tmp = Dsq / (Dsq + self.tau)
+                B = B * tmp
+            Bsvd = np.linalg.svd(B.T, full_matrices=False)
+            V = Bsvd[0]
+            Dsq = Bsvd[1][:, np.newaxis]
+            # np.linalg.svd returns the transposed right factor (vh): it is transposed
+            # back so that U @ (Dsq * V.T) is equal to (old U) @ B.
+            U = U @ Bsvd[2].T
+            tmp = Dsq * V.T
+            X_hat = U @ tmp
+            # Only the entries that were missing are refreshed with the current estimate.
+            X_imputed[mask] = X_hat[mask]
+
+            # A step: V is fixed, U and Dsq are updated.
+            A = (X_imputed @ V).T
+            if self.tau > 0:
+                tmp = Dsq / (Dsq + self.tau)
+                A = A * tmp
+            Asvd = np.linalg.svd(A.T, full_matrices=False)
+            U = Asvd[0]
+            Dsq = Asvd[1][:, np.newaxis]
+            # Same remark as above: the right factor is returned transposed.
+            V = V @ Asvd[2].T
+            tmp = Dsq * V.T
+            X_hat = U @ tmp
+            X_imputed[mask] = X_hat[mask]
+
+            ratio = self._check_convergence(U_old, Dsq_old, V_old, U, Dsq, V)
+            if self.verbose:
+                print(f"iter {iter_}: ratio = {round(ratio, 4)}")
+            if ratio < self.tolerance:
+                break
+
+        self.u = U[:, :rank]
+        self.d = Dsq[:rank]
+        self.v = V[:, :rank]
+
+        return self
+
+    def _check_convergence(
+        self,
+        U_old: NDArray,
+        Ds_qold: NDArray,
+        V_old: NDArray,
+        U: NDArray,
+        Dsq: NDArray,
+        V: NDArray,
+    ) -> float:
+        """Given a pair of iterates (U_old, Ds_qold, V_old) and (U, Dsq, V),
+        it computes the relative change in Frobenius norm given by
+        || U_old @ Ds_qold @ V_old.T - U @ Dsq @ V.T ||_F^2
+        / || U_old @ Ds_qold @ V_old.T ||_F^2
+
+        The norms are computed from the singular values only, which assumes that the
+        columns of the U and V matrices are orthonormal.
+
+        Parameters
+        ----------
+        U_old : NDArray
+            previous matrix U
+        Ds_qold : NDArray
+            previous matrix Dsq
+        V_old : NDArray
+            previous matrix V
+        U : NDArray
+            current matrix U
+        Dsq : NDArray
+            current matrix Dsq
+        V : NDArray
+            current matrix V
+
+        Returns
+        -------
+        float
+            relative change
+
+        Raises
+        ------
+        ValueError
+            If one of the arguments is None.
+        """
+        if any(arg is None for arg in (U_old, Ds_qold, V_old, U, Dsq, V)):
+            raise ValueError("One or more arguments are None.")
+
+        # Squared norm of the previous iterate.
+        denom = (Ds_qold**2).sum()
+        # Cross term: trace of (Dsq U' U_old)(Ds_qold V_old' V).
+        utu = Dsq * (U.T @ U_old)
+        vtv = Ds_qold * (V_old.T @ V)
+        uvprod = (utu @ vtv).diagonal().sum()
+        # ||old - new||^2 = ||old||^2 + ||new||^2 - 2 <old, new>
+        num = denom + (Dsq**2).sum() - 2 * uvprod
+        # The denominator is floored to avoid a division by zero.
+        return num / max(denom, 1e-9)
+
+    def transform(self, X: NDArray) -> NDArray:
+        """Impute all missing values in X.
+
+        The result is built from the factors computed by `fit`, so X is expected to
+        have the same shape as the data used for fitting. When `projected` is True,
+        the observed (non-NaN) entries of X are copied back into the result.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The input data to complete.
+
+        Returns
+        -------
+        X : NDArray
+            The imputed dataset.
+
+        Raises
+        ------
+        AssertionError
+            If the result still contains at least one NaN.
+        """
+        X_transformed = self.u @ np.diag(self.d.T[0]) @ (self.v).T
+        if self.projected:
+            X_ = utils.prepare_data(X, self.period)
+            mask = np.isnan(X_)
+            X_transformed[~mask] = X_[~mask]
+
+        X_transformed = utils.get_shape_original(X_transformed, X.shape)
+
+        # A single remaining NaN is enough to make the imputation invalid.
+        if np.any(np.isnan(X_transformed)):
+            raise AssertionError("Result contains NaN. This is a bug.")
+
+        return X_transformed
diff --git a/tests/imputations/test_imputers.py b/tests/imputations/test_imputers.py
@@ -275,18 +275,31 @@ def test_ImputerRPCA_fit_transform(df: pd.DataFrame) -> None:
     np.testing.assert_allclose(result, expected, atol=1e-2)
 
 
+@pytest.mark.parametrize("df", [df_incomplete])
+def test_ImputerSoftImpute_fit_transform(df: pd.DataFrame) -> None:
+    """Compare the output of ImputerSoftImpute.fit_transform with reference values."""
+    df_expected = pd.DataFrame(
+        {
+            "col1": [0, 1.327, 2, 3, 0.137],
+            "col2": [-1, 0.099, 0.5, 0.122, 1.5],
+        }
+    )
+    model = imputers.ImputerSoftImpute(
+        random_state=4, tau=0.3, max_iterations=100, columnwise=False
+    )
+    df_result = model.fit_transform(df)
+    # The comparison is made up to an absolute tolerance of 1e-2.
+    np.testing.assert_allclose(df_result, df_expected, atol=1e-2)
+
+
 @pytest.mark.parametrize("df", [df_timeseries])
 def test_ImputerEM_fit_transform(df: pd.DataFrame) -> None:
+    # Fits ImputerEM (method "sample", fixed random_state) on the dataframe and compares
+    # the imputed result with hard-coded reference values, up to an absolute tolerance of 1e-2.
     imputer = imputers.ImputerEM(method="sample", dt=1e-3, random_state=42)
     result = imputer.fit_transform(df)
-    print(result)
     expected = pd.DataFrame(
         {
             "col1": [i for i in range(20)],
             "col2": [0, 0.773, 2, 2.621, 2] + [i for i in range(5, 20)],
         }
     )
-    print(result)
     np.testing.assert_allclose(result, expected, atol=1e-2)
 
 
diff --git a/tests/imputations/test_softimpute.py b/tests/imputations/test_softimpute.py