ENH: test imputers and fixture

Hông-Lan Botterman · Hông-Lan Botterman · commit 47bb77039b25 · 2023-07-07T11:15:30.000+02:00
diff --git a/qolmat/imputations/rpca/rpca_noisy.py b/qolmat/imputations/rpca/rpca_noisy.py
@@ -12,54 +12,6 @@
 from qolmat.utils.exceptions import CostFunctionRPCANotMinimized
 
 
-def _check_cost_function_minimized(
-    observations: NDArray,
-    low_rank: NDArray,
-    anomalies: NDArray,
-    tau: float,
-    lam: float,
-    norm: str,
-):
-    """Check that the functional minimized by the RPCA
-    is smaller at the end than at the beginning
-
-    Parameters
-    ----------
-    observations : NDArray
-        observations matrix with first linear interpolation
-    low_rank : NDArray
-        low_rank matrix resulting from RPCA
-    anomalies : NDArray
-        sparse matrix resulting from RPCA
-    tau : float
-        parameter penalizing the nuclear norm of the low rank part
-    lam : float
-        parameter penalizing the L1-norm of the anomaly/sparse part
-    norm : str
-        norm of the temporal penalisation. Has to be `L1` or `L2`
-
-    Raises
-    ------
-    CostFunctionRPCANotMinimized
-        The RPCA does not minimized the cost function:
-        the starting cost is at least equal to the final one.
-    """
-    value_start = tau * np.linalg.norm(observations, "nuc")
-    if norm == "L1":
-        anomalies_norm = np.sum(np.abs(anomalies))
-        function_str = "||D-M-A||_2 + tau ||D||_* + lam ||A||_1"
-    elif norm == "L2":
-        anomalies_norm = np.sum(anomalies**2)
-        function_str = "||D-M-A||_2 + tau ||D||_* + lam ||A||_2"
-    value_end = (
-        np.sum((observations - low_rank - anomalies) ** 2)
-        + tau * np.linalg.norm(low_rank, "nuc")
-        + lam * anomalies_norm
-    )
-    if value_start + 1e-4 < value_end:
-        raise CostFunctionRPCANotMinimized(function_str, value_start, value_end)
-
-
 class RPCANoisy(RPCA):
     """
     This class implements a noisy version of the so-called 'improved RPCA'
@@ -423,12 +375,54 @@ def decompose_rpca(self, D: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]:
         elif self.norm == "L2":
             M, A, U, V = self.decompose_rpca_L2(D, Omega, lam, tau, rank)
 
-        print("D")
-        print(D)
-        print("M")
-        print(M)
-        print("A")
-        print(A)
-        _check_cost_function_minimized(D, M, A, tau, lam, self.norm)
+        self._check_cost_function_minimized(D, M, A, tau, lam, self.norm)
 
         return M, A
+
+    @staticmethod
+    def _check_cost_function_minimized(
+        observations: NDArray,
+        low_rank: NDArray,
+        anomalies: NDArray,
+        tau: float,
+        lam: float,
+        norm: str,
+    ):
+        """Check that the RPCA cost function has not increased
+        between its starting value and its final value.
+
+        Parameters
+        ----------
+        observations : NDArray
+            observations matrix D (with a first linear interpolation applied)
+        low_rank : NDArray
+            low-rank matrix M resulting from the RPCA
+        anomalies : NDArray
+            anomaly (sparse) matrix A resulting from the RPCA
+        tau : float
+            weight of the nuclear-norm term (positionally before `lam`)
+        lam : float
+            weight of the anomaly penalty selected by `norm`
+        norm : str
+            anomaly penalty used here: `L1` -> sum(|A|), `L2` -> sum(A**2)
+
+        Raises
+        ------
+        CostFunctionRPCANotMinimized
+            If the final cost exceeds the starting cost by 1e-4 or more,
+            which means the RPCA has not minimized the cost function.
+        """
+        value_start = tau * np.linalg.norm(observations, "nuc")  # cost at M = D, A = 0
+        if norm == "L1":
+            anomalies_norm = np.sum(np.abs(anomalies))
+            function_str = "||D-M-A||_2 + tau ||D||_* + lam ||A||_1"
+        elif norm == "L2":  # NOTE(review): no else; other values leave both names unbound
+            anomalies_norm = np.sum(anomalies**2)  # sum of squares, no square root
+            function_str = "||D-M-A||_2 + tau ||D||_* + lam ||A||_2"
+        value_end = (  # sum of squared residuals + tau * ||M||_* + lam * anomaly penalty
+            np.sum((observations - low_rank - anomalies) ** 2)
+            + tau * np.linalg.norm(low_rank, "nuc")
+            + lam * anomalies_norm
+        )
+        if value_start + 1e-4 <= value_end:  # 1e-4 tolerance before flagging an increase
+            raise CostFunctionRPCANotMinimized(function_str, value_start, value_end)
diff --git a/qolmat/imputations/rpca/rpca_pcp.py b/qolmat/imputations/rpca/rpca_pcp.py
@@ -11,39 +11,6 @@
 from qolmat.utils.exceptions import CostFunctionRPCANotMinimized
 
 
-def _check_cost_function_minimized(
-    observations: NDArray,
-    low_rank: NDArray,
-    anomalies: NDArray,
-    lam: float,
-):
-    """Check that the functional minimized by the RPCA
-    is smaller at the end than at the beginning
-
-    Parameters
-    ----------
-    observations : NDArray
-        observations matrix with first linear interpolation
-    low_rank : NDArray
-        low_rank matrix resulting from RPCA
-    anomalies : NDArray
-        sparse matrix resulting from RPCA
-    lam : float
-        parameter penalizing the L1-norm of the anomaly/sparse part
-
-    Raises
-    ------
-    CostFunctionRPCANotMinimized
-        The RPCA does not minimized the cost function:
-        the starting cost is at least equal to the final one.
-    """
-    value_start = np.linalg.norm(observations, "nuc")
-    value_end = np.linalg.norm(low_rank, "nuc") + lam * np.sum(np.abs(anomalies))
-    if value_start + 1e-9 < value_end:
-        function_str = "||D||_* + lam ||A||_1"
-        raise CostFunctionRPCANotMinimized(function_str, value_start, value_end)
-
-
 class RPCAPCP(RPCA):
     """
     This class implements the basic RPCA decomposition using Alternating Lagrangian Multipliers.
@@ -112,6 +79,39 @@ def decompose_rpca(self, D: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]:
             if error < self.tol:
                 break
 
-        _check_cost_function_minimized(D, M, A, lam)
+        self._check_cost_function_minimized(D, M, A, lam)
 
         return M, A
+
+    @staticmethod
+    def _check_cost_function_minimized(
+        observations: NDArray,
+        low_rank: NDArray,
+        anomalies: NDArray,
+        lam: float,
+    ):
+        """Check that the RPCA cost function has not increased
+        between its starting value and its final value.
+
+        Parameters
+        ----------
+        observations : NDArray
+            observations matrix D (with a first linear interpolation applied)
+        low_rank : NDArray
+            low-rank matrix M resulting from the RPCA
+        anomalies : NDArray
+            anomaly (sparse) matrix A resulting from the RPCA
+        lam : float
+            weight of the L1 penalty sum(|A|) on the anomalies
+
+        Raises
+        ------
+        CostFunctionRPCANotMinimized
+            If the final cost exceeds the starting cost by 1e-4 or more,
+            which means the RPCA has not minimized the cost function.
+        """
+        value_start = np.linalg.norm(observations, "nuc")  # cost at M = D, A = 0
+        value_end = np.linalg.norm(low_rank, "nuc") + lam * np.sum(np.abs(anomalies))
+        if value_start + 1e-4 <= value_end:  # 1e-4 tolerance before flagging an increase
+            function_str = "||D||_* + lam ||A||_1"  # label only; value_end uses ||low_rank||_*
+            raise CostFunctionRPCANotMinimized(function_str, value_start, value_end)
diff --git a/tests/imputations/rpca/test_rpca_noisy.py b/tests/imputations/rpca/test_rpca_noisy.py
@@ -2,7 +2,7 @@
 import pytest
 from numpy.typing import NDArray
 
-from qolmat.imputations.rpca.rpca_noisy import RPCANoisy, _check_cost_function_minimized
+from qolmat.imputations.rpca.rpca_noisy import RPCANoisy
 from qolmat.utils import utils
 from qolmat.utils.data import generate_artificial_ts
 from qolmat.utils.exceptions import CostFunctionRPCANotMinimized
@@ -12,18 +12,22 @@
 X_interpolated = np.array([[1, 2], [3, 3]], dtype=float)
 omega = np.array([[True, True], [True, False]])
 max_iterations = 100
-# synthetic temporal data
-n_samples = 1000
-periods = [100, 20]
-amp_anomalies = 0.5
-ratio_anomalies = 0.05
-amp_noise = 0.1
-X_true, A_true, E_true = generate_artificial_ts(
-    n_samples, periods, amp_anomalies, ratio_anomalies, amp_noise
-)
-signal = X_true + A_true + E_true
-mask = np.random.choice(len(signal), round(len(signal) / 20))
-signal[mask] = np.nan
+
+
+@pytest.fixture  # default (function) scope: rebuilt for every test that requests it
+def synthetic_temporal_data():  # NOTE(review): same fixture in test_rpca_pcp.py -> conftest?
+    n_samples = 1000
+    periods = [100, 20]
+    amp_anomalies = 0.5
+    ratio_anomalies = 0.05
+    amp_noise = 0.1
+    X_true, A_true, E_true = generate_artificial_ts(
+        n_samples, periods, amp_anomalies, ratio_anomalies, amp_noise
+    )
+    signal = X_true + A_true + E_true  # presumably signal + anomalies + noise (TODO confirm)
+    mask = np.random.choice(len(signal), round(len(signal) / 20))  # unseeded; indices may repeat
+    signal[mask] = np.nan  # at most len(signal) / 20 entries become NaN
+    return signal
 
 
 @pytest.mark.parametrize(
@@ -42,8 +46,9 @@
 def test_check_cost_function_minimized_raise_expection(
     obs: NDArray, lr: NDArray, ano: NDArray, lam: float, tau: float, norm: str
 ):
+    rpca = RPCANoisy()
     with pytest.raises(CostFunctionRPCANotMinimized):
-        _check_cost_function_minimized(obs, lr, ano, lam, tau, norm)
+        rpca._check_cost_function_minimized(obs, lr, ano, lam, tau, norm)
 
 
 @pytest.mark.parametrize("X", [X_complete])
@@ -85,8 +90,8 @@ def test_rpca_pcp_zero_lambda(X: NDArray, tau: float, X_interpolated: NDArray):
     np.testing.assert_allclose(A_result, X_interpolated, atol=1e-4)
 
 
-@pytest.mark.parametrize("signal", [signal])
-def test_rpca_temporal_signal(signal: NDArray):
+def test_rpca_temporal_signal(synthetic_temporal_data):
+    signal = synthetic_temporal_data
     period = 100
     tau = 1
     lam = 0.1
diff --git a/tests/imputations/rpca/test_rpca_pcp.py b/tests/imputations/rpca/test_rpca_pcp.py
@@ -2,7 +2,7 @@
 import pytest
 from numpy.typing import NDArray
 
-from qolmat.imputations.rpca.rpca_pcp import RPCAPCP, _check_cost_function_minimized
+from qolmat.imputations.rpca.rpca_pcp import RPCAPCP
 from qolmat.utils import utils
 from qolmat.utils.data import generate_artificial_ts
 from qolmat.utils.exceptions import CostFunctionRPCANotMinimized
@@ -12,18 +12,22 @@
 max_iterations = 50
 small_mu = 1e-5
 large_mu = 1e5
-# synthetic temporal data
-n_samples = 1000
-periods = [100, 20]
-amp_anomalies = 0.5
-ratio_anomalies = 0.05
-amp_noise = 0.1
-X_true, A_true, E_true = generate_artificial_ts(
-    n_samples, periods, amp_anomalies, ratio_anomalies, amp_noise
-)
-signal = X_true + A_true + E_true
-mask = np.random.choice(len(signal), round(len(signal) / 20))
-signal[mask] = np.nan
+
+
+@pytest.fixture  # default (function) scope: rebuilt for every test that requests it
+def synthetic_temporal_data():  # NOTE(review): same fixture in test_rpca_noisy.py -> conftest?
+    n_samples = 1000
+    periods = [100, 20]
+    amp_anomalies = 0.5
+    ratio_anomalies = 0.05
+    amp_noise = 0.1
+    X_true, A_true, E_true = generate_artificial_ts(
+        n_samples, periods, amp_anomalies, ratio_anomalies, amp_noise
+    )
+    signal = X_true + A_true + E_true  # presumably signal + anomalies + noise (TODO confirm)
+    mask = np.random.choice(len(signal), round(len(signal) / 20))  # unseeded; indices may repeat
+    signal[mask] = np.nan  # at most len(signal) / 20 entries become NaN
+    return signal
 
 
 @pytest.mark.parametrize(
@@ -41,13 +45,14 @@ def test_check_cost_function_minimized_raise_expection(
     obs: NDArray, lr: NDArray, ano: NDArray, lam: float
 ):
     function_str = "||D||_* + lam ||A||_1"
+    rpca = RPCAPCP()
     with pytest.raises(
         CostFunctionRPCANotMinimized,
         match="PCA algorithm may provide bad results. "
         f"{function_str} is larger at the end "
         "of the algorithm than at the start.",
     ):
-        _check_cost_function_minimized(obs, lr, ano, lam)
+        rpca._check_cost_function_minimized(obs, lr, ano, lam)
 
 
 @pytest.mark.parametrize("X", [X_complete])
@@ -85,8 +90,8 @@ def test_rpca_rpca_pcp_large_lambda_small_mu(X: NDArray, mu: float):
     np.testing.assert_allclose(A_result, np.full_like(X, 0), atol=1e-4)
 
 
-@pytest.mark.parametrize("signal", [signal])
-def test_rpca_temporal_signal(signal: NDArray):
+def test_rpca_temporal_signal(synthetic_temporal_data):
+    signal = synthetic_temporal_data
     period = 100
     lam = 0.1
     rpca = RPCAPCP(period=period, lam=lam, mu=0.01)
diff --git a/tests/imputations/test_imputers.py b/tests/imputations/test_imputers.py
@@ -4,7 +4,7 @@
 import pandas as pd
 import pytest
 from sklearn.ensemble import ExtraTreesRegressor
-from sklearn.utils.estimator_checks import parametrize_with_checks
+from sklearn.utils.estimator_checks import check_estimator, parametrize_with_checks
 from qolmat.benchmark.hyperparameters import HyperValue
 
 from qolmat.imputations import imputers
@@ -303,7 +303,7 @@ def test_ImputerEM_fit_transform(df: pd.DataFrame) -> None:
         imputers.KNNImputer(),
         imputers.ImputerMICE(),
         imputers.ImputerRegressor(),
-        imputers.ImputerRPCA(),
+        imputers.ImputerRPCA(tau=0, lam=0),
         imputers.ImputerEM(),
     ]
 )