Skip to content

Commit ba0496d

Browse files
author
Gsaes
committed
Correction MR
1 parent b0cf303 commit ba0496d

File tree

7 files changed

+80
-70
lines changed

7 files changed

+80
-70
lines changed

qolmat/benchmark/cross_validation.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ def get_dimension(dict_bounds: Dict, name_dimension: str) -> Dimension:
3434
return Real(low=dict_bounds["min"], high=dict_bounds["max"], name=name_dimension)
3535
elif dict_bounds["type"] == "Categorical":
3636
return Categorical(categories=dict_bounds["categories"], name=name_dimension)
37+
else:
38+
raise ValueError("The 'type' must be 'Integer', 'Real' or 'Categorical'")
3739

3840

3941
def get_search_space(dict_config_opti_imputer: Dict) -> List[Dimension]:

qolmat/imputations/imputers.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,8 @@ def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
103103
df_imputed[col] = self.impute_element(df[[col]])
104104

105105
else:
106+
if any(isinstance(value, dict) for value in hyperparams.values()):
107+
raise AssertionError("hyperparams contains a dictionary. Columnwise must be True.")
106108
self.hyperparams_element = hyperparams
107109
df_imputed = self.impute_element(df)
108110

qolmat/imputations/rpca/rpca.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,12 @@ def __init__(
3333
period: Optional[int] = None,
3434
max_iter: int = int(1e4),
3535
tol: float = 1e-6,
36+
random_state: Union[None, int, np.random.RandomState] = None,
3637
) -> None:
3738
self.n_rows = period
3839
self.max_iter = max_iter
3940
self.tol = tol
41+
self.random_state = random_state
4042

4143
def _prepare_data(self, X: NDArray) -> NDArray:
4244
"""

qolmat/imputations/rpca/rpca_noisy.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,6 @@ def __init__(
5858
max_iter: int = int(1e4),
5959
tol: float = 1e-6,
6060
norm: Optional[str] = "L2",
61-
random_state: Union[None, int, np.random.RandomState] = None,
6261
) -> None:
6362
super().__init__(period=period, max_iter=max_iter, tol=tol)
6463
self.rank = rank
@@ -67,7 +66,6 @@ def __init__(
6766
self.list_periods = list_periods
6867
self.list_etas = list_etas
6968
self.norm = norm
70-
self.random_state = random_state
7169

7270
def decompose_rpca_L1(
7371
self, D: NDArray, Omega: NDArray, lam: float, tau: float, rank: int

qolmat/imputations/rpca/rpca_pcp.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ def __init__(
3333
lam: Optional[float] = None,
3434
max_iter: int = int(1e4),
3535
tol: float = 1e-6,
36-
random_state: Union[None, int, np.random.RandomState] = None,
3736
) -> None:
3837
super().__init__(
3938
period=period,
@@ -42,7 +41,6 @@ def __init__(
4241
)
4342
self.mu = mu
4443
self.lam = lam
45-
self.random_state = random_state
4644

4745
def get_params_scale(self, D: NDArray):
4846
mu = D.size / (4.0 * utils.l1_norm(D))

tests/benchmark/test_comparator.py

Lines changed: 26 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -7,44 +7,36 @@
77
from qolmat.benchmark.missing_patterns import EmpiricalHoleGenerator
88

99
df_origin = pd.DataFrame({"col1": [0, np.nan, 2, 4, np.nan], "col2": [-1, np.nan, 0.5, 1, 1.5]})
10-
1110
df_imputed = pd.DataFrame({"col1": [0, 1, 2, 3.5, 4], "col2": [-1.5, 0, 1.5, 2, 1.5]})
12-
1311
df_mask = pd.DataFrame(
1412
{"col1": [False, False, True, True, False], "col2": [True, False, True, True, False]}
1513
)
1614

1715
cols_to_impute = ["col1", "col2"]
1816
generator_holes = EmpiricalHoleGenerator(n_splits=1, ratio_masked=0.5)
19-
dict_imputers_median = {"median": ImputerMedian()}
20-
dict_imputers_rpca = {"rpca": ImputerRPCA(max_iter=100, tau=2)}
21-
search_params = {"rpca": {"lam": {"min": 0.1, "max": 1, "type": "Real"}}}
22-
23-
comparison_median = comparator.Comparator(
24-
dict_models=dict_imputers_median,
25-
selected_columns=cols_to_impute,
26-
generator_holes=generator_holes,
27-
)
17+
dict_imputers = {"rpca": ImputerRPCA(max_iter=100, tau=2)}
18+
dict_config_opti = {"rpca": {"lam": {"min": 0.1, "max": 1, "type": "Real"}}}
2819

2920
comparison_rpca = comparator.Comparator(
30-
dict_models=dict_imputers_rpca,
21+
dict_models=dict_imputers,
3122
selected_columns=cols_to_impute,
3223
generator_holes=generator_holes,
33-
dict_config_opti=search_params,
24+
dict_config_opti=dict_config_opti,
3425
)
3526

3627
comparison_bug = comparator.Comparator(
37-
dict_models=dict_imputers_median,
28+
dict_models=dict_imputers,
3829
selected_columns=["bug"],
3930
generator_holes=generator_holes,
40-
dict_config_opti=search_params,
31+
dict_config_opti=dict_config_opti,
4132
)
4233

43-
result_expected_median = [3.0, 0.5, 0.75, 0.5, 37.88948, 39.68123]
44-
result_expected_rpca = [3.0, 0.5, 0.75, 0.5, 37.88948, 39.68123]
45-
46-
comparison_dict = {"median": comparison_median, "rpca": comparison_rpca, "bug": comparison_bug}
47-
result_expected_dict = {"median": result_expected_median, "rpca": result_expected_rpca}
34+
dict_comparison = {"rpca": comparison_rpca, "bug": comparison_bug}
35+
index_tuples_expected = pd.MultiIndex.from_product(
36+
[["mae", "wmape", "KL_columnwise"], ["col1", "col2"]]
37+
)
38+
data_expected = [3.0, 0.5, 0.75, 0.5, 37.88948, 39.68123]
39+
result_expected = pd.Series(data_expected, index=index_tuples_expected)
4840

4941

5042
@pytest.mark.parametrize("df1", [df_origin])
@@ -53,31 +45,29 @@
5345
def test_benchmark_comparator_get_errors(
5446
df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame
5547
) -> None:
56-
result_comparison = comparison_median.get_errors(
57-
df_origin=df1, df_imputed=df2, df_mask=df_mask
48+
result = comparison_rpca.get_errors(df_origin=df1, df_imputed=df2, df_mask=df_mask)
49+
index_tuples_expected = pd.MultiIndex.from_product(
50+
[["mae", "wmape", "KL_columnwise"], ["col1", "col2"]]
51+
)
52+
result_expected = pd.Series(
53+
[0.25, 0.83333, 0.0625, 1.16666, 18.80089, 36.63671], index=index_tuples_expected
5854
)
59-
result = list(result_comparison.values)
60-
result_expected = [0.25, 0.83333, 0.0625, 1.16666, 18.80089, 36.63671]
6155
np.testing.assert_allclose(result, result_expected, atol=1e-5)
6256

6357

6458
@pytest.mark.parametrize("df1", [df_origin])
6559
def test_benchmark_comparator_evaluate_errors_sample(df1: pd.DataFrame) -> None:
66-
result_comparison = comparison_median.evaluate_errors_sample(
67-
dict_imputers_median["median"], df1
68-
)
69-
result = comparison_rpca.evaluate_errors_sample(dict_imputers_rpca["rpca"], df1)
70-
result = list(result_comparison.values)
71-
np.testing.assert_allclose(result, result_expected_median, atol=1e-5)
60+
result = comparison_rpca.evaluate_errors_sample(dict_imputers["rpca"], df1)
61+
np.testing.assert_allclose(result, result_expected, atol=1e-5)
7262

7363

7464
@pytest.mark.parametrize("df1", [df_origin])
75-
@pytest.mark.parametrize("imputer", ["median", "rpca", "bug"])
65+
@pytest.mark.parametrize("imputer", ["rpca", "bug"])
7666
def test_benchmark_comparator_compare(df1: pd.DataFrame, imputer: str) -> None:
77-
comparison = comparison_dict[imputer]
67+
comparison = dict_comparison[imputer]
7868
if imputer == "bug":
79-
np.testing.assert_raises(Exception, comparison.compare, df1)
69+
np.testing.assert_raises(Exception, comparison.compare, df_origin)
8070
else:
81-
result_comparison = comparison.compare(df1)
82-
result = list(result_comparison.values.flatten())
83-
np.testing.assert_allclose(result, result_expected_dict[imputer], atol=1e-5)
71+
result = comparison.compare(df_origin)
72+
result_expected_DataFrame = pd.DataFrame(result_expected)
73+
np.testing.assert_allclose(result, result_expected_DataFrame, atol=1e-5)

tests/benchmark/test_cross_validation.py

Lines changed: 48 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -10,61 +10,75 @@
1010
df_origin = pd.DataFrame({"col1": [0, np.nan, 2, 4, np.nan], "col2": [-1, np.nan, 0.5, 1, 1.5]})
1111
df_imputed = pd.DataFrame({"col1": [0, 1, 2, 3.5, 4], "col2": [-1.5, 0, 1.5, 2, 1.5]})
1212
df_mask = pd.DataFrame(
13-
{"col1": [False, False, True, True, False], "col2": [True, False, True, True, False]}
13+
{"col1": [False, False, True, False, False], "col2": [True, False, True, True, False]}
1414
)
1515
df_corrupted = df_origin.copy()
1616
df_corrupted[df_mask] = np.nan
1717

18-
imputer_rpca = ImputerRPCA(tau=2, random_state=42)
18+
imputer_rpca = ImputerRPCA(tau=2, random_state=42, columnwise=True, period=1)
1919
dict_imputers_rpca = {"rpca": imputer_rpca}
2020
generator_holes = EmpiricalHoleGenerator(n_splits=1, ratio_masked=0.5)
2121
dict_config_opti = {
2222
"rpca": {
23-
"lam": {"min": 0.1, "max": 1, "type": "Real"},
23+
"lam": {
24+
"col1": {"min": 0.1, "max": 6, "type": "Real"},
25+
"col2": {"min": 1, "max": 4, "type": "Real"},
26+
},
27+
"tol": {"min": 1e-6, "max": 0.1, "type": "Real"},
2428
"max_iter": {"min": 99, "max": 100, "type": "Integer"},
2529
"norm": {"categories": ["L1", "L2"], "type": "Categorical"},
2630
}
2731
}
28-
dict_config_opti_imputer = dict_config_opti.get("rpca", {})
29-
hyperparams_flat = {"lam": 0.93382, "max_iter": 100, "norm": "L1"}
32+
dict_config_opti_imputer = dict_config_opti["rpca"]
33+
hyperparams_flat = {"lam/col1": 4.7, "lam/col2": 1.5, "tol": 0.07, "max_iter": 100, "norm": "L1"}
3034

3135
cv = cross_validation.CrossValidation(
3236
imputer=imputer_rpca,
3337
dict_config_opti_imputer=dict_config_opti_imputer,
3438
hole_generator=generator_holes,
3539
)
3640

37-
result_params_expected = {"lam": (0.1, 1), "max_iter": (99, 100), "norm": ("L1", "L2")}
41+
result_params_expected = {
42+
"lam1": (0.1, 6),
43+
"lam2": (1, 4),
44+
"tol": (1e-6, 0.1),
45+
"max_iter": (99, 100),
46+
"norm": ("L1", "L2"),
47+
}
3848

3949

40-
@pytest.mark.parametrize("dict_bounds", [dict_config_opti_imputer])
41-
@pytest.mark.parametrize("param", ["lam", "max_iter", "norm"])
42-
def test_benchmark_cross_validation_get_dimension(dict_bounds: Dict, param: str) -> None:
43-
result = cross_validation.get_dimension(dict_bounds=dict_bounds[param], name_dimension=param)
50+
@pytest.mark.parametrize("dict_config_opti_imputer", [dict_config_opti_imputer])
51+
@pytest.mark.parametrize("param", ["tol", "max_iter", "norm"])
52+
def test_benchmark_cross_validation_get_dimension(
53+
dict_config_opti_imputer: Dict, param: str
54+
) -> None:
55+
result = cross_validation.get_dimension(
56+
dict_bounds=dict_config_opti_imputer[param], name_dimension=param
57+
)
4458
result_expected = result_params_expected[param]
45-
np.testing.assert_equal(result.bounds, result_expected)
59+
assert result.bounds == result_expected
4660

4761

4862
@pytest.mark.parametrize("dict_config_opti_imputer", [dict_config_opti_imputer])
4963
def test_benchmark_cross_validation_get_search_space(dict_config_opti_imputer: Dict) -> None:
5064
list_result = cross_validation.get_search_space(dict_config_opti_imputer)
51-
result_expected = [
52-
result_params_expected["lam"],
53-
result_params_expected["max_iter"],
54-
result_params_expected["norm"],
55-
]
56-
for i in range(3):
57-
np.testing.assert_equal(list_result[i].bounds, result_expected[i])
65+
list_expected_bounds = list(result_params_expected.values())
66+
for result, expected_bounds in zip(list_result, list_expected_bounds):
67+
assert result.bounds == expected_bounds
5868

5969

6070
@pytest.mark.parametrize("hyperparams_flat", [hyperparams_flat])
6171
def test_benchmark_cross_validation_deflat_hyperparams(
6272
hyperparams_flat: Dict[str, Union[float, int, str]]
6373
) -> None:
64-
resul_deflat = cross_validation.deflat_hyperparams(hyperparams_flat=hyperparams_flat)
65-
result = list(resul_deflat.values())
66-
result_expected = [0.93382, 100, "L1"]
67-
np.testing.assert_equal(result, result_expected)
74+
result_deflat = cross_validation.deflat_hyperparams(hyperparams_flat=hyperparams_flat)
75+
result_expected = {
76+
"lam": {"col1": 4.7, "col2": 1.5},
77+
"tol": 0.07,
78+
"max_iter": 100,
79+
"norm": "L1",
80+
}
81+
assert result_deflat == result_expected
6882

6983

7084
@pytest.mark.parametrize("df1", [df_origin])
@@ -78,23 +92,27 @@ def test_benchmark_cross_validation_loss_function(
7892
np.testing.assert_raises(ValueError, cv.loss_function, df1, df2, df_mask)
7993
cv.loss_norm = 2
8094
result_cv2 = cv.loss_function(df_origin=df1, df_imputed=df2, df_mask=df_mask)
81-
np.testing.assert_allclose(result_cv2, 1.58113, atol=1e-5)
95+
np.testing.assert_allclose(result_cv2, 1.5, atol=1e-5)
8296
cv.loss_norm = 1
8397
result_cv1 = cv.loss_function(df_origin=df1, df_imputed=df2, df_mask=df_mask)
84-
np.testing.assert_allclose(result_cv1, 3, atol=1e-5)
98+
np.testing.assert_allclose(result_cv1, 2.5, atol=1e-5)
8599

86100

87101
@pytest.mark.parametrize("df", [df_corrupted])
88102
def test_benchmark_cross_validation_optimize_hyperparams(df: pd.DataFrame) -> None:
89103
result_hp = cv.optimize_hyperparams(df)
90-
result = list(result_hp.values())
91-
result_expected = [0.8168886881742098, 99, "L2"]
92-
np.testing.assert_equal(result, result_expected)
104+
result_expected = {
105+
"lam/col1": 4.799603622475375,
106+
"lam/col2": 1.5503043695984915,
107+
"tol": 0.07796932033627668,
108+
"max_iter": 100,
109+
"norm": "L1",
110+
}
111+
assert result_hp == result_expected
93112

94113

95114
@pytest.mark.parametrize("df", [df_corrupted])
96115
def test_benchmark_cross_validation_fit_transform(df: pd.DataFrame) -> None:
97116
result_cv = cv.fit_transform(df)
98-
result = np.array(result_cv)
99-
result_expected = np.array([[0, 1.5], [0, 1.5], [0, 1.5], [0, 1.5], [0, 1.5]])
100-
np.testing.assert_allclose(result, result_expected, atol=1e-5)
117+
result_expected = pd.DataFrame({"col1": [0, 2, 2, 4, 2], "col2": [1.5, 1.5, 1.5, 1.5, 1.5]})
118+
np.testing.assert_allclose(result_cv, result_expected, atol=1e-5)

0 commit comments

Comments
 (0)