Merge pull request #111 from Quantmetry/add_test_varpem_nonregression

JulienRoussel77 · web-flow · commit edaf2d3e6cf4 · 2023-10-30T10:39:29.000+01:00
New test added to check docstring non-reproducibility
diff --git a/qolmat/imputations/em_sampler.py b/qolmat/imputations/em_sampler.py
@@ -326,10 +326,16 @@ def fit(self, X: NDArray) -> Self:
                 self.p = p
                 self.fit_X(X)
                 n1, n2 = self.X.shape
-                aic = np.log(np.linalg.det(self.S)) + 2 * p * (n2**2) / n1
+                det = np.linalg.det(self.S)
+                if abs(det) < 1e-12:
+                    aic = -np.inf
+                else:
+                    aic = np.log(det) + 2 * p * (n2**2) / n1
                 if len(aics) > 0 and aic > aics[-1]:
                     break
                 aics.append(aic)
+                if aic == -np.inf:
+                    break
             self.p = int(np.argmin(aics))
             self.fit_X(X)
 
@@ -352,15 +358,15 @@ def transform(self, X: NDArray) -> NDArray:
         NDArray
             Final array after EM sampling.
         """
+        mask_na = np.isnan(X)
+
         # shape_original = X.shape
         if hash(X.tobytes()) == self.hash_fit:
             X = self.X
         else:
             X = utils.prepare_data(X, self.period)
             X = utils.linear_interpolation(X)
 
-        mask_na = np.isnan(X)
-
         if self.method == "mle":
             X_transformed = self._maximize_likelihood(X, mask_na)
         elif self.method == "sample":
@@ -842,6 +848,7 @@ def combine_parameters(self) -> None:
         stack_YY = np.stack(list_YY)
         self.YY = np.mean(stack_YY, axis=0)
         self.S = self.YY - self.ZY.T @ self.B - self.B.T @ self.ZY + self.B.T @ self.ZZ @ self.B
+        self.S[self.S < 1e-12] = 0
         self.S_inv = np.linalg.pinv(self.S, rcond=1e-10)
 
     def _check_convergence(self) -> bool:
diff --git a/tests/imputations/test_em_sampler.py b/tests/imputations/test_em_sampler.py
@@ -306,6 +306,22 @@ def test_parameters_after_imputation_varpem(p: int):
     np.testing.assert_allclose(em.S, S, rtol=1e-1, atol=1e-1)
 
 
+def test_varpem_fit_transform():
+    """Non-regression test: seeded VARpEM sampling reproduces pinned imputed values."""
+    imputer = em_sampler.VARpEM(method="sample", random_state=11)
+    X = np.array([[1, 1, 1, 1], [np.nan, np.nan, 3, 2], [1, 2, 2, 1], [2, 2, 2, 2]])
+    result = imputer.fit_transform(X)
+    expected = np.array(
+        [
+            [1.0, 1.0, 1.0, 1.0],
+            [1.0, 1.5, 3.0, 2.0],
+            [1.0, 2.0, 2.0, 1.0],
+            [2.0, 2.0, 2.0, 2.0],
+        ]
+    )
+    np.testing.assert_allclose(result, expected, atol=1e-12)
+
+
 @pytest.mark.parametrize(
     "X, em, p",
     [(X_first_guess, em_sampler.MultiNormalEM(), 0), (X_first_guess, em_sampler.VARpEM(p=2), 2)],
diff --git a/tests/imputations/test_imputers.py b/tests/imputations/test_imputers.py
@@ -297,9 +297,10 @@ def test_ImputerEM_fit_transform(df: pd.DataFrame) -> None:
     expected = pd.DataFrame(
         {
             "col1": [i for i in range(20)],
-            "col2": [0, 0.773, 2, 2.621, 2] + [i for i in range(5, 20)],
+            "col2": [0, 0.638, 2, 2.714, 2] + [i for i in range(5, 20)],
         }
     )
+    print(result)
     np.testing.assert_allclose(result, expected, atol=1e-2)
 
 

Original file line number	Diff line number	Diff line change
`@@ -297,9 +297,10 @@ def test_ImputerEM_fit_transform(df: pd.DataFrame) -> None:`
`297`	`297`	`expected = pd.DataFrame(`
`298`	`298`	`{`
`299`	`299`	`"col1": [i for i in range(20)],`
`300`		`- "col2": [0, 0.773, 2, 2.621, 2] + [i for i in range(5, 20)],`
	`300`	`+ "col2": [0, 0.638, 2, 2.714, 2] + [i for i in range(5, 20)],`
`301`	`301`	`}`
`302`	`302`	`)`
	`303`	`+ print(result)`
`303`	`304`	`np.testing.assert_allclose(result, expected, atol=1e-2)`
`304`	`305`
`305`	`306`