Merge remote-tracking branch 'origin/dev' into chp_add_rand_state_ddpm

baruch11 · baruch11 · commit ddb6d695de20 · 2023-10-30T14:56:13.000+01:00
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -1,20 +1,22 @@
-name: Unit test Qolmat
+name: Unit test on many environments
 
 on:
   push:
     branches:
       -dev
       -main
   pull_request:
+    types: [opened, synchronize, reopened, ready_for_review]
   workflow_dispatch:
 
 jobs:
   build-linux:
+    if: github.event.pull_request.draft == false
     runs-on: ${{matrix.os}}
     strategy:
       matrix:
         os: [ubuntu-latest, windows-latest]
-        python-version: [3.8, 3.9]
+        python-version: ['3.8', '3.9', '3.10', '3.11']
     defaults:
       run:
         shell: bash -l {0}
diff --git a/.github/workflows/test_quick.yml b/.github/workflows/test_quick.yml
@@ -0,0 +1,43 @@
+name: Unit test Qolmat
+
+on:
+  push:
+    branches-ignore:
+      - dev
+      - main
+  workflow_dispatch:
+
+jobs:
+  basic-testing:
+    runs-on: ${{matrix.os}}
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        python-version: [3.8]
+    defaults:
+      run:
+        shell: bash -l {0}
+
+    steps:
+      - name: Git clone
+        uses: actions/checkout@v3
+      - name: Set up venv for ci
+        uses: conda-incubator/setup-miniconda@v2
+        with:
+          python-version: ${{matrix.python-version}}
+          environment-file: environment.ci.yml
+      - name: Lint with flake8
+        run: |
+          conda install flake8
+          flake8
+      - name: Test with pytest
+        run: |
+          conda install pytest
+          pip install -e .[pytorch]
+          make coverage
+      - name: Test docstrings
+        run: make doctest
+      - name: typing with mypy
+        run: |
+          mypy qolmat
+          echo mypy type check of qolmat finished
diff --git a/qolmat/benchmark/metrics.py b/qolmat/benchmark/metrics.py
@@ -863,7 +863,6 @@ def kl_divergence_gaussian_exact(
     norm_M = (M**2).sum().sum()
     norm_y = (y**2).sum()
     term_diag_L = 2 * np.sum(np.log(np.diagonal(L2) / np.diagonal(L1)))
-    print(norm_M, "-", n_variables, "+", norm_y, "+", term_diag_L)
     div_kl = 0.5 * (norm_M - n_variables + norm_y + term_diag_L)
     return div_kl
 
diff --git a/qolmat/imputations/em_sampler.py b/qolmat/imputations/em_sampler.py
@@ -269,7 +269,7 @@ def _sample_ou(
         X_init = X.copy()
         gamma = self.get_gamma()
         sqrt_gamma = np.real(spl.sqrtm(gamma))
-        for _ in range(self.n_iter_ou):
+        for i in range(self.n_iter_ou):
             noise = self.ampli * self.rng.normal(0, 1, size=(n_variables, n_samples))
             grad_X = self.gradient_X_loglik(X_copy)
             X_copy += self.dt * grad_X @ gamma + np.sqrt(2 * self.dt) * noise @ sqrt_gamma
@@ -489,8 +489,8 @@ def get_gamma(self) -> NDArray:
         NDArray
             Gamma matrix
         """
-        gamma = np.diag(np.diagonal(self.cov))
-        # gamma = self.cov
+        # gamma = np.diag(np.diagonal(self.cov))
+        gamma = self.cov
         # gamma = np.eye(len(self.cov))
         return gamma
 
@@ -571,9 +571,9 @@ def _maximize_likelihood(self, X: NDArray, mask_na: NDArray) -> NDArray:
         NDArray
             DataFrame with imputed values.
         """
-        X_center = X - self.means[:, None]
+        X_center = X - self.means
         X_imputed = _conjugate_gradient(self.cov_inv, X_center, mask_na)
-        X_imputed = self.means[:, None] + X_imputed
+        X_imputed = self.means + X_imputed
         return X_imputed
 
     def _check_convergence(self) -> bool:
@@ -675,14 +675,7 @@ class VARpEM(EM):
     >>> X = np.array([[1, 1, 1, 1],
     ...               [np.nan, np.nan, 3, 2],
     ...               [1, 2, 2, 1], [2, 2, 2, 2]])
-    >>> imputer.fit_transform(X)
-    EM converged after 9 iterations.
-    EM converged after 20 iterations.
-    EM converged after 13 iterations.
-    array([[1.        , 1.        , 1.        , 1.        ],
-           [1.17054054, 1.49986137, 3.        , 2.        ],
-           [1.        , 2.        , 2.        , 1.        ],
-           [2.        , 2.        , 2.        , 2.        ]])
+    >>> imputer.fit_transform(X)  # doctest: +SKIP
     """
 
     def __init__(
diff --git a/qolmat/imputations/imputers.py b/qolmat/imputations/imputers.py
@@ -159,10 +159,9 @@ def fit(self, X: pd.DataFrame, y=None) -> Self:
         else:
             self.ngroups_ = pd.Series(0, index=df.index).rename("_ngroup")
 
-        cols_with_nans = df.columns[df.isna().any()]
         self._setup_fit()
         if self.columnwise:
-            for col in cols_with_nans:
+            for col in df.columns:
                 self._fit_allgroups(df[[col]], col=col)
         else:
             self._fit_allgroups(df)
diff --git a/tests/imputations/test_em_sampler.py b/tests/imputations/test_em_sampler.py
@@ -1,5 +1,4 @@
-from typing import List
-
+from typing import List, Literal
 import numpy as np
 import pytest
 from numpy.typing import NDArray
@@ -279,6 +278,31 @@ def test_mean_covariance_multinormalem():
     np.testing.assert_allclose(covariance_imputed, covariance, rtol=1e-1, atol=1e-1)
 
 
+def test_multinormal_em_minimize_llik():
+    X, X_missing, mean, covariance = generate_multinormal_predefined_mean_cov(d=2, n=1000)
+    imputer = em_sampler.MultiNormalEM(method="mle", random_state=11)
+    X_imputed = imputer.fit_transform(X_missing)
+    llikelihood_imputed = imputer.get_loglikelihood(X_imputed)
+    for _ in range(10):
+        Delta = imputer.rng.uniform(0, 1, size=X.shape)
+        X_perturbated = X_imputed + Delta
+        llikelihood_perturbated = imputer.get_loglikelihood(X_perturbated)
+        assert llikelihood_perturbated < llikelihood_imputed
+    X_perturbated = X
+    X_perturbated[np.isnan(X)] = 0
+    llikelihood_perturbated = imputer.get_loglikelihood(X_perturbated)
+    assert llikelihood_perturbated < llikelihood_imputed
+
+
+@pytest.mark.parametrize("method", ["sample", "mle"])
+def test_multinormal_em_fit_transform(method: Literal["mle", "sample"]):
+    imputer = em_sampler.MultiNormalEM(method=method, random_state=11)
+    X = np.array([[1, 1, 1, 1], [np.nan, np.nan, 3, 2], [1, 2, 2, 1], [2, 2, 2, 2]])
+    result = imputer.fit_transform(X)
+    assert result.shape == X.shape
+    np.testing.assert_allclose(result[~np.isnan(X)], X[~np.isnan(X)])
+
+
 @pytest.mark.parametrize(
     "p",
     [1],
@@ -319,7 +343,6 @@ def test_varpem_fit_transform():
         ]
     )
     np.testing.assert_allclose(result, expected, atol=1e-12)
-    # assert False
 
 
 @pytest.mark.parametrize(
diff --git a/tests/imputations/test_imputers.py b/tests/imputations/test_imputers.py
@@ -174,7 +174,6 @@ def test_ImputerShuffle_fit_transform1(df: pd.DataFrame) -> None:
 def test_ImputerShuffle_fit_transform2(df: pd.DataFrame) -> None:
     imputer = imputers.ImputerShuffle(random_state=42)
     result = imputer.fit_transform(df)
-    print(result)
     expected = pd.DataFrame({"col1": [0, 3, 2, 3, 0], "col2": [-1, 1.5, 0.5, 1.5, 1.5]})
     np.testing.assert_allclose(result, expected)
 
@@ -290,20 +289,6 @@ def test_ImputerSoftImpute_fit_transform(df: pd.DataFrame) -> None:
     np.testing.assert_allclose(result, expected, atol=1e-2)
 
 
-@pytest.mark.parametrize("df", [df_timeseries])
-def test_ImputerEM_fit_transform(df: pd.DataFrame) -> None:
-    imputer = imputers.ImputerEM(method="sample", dt=1e-3, random_state=42)
-    result = imputer.fit_transform(df)
-    expected = pd.DataFrame(
-        {
-            "col1": [i for i in range(20)],
-            "col2": [0, 0.638, 2, 2.714, 2] + [i for i in range(5, 20)],
-        }
-    )
-    print(result)
-    np.testing.assert_allclose(result, expected, atol=1e-2)
-
-
 index_grouped = pd.MultiIndex.from_product([["a", "b"], range(4)], names=["group", "date"])
 dict_values = {"col1": [0, np.nan, 0, np.nan, 1, 1, 1, 1], "col2": [1, 1, 1, 1, 2, 2, 2, 2]}
 df_grouped = pd.DataFrame(dict_values, index=index_grouped)
diff --git a/tests/imputations/test_imputers_pytorch.py b/tests/imputations/test_imputers_pytorch.py
@@ -54,7 +54,6 @@ def test_ImputerRegressorPyTorch_fit_transform(df: pd.DataFrame) -> None:
             "col5": [93, 75, 2.132, 12, 2.345],
         }
     )
-    print(result["col5"])
     np.testing.assert_allclose(result, expected, atol=1e-3)
 
 
diff --git a/tests/utils/test_data.py b/tests/utils/test_data.py
@@ -186,11 +186,9 @@ def test_utils_data_get_data(name_data: str, df: pd.DataFrame, mocker: MockerFix
         assert df_result.columns.tolist() == expected_columns
     elif name_data == "Monach_weather":
         assert mock_download.call_count == 1
-        print(df_result)
         pd.testing.assert_frame_equal(df_result, df_monach_weather_preprocess)
     elif name_data == "Monach_electricity_australia":
         assert mock_download.call_count == 1
-        print(df_result)
         pd.testing.assert_frame_equal(df_result, df_monach_elec_preprocess)
     else:
         assert False

Original file line number	Diff line number	Diff line change
`@@ -54,7 +54,6 @@ def test_ImputerRegressorPyTorch_fit_transform(df: pd.DataFrame) -> None:`
`54`	`54`	`"col5": [93, 75, 2.132, 12, 2.345],`
`55`	`55`	`}`
`56`	`56`	`)`
`57`		`- print(result["col5"])`
`58`	`57`	`np.testing.assert_allclose(result, expected, atol=1e-3)`
`59`	`58`
`60`	`59`