Commit dc34a1d

Merge pull request #80 from Quantmetry/gsa_bench
Gsa bench
2 parents: 876abb8 + 858df17

File tree: 9 files changed, +91 additions, -51 deletions

AUTHORS.rst

Lines changed: 7 additions & 9 deletions
@@ -2,22 +2,20 @@
 Credits
 =======
 
-Development Lead
+Development Team
 ----------------
 
 * Julien Roussel <[email protected]>
-
-Maintainers
-------------
-
-* Mikail Duran <[email protected]>
 * Anh Khoa Ngo Ho <[email protected]>
+* Charles-Henri Prat <[email protected]>
 * Guillaume Saës <[email protected]>
 
-Contributors
-------------
+Past Contributors
+-----------------
 
 * Hong-Lan Botterman
+* Nicolas Brunel
 * Firas Dakhli
+* Mikaïl Duran
 * Rima Hajou
-* Vianey Taquet
+* Thomas Morzadec

HISTORY.rst

Lines changed: 2 additions & 1 deletion
@@ -12,7 +12,8 @@ History
 * Implementation of TabDDPM and TsDDPM, which are diffusion-based models for tabular data and time-series data, based on Denoising Diffusion Probabilistic Models. Their implementations follow the work of Tashiro et al., (2021) and Kotelnikov et al., (2023).
 * ImputerDiffusion is an imputer-wrapper of these two models TabDDPM and TsDDPM.
 * Docstrings and tests improved for the EM sampler
-* Online documentation reworked, with new tutorials on hole generators and a benchmark for time series imputation
+* Fix ImputerPytorch
+* Update Benchmark Deep Learning
 
 0.0.15 (2023-08-03)
 -------------------
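
As context for the changelog entries above: ImputerDiffusion wraps TabDDPM/TsDDPM behind the usual imputer interface. A minimal sketch, assuming the scikit-learn-style fit_transform API qolmat imputers expose and reusing the constructor call this commit adds to examples/benchmark.md (the toy DataFrame is illustrative, not from the commit):

```python
import numpy as np
import pandas as pd
from qolmat.imputations import imputers_pytorch
from qolmat.imputations.diffusions.ddpms import TabDDPM

# Illustrative numeric data with holes to fill.
df = pd.DataFrame({"a": [1.0, np.nan, 3.0, 4.0], "b": [2.0, 2.5, np.nan, 4.5]})

# Same constructor call as in examples/benchmark.md below.
imputer = imputers_pytorch.ImputerDiffusion(model=TabDDPM(num_sampling=5), epochs=100, batch_size=100)
df_imputed = imputer.fit_transform(df)  # assumed sklearn-style imputer API
```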

README.rst

Lines changed: 8 additions & 0 deletions
@@ -171,6 +171,14 @@ The following table contains the available imputation methods. We distinguish si
      - Imputes missing values via EM algorithm
      - both
      - both
+   * - MLP
+     - Imputer based Multi-Layers Perceptron Model
+     - both
+     - both
+   * - Autoencoder
+     - Imputer based Autoencoder Model with Variationel method
+     - both
+     - both
    * - TabDDPM
      - Imputer based on Denoising Diffusion Probabilistic Models
      - both
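
The two rows added to the README table map to ImputerRegressorPyTorch (MLP) and ImputerAutoencoder. A minimal sketch of constructing them, assuming the build_mlp/build_autoencoder helpers renamed in this commit; the feature count and layer widths here are illustrative:

```python
from qolmat.imputations import imputers_pytorch

n_features = 4  # illustrative; the benchmark notebook derives this from the data

# MLP imputer: build_mlp stacks Linear+ReLU layers down to a single output unit.
estimator = imputers_pytorch.build_mlp(input_dim=n_features, list_num_neurons=[256, 128, 64])
imputer_mlp = imputers_pytorch.ImputerRegressorPyTorch(
    estimator=estimator, handler_nan="column", epochs=500
)

# Variational autoencoder imputer: mirrored encoder/decoder around a latent space.
encoder, decoder = imputers_pytorch.build_autoencoder(
    input_dim=n_features, latent_dim=4, output_dim=n_features, list_num_neurons=[16, 8]
)
imputer_autoencoder = imputers_pytorch.ImputerAutoencoder(encoder, decoder, max_iterations=100, epochs=100)
```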

examples/benchmark.md

Lines changed: 43 additions & 19 deletions
@@ -8,9 +8,9 @@ jupyter:
       format_version: '1.3'
     jupytext_version: 1.14.4
   kernelspec:
-    display_name: env_qolmat_dev
+    display_name: env_qolmat
     language: python
-    name: env_qolmat_dev
+    name: env_qolmat
 ---
 
 **This notebook aims to present the Qolmat repo through an example of a multivariate time series.
@@ -172,8 +172,8 @@ dict_imputers = {
     # "locf": imputer_locf,
     # "nocb": imputer_nocb,
     # "knn": imputer_knn,
-    # "ols": imputer_regressor,
-    # "mice_ols": imputer_mice,
+    "ols": imputer_regressor,
+    "mice_ols": imputer_mice,
 }
 n_imputers = len(dict_imputers)
 ```
@@ -295,13 +295,14 @@ plt.show()
 
 ```
 
-## (Optional) Neuronal Network Model
+## (Optional) Deep Learning Model
 
 
 In this section, we present an MLP model of data imputation using Keras, which can be installed using a "pip install tensorflow".
 
 ```python
 from qolmat.imputations import imputers_pytorch
+from qolmat.imputations.diffusions.ddpms import TabDDPM
 try:
     import torch.nn as nn
 except ModuleNotFoundError:
@@ -323,33 +324,56 @@ For the example, we use a simple MLP model with 3 layers of neurons.
 Then we train the model without taking a group on the stations
 
 ```python
-estimator = nn.Sequential(
-    nn.Linear(np.sum(df_data.isna().sum()==0), 256),
-    nn.ReLU(),
-    nn.Linear(256, 128),
-    nn.ReLU(),
-    nn.Linear(128, 64),
-    nn.ReLU(),
-    nn.Linear(64, 1)
-)
-# imputers_pytorch.build_mlp_example(input_dim=np.sum(df_data.isna().sum()==0), list_num_neurons=[256,128,64])
-dict_imputers["MLP"] = imputer_mlp = imputers_pytorch.ImputerRegressorPyTorch(estimator=estimator, groups=['station'], handler_nan = "column", epochs=500)
+fig = plt.figure(figsize=(10 * n_stations, 3 * n_cols))
+for i_station, (station, df) in enumerate(df_data.groupby("station")):
+    df_station = df_data.loc[station]
+    for i_col, col in enumerate(cols_to_impute):
+        fig.add_subplot(n_cols, n_stations, i_col * n_stations + i_station + 1)
+        plt.plot(df_station[col], '.', label=station)
+        # break
+        plt.ylabel(col)
+        plt.xticks(rotation=15)
+        if i_col == 0:
+            plt.title(station)
+        if i_col != n_cols - 1:
+            plt.xticks([], [])
+plt.show()
+```
+
+```python
+# estimator = nn.Sequential(
+#     nn.Linear(np.sum(df_data.isna().sum()==0), 256),
+#     nn.ReLU(),
+#     nn.Linear(256, 128),
+#     nn.ReLU(),
+#     nn.Linear(128, 64),
+#     nn.ReLU(),
+#     nn.Linear(64, 1)
+# )
+estimator = imputers_pytorch.build_mlp(input_dim=np.sum(df_data.isna().sum()==0), list_num_neurons=[256,128,64])
+encoder, decoder = imputers_pytorch.build_autoencoder(input_dim=df_data.values.shape[1],latent_dim=4, output_dim=df_data.values.shape[1], list_num_neurons=[4*4, 2*4])
+```
+
+```python
+dict_imputers["MLP"] = imputer_mlp = imputers_pytorch.ImputerRegressorPyTorch(estimator=estimator, groups=('station',), handler_nan = "column", epochs=500)
+dict_imputers["Autoencoder"] = imputer_autoencoder = imputers_pytorch.ImputerAutoencoder(encoder, decoder, max_iterations=100, epochs=100)
+dict_imputers["Diffusion"] = imputer_diffusion = imputers_pytorch.ImputerDiffusion(model=TabDDPM(num_sampling=5), epochs=100, batch_size=100)
 ```
 
 We can re-run the imputation model benchmark as before.
 ```python tags=[]
-generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=2, groups=["station"], subset=cols_to_impute, ratio_masked=ratio_masked)
+generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=3, groups=('station',), subset=cols_to_impute, ratio_masked=ratio_masked)
 
 comparison = comparator.Comparator(
     dict_imputers,
-    cols_to_impute,
+    selected_columns = df_data.columns,
     generator_holes = generator_holes,
     metrics=["mae", "wmape", "KL_columnwise", "ks_test"],
     max_evals=10,
     dict_config_opti=dict_config_opti,
 )
 results = comparison.compare(df_data)
-results
+results.style.highlight_min(color="green", axis=1)
 ```
 ```python tags=[]
 df_plot = df_data
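
The notebook now requests three hole-generation splits instead of two. A sketch of what the generator produces on its own, assuming the split() method qolmat hole generators expose; df_data, cols_to_impute and ratio_masked are the notebook's variables:

```python
from qolmat.benchmark import missing_patterns

# Same constructor as above: holes are drawn per station from the empirical
# pattern of the existing holes, masking ratio_masked of the observed values.
generator_holes = missing_patterns.EmpiricalHoleGenerator(
    n_splits=3, groups=("station",), subset=cols_to_impute, ratio_masked=ratio_masked
)

# Assumed API: split() yields one boolean mask per split, aligned with df_data.
for df_mask in generator_holes.split(df_data):
    print(df_mask.sum())  # extra holes drilled per column in this split
```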

qolmat/benchmark/metrics.py

Lines changed: 4 additions & 4 deletions
@@ -1022,8 +1022,8 @@ def distance_anticorr(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFram
     float
         Distance correlation score
     """
-    df1 = df1[df_mask.any(axis=1)]
-    df2 = df2[df_mask.any(axis=1)]
+    df1 = df1.loc[df_mask.any(axis=1)]
+    df2 = df2.loc[df_mask.any(axis=1)]
     return (1 - dcor.distance_correlation(df1.values, df2.values)) / 2
 
 
@@ -1059,8 +1059,8 @@ def pattern_based_weighted_mean_metric(
     """
     scores = []
     weights = []
-    df1 = df1[df_mask.any(axis=1)]
-    df2 = df2[df_mask.any(axis=1)]
+    df1 = df1.loc[df_mask.any(axis=1)]
+    df2 = df2.loc[df_mask.any(axis=1)]
     df_nan = df1.notna()
     max_num_row = 0
     for tup_pattern, df_nan_pattern in df_nan.groupby(df_nan.columns.tolist()):
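
Both hunks swap df[mask] for df.loc[mask]. With a boolean Series the two select the same rows; .loc just states the row-selection intent explicitly. A self-contained illustration (not from the commit):

```python
import pandas as pd

df1 = pd.DataFrame({"x": [1.0, 2.0, 3.0]}, index=["a", "b", "c"])
df_mask = pd.DataFrame({"x": [True, False, True]}, index=df1.index)

keep = df_mask.any(axis=1)              # boolean Series over the row index
assert df1[keep].equals(df1.loc[keep])  # same rows either way; .loc is explicit
```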

qolmat/imputations/imputers.py

Lines changed: 1 addition & 1 deletion
@@ -1600,7 +1600,6 @@ def _transform_element(
         # df_imputed = df.apply(pd.DataFrame.median, result_type="broadcast", axis=0)
         df_imputed = df.copy()
         cols_with_nans = df.columns[df.isna().any()]
-
         for col in cols_with_nans:
             model = self._dict_fitting["__all__"][ngroup][col]
             if model is None:
@@ -1613,6 +1612,7 @@ def _transform_element(
                 X = X.loc[is_na]
 
             y_hat = self._predict_estimator(model, X)
+            y_hat.index = X.index
             df_imputed.loc[X.index, col] = y_hat
         return df_imputed
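
The added line y_hat.index = X.index matters because .loc assignment with a Series aligns on index labels: a prediction Series coming back with a fresh RangeIndex would silently write NaN. A self-contained illustration of that pandas behavior (the prediction values are stand-ins, not qolmat output):

```python
import numpy as np
import pandas as pd

df_imputed = pd.DataFrame({"col": [1.0, np.nan, np.nan]}, index=[10, 11, 12])
X_index = pd.Index([11, 12])    # rows to impute
y_hat = pd.Series([5.0, 6.0])   # stand-in prediction with RangeIndex(0, 2)

df_imputed.loc[X_index, "col"] = y_hat  # aligns on labels 11, 12 -> both stay NaN
print(df_imputed["col"].tolist())       # [1.0, nan, nan]

y_hat.index = X_index                   # the fix applied in this hunk
df_imputed.loc[X_index, "col"] = y_hat
print(df_imputed["col"].tolist())       # [1.0, 5.0, 6.0]
```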

qolmat/imputations/imputers_pytorch.py

Lines changed: 6 additions & 7 deletions
@@ -368,7 +368,6 @@ def _transform_element(
         )
         X = df_train_scaler.values
         mask = df.isna().values
-
         for _ in range(self.max_iterations):
             self.fit(X, X)
             Z = autoencoder.encode(X)
@@ -382,7 +381,7 @@ def _transform_element(
         return df_imputed
 
 
-def build_mlp_example(
+def build_mlp(
     input_dim: int,
     list_num_neurons: List[int],
     output_dim: int = 1,
@@ -414,7 +413,7 @@ def build_mlp_example(
 
     Examples
     --------
-    >>> model = build_mlp_example(input_dim=10, list_num_neurons=[32, 64, 128], output_dim=1)
+    >>> model = build_mlp(input_dim=10, list_num_neurons=[32, 64, 128], output_dim=1)
     >>> print(model)
     Sequential(
       (0): Linear(in_features=10, out_features=32, bias=True)
@@ -437,7 +436,7 @@ def build_mlp_example(
     return estimator
 
 
-def build_autoencoder_example(
+def build_autoencoder(
     input_dim: int,
     latent_dim: int,
     list_num_neurons: List[int],
@@ -472,7 +471,7 @@ def build_autoencoder_example(
 
     Examples
     --------
-    >>> encoder, decoder = build_autoencoder_example(
+    >>> encoder, decoder = build_autoencoder(
     ...     input_dim=10,
     ...     latent_dim=4,
     ...     list_num_neurons=[32, 64, 128],
@@ -500,13 +499,13 @@ def build_autoencoder_example(
     )
     """
 
-    encoder = build_mlp_example(
+    encoder = build_mlp(
         input_dim=input_dim,
         output_dim=latent_dim,
         list_num_neurons=np.sort(list_num_neurons)[::-1].tolist(),
         activation=activation,
     )
-    decoder = build_mlp_example(
+    decoder = build_mlp(
         input_dim=latent_dim,
         output_dim=output_dim,
         list_num_neurons=np.sort(list_num_neurons).tolist(),
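
Note how build_autoencoder derives both halves from a single width list: the encoder receives the hidden widths sorted descending, the decoder the same widths ascending, so the two MLPs mirror each other around the latent layer. A quick check of just that sorting logic:

```python
import numpy as np

list_num_neurons = [32, 64, 128]
print(np.sort(list_num_neurons)[::-1].tolist())  # encoder widths: [128, 64, 32]
print(np.sort(list_num_neurons).tolist())        # decoder widths: [32, 64, 128]
```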

setup.py

Lines changed: 15 additions & 6 deletions
@@ -13,22 +13,31 @@
 LICENSE = "new BSD"
 AUTHORS = """
 Hong-Lan Botterman,
-Julien Roussel,
-Thomas Morzadec,
-Rima Hajou,
 Firas Dakhli,
+Rima Hajou,
+Thomas Morzadec,
 Anh Khoa Ngo Ho,
 Charles-Henri Prat
+Julien Roussel,
+Guillaume Saës,
 """
 AUTHORS_EMAIL = """
 
-
-
-
 
+
+
 
 
+
+
 """
+MAINTAINER = "Julien ROUSSEL, Anh Khoa NGO HO, Charles-Henri PRAT, Guillaume SAËS"
+MAINTAINER_EMAIL = (
+
+
+
+
+)
 URL = "https://github.com/Quantmetry/qolmat"
 DOWNLOAD_URL = "https://pypi.org/project/qolmat/#files"
 PROJECT_URLS = {

tests/imputations/test_imputers_pytorch.py

Lines changed: 5 additions & 4 deletions
@@ -29,7 +29,7 @@ def test_ImputerRegressorPyTorch_fit_transform(df: pd.DataFrame) -> None:
     nn.manual_seed(42)
     if nn.cuda.is_available():
         nn.cuda.manual_seed(42)
-    estimator = imputers_pytorch.build_mlp_example(input_dim=2, list_num_neurons=[64, 32])
+    estimator = imputers_pytorch.build_mlp(input_dim=2, list_num_neurons=[64, 32])
     imputer = imputers_pytorch.ImputerRegressorPyTorch(
         estimator=estimator, handler_nan="column", epochs=10
     )
@@ -47,13 +47,14 @@ def test_ImputerRegressorPyTorch_fit_transform(df: pd.DataFrame) -> None:
 
     expected = pd.DataFrame(
         {
-            "col1": [2.031, 15, 19, 23, 33],
+            "col1": [2.031, 15, 2.132, 23, 33],
             "col2": [69, 76, 74, 80, 78],
-            "col3": [174, 166, 182, 177, 175.5],
+            "col3": [174, 166, 182, 177, 2.345],
             "col4": [9, 12, 11, 12, 8],
-            "col5": [93, 75, 75, 12, 75],
+            "col5": [93, 75, 2.132, 12, 2.345],
         }
     )
+    print(result["col5"])
     np.testing.assert_allclose(result, expected, atol=1e-3)
 