Merge branch 'patch_multimarkov' into 'dev'

Rima Hajou · Rima Hajou · commit f6a6646ec39c · 2023-01-19T09:11:28.000Z
multimarkov patched

See merge request quantmetry/retd/qolmat!9
diff --git a/qolmat/benchmark/comparator.py b/qolmat/benchmark/comparator.py
@@ -4,7 +4,7 @@
 import pandas as pd
 
 from qolmat.benchmark import cross_validation, utils
-from qolmat.benchmark.missing_patterns import HoleGenerator
+from qolmat.benchmark.missing_patterns import _HoleGenerator
 
 
 class Comparator:
@@ -31,7 +31,7 @@ def __init__(
         self,
         dict_models: Dict,
         selected_columns: List[str],
-        generator_holes: HoleGenerator,
+        generator_holes: _HoleGenerator,
         columnwise_evaluation: Optional[bool] = True,
         search_params: Optional[Dict] = {},
         n_cv_calls: Optional[int] = 10,
diff --git a/qolmat/benchmark/missing_patterns.py b/qolmat/benchmark/missing_patterns.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import functools
 import logging
 from typing import List, Optional, Tuple
 
@@ -22,12 +23,26 @@ def compute_transition_matrix(states: pd.Series, ngroups: List = None):
     if ngroups is None:
         df_counts = compute_transition_counts_matrix(states)
     else:
-        df_counts = states.groupby(ngroups).apply(compute_transition_counts_matrix).sum()
+        list_counts = [compute_transition_counts_matrix(df) for _, df in states.groupby(ngroups)]
+        df_counts = functools.reduce(lambda a, b: a.add(b, fill_value=0), list_counts)
+
     df_transition = df_counts.div(df_counts.sum(axis=1), axis=0)
     return df_transition
 
 
-class HoleGenerator:
+def get_sizes_max(values_isna: pd.Series) -> pd.Series:  # shared by the sampler and multi-Markov generators
+    ids_hole = (values_isna.diff() != 0).cumsum()  # run id: increments each time the flag changes
+    sizes_max = (
+        values_isna.groupby(ids_hole)
+        .apply(lambda x: (~x) * np.arange(len(x)))  # assuming booleans: False runs -> 0, 1, 2, ...; True runs -> 0
+        .shift(1)  # each position takes the value computed for the previous position
+        .fillna(0)  # the first position has no previous value
+        .astype(int)
+    )
+    return sizes_max  # NOTE(review): presumably the largest hole size that may end just before each position - confirm
+
+
+class _HoleGenerator:
     """
     This class implements a method to get indices of observed and missing values.
 
@@ -59,7 +74,7 @@ def __init__(
         self.random_state = random_state
         self.groups = groups
 
-    def fit(self, X: pd.DataFrame) -> HoleGenerator:
+    def fit(self, X: pd.DataFrame) -> _HoleGenerator:
         """
         Fits the generator.
 
@@ -118,7 +133,7 @@ def _check_subset(self, X: pd.DataFrame):
                 )
 
 
-class UniformHoleGenerator(HoleGenerator):
+class UniformHoleGenerator(_HoleGenerator):
     """This class implements a way to generate holes in a dataframe.
     The holes are generated randomly, using the resample method of scikit learn.
 
@@ -176,8 +191,8 @@ def generate_mask(self, X: pd.DataFrame) -> pd.DataFrame:
         return df_mask
 
 
-class SamplerHoleGenerator(HoleGenerator):
-    """This class implements a way to generate holes in a dataframe.
+class _SamplerHoleGenerator(_HoleGenerator):
+    """This abstract class implements a generic way to generate holes in a dataframe by sampling 1D hole size distributions.
 
     Parameters
     ----------
@@ -250,14 +265,7 @@ def generate_mask(self, X: pd.DataFrame) -> pd.DataFrame:
         for column in self.subset:
             states = X[column].isna()
 
-            ids_hole = (states.diff() != 0).cumsum()
-            sizes_max = (
-                states.groupby(ids_hole)
-                .apply(lambda x: (~x) * np.arange(len(x)))
-                .shift(1)
-                .fillna(0)
-                .astype(int)
-            )
+            sizes_max = get_sizes_max(states)
             n_masked_left = n_masked_col
 
             sizes_sampled = self.generate_hole_sizes(column, n_masked_col, sort=True)
@@ -284,7 +292,7 @@ def generate_mask(self, X: pd.DataFrame) -> pd.DataFrame:
         return mask
 
 
-class GeometricHoleGenerator(SamplerHoleGenerator):
+class GeometricHoleGenerator(_SamplerHoleGenerator):
     """This class implements a way to generate holes in a dataframe.
     The holes are generated following a Markov 1D process.
 
@@ -353,7 +361,7 @@ def sample_sizes(self, column, n_masked):
         return sizes_sampled
 
 
-class EmpiricalHoleGenerator(SamplerHoleGenerator):
+class EmpiricalHoleGenerator(_SamplerHoleGenerator):
     """This class implements a way to generate holes in a dataframe.
     The distribution of holes is learned from the data.
     The distributions are learned column by column.
@@ -447,7 +455,7 @@ def sample_sizes(self, column, n_masked):
         return sizes_sampled
 
 
-class MultiMarkovHoleGenerator(HoleGenerator):
+class MultiMarkovHoleGenerator(_HoleGenerator):
     """This class implements a way to generate holes in a dataframe.
     The holes are generated according to a Markov process.
     Each line of the dataframe mask (np.nan) represents a state of the Markov chain.
@@ -500,7 +508,7 @@ def fit(self, X: pd.DataFrame) -> MultiMarkovHoleGenerator:
             The model itself
 
         """
-        self._check_subset(X)
+        super().fit(X)
 
         states = X[self.subset].isna().apply(lambda x: tuple(x), axis=1)
         self.df_transition = compute_transition_matrix(states, self.ngroups)
@@ -564,30 +572,38 @@ def generate_mask(self, X: pd.DataFrame) -> List[pd.DataFrame]:
         X_subset = X[self.subset]
         mask = pd.DataFrame(False, columns=X_subset.columns, index=X_subset.index)
 
-        mask_init = X_subset.isna().any(axis=1)
-        n_masked = X[self.subset].size * self.ratio_masked
+        values_hasna = X_subset.isna().any(axis=1)
+
+        sizes_max = get_sizes_max(values_hasna)
+        n_masked_left = int(X[self.subset].size * self.ratio_masked)
 
-        realisations = self.generate_multi_realisation(n_masked)
+        realisations = self.generate_multi_realisation(n_masked_left)
         realisations = sorted(realisations, reverse=True)
         for realisation in realisations:
             size_hole = len(realisation)
-            is_valid = (
-                ~(mask_init | mask).T.all().rolling(size_hole + 2).max().fillna(1).astype(bool)
-            )
-            if not np.any(is_valid):
-                logger.warning(f"No place to introduce sampled hole of size {size_hole}!")
-                continue
-            i_hole = np.random.choice(np.where(is_valid)[0])
+            n_masked = sum([sum(row) for row in realisation])
+            size_hole = min(size_hole, sizes_max.max())
+            realisation = realisation[:size_hole]
+            i_hole = np.random.choice(np.where(size_hole <= sizes_max)[0])
+            assert (~mask.iloc[i_hole - size_hole : i_hole]).all().all()
             mask.iloc[i_hole - size_hole : i_hole] = mask.iloc[i_hole - size_hole : i_hole].where(
                 ~np.array(realisation), other=True
             )
+            n_masked_left -= n_masked
+
+            sizes_max.iloc[i_hole - size_hole : i_hole] = 0
+            sizes_max.iloc[i_hole:] = np.minimum(
+                sizes_max.iloc[i_hole:], np.arange(len(sizes_max.iloc[i_hole:]))
+            )
+            if n_masked_left <= 0:
+                break
 
         complete_mask = pd.DataFrame(False, columns=X.columns, index=X.index)
         complete_mask[self.subset] = mask[self.subset]
         return mask
 
 
-class GroupedHoleGenerator(HoleGenerator):
+class GroupedHoleGenerator(_HoleGenerator):
     """This class implements a way to generate holes in a dataframe.
     The holes are generated from groups, specified by the user.
     This class uses the GroupShuffleSplit function of sklearn.
diff --git a/qolmat/notebooks/benchmark.md b/qolmat/notebooks/benchmark.md
@@ -185,13 +185,18 @@ This allows an easy comparison of the different imputations.
 
 Note these metrics compute reconstruction errors; it tells nothing about the distances between the "true" and "imputed" distributions.
 
+```python
+missing_patterns.EmpiricalHoleGenerator(n_splits=2, groups=["station"], ratio_masked=0.1)
+```
+
 ```python
 doy = pd.Series(df_data.reset_index().datetime.dt.isocalendar().week.values, index=df_data.index)
 
 generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=2, groups=["station"], ratio_masked=0.1)
 # generator_holes = missing_patterns.GeometricHoleGenerator(n_splits=10, groups=["station"], ratio_masked=0.1)
 # generator_holes = missing_patterns.UniformHoleGenerator(n_splits=2, ratio_masked=0.4)
 # generator_holes = missing_patterns.GroupedHoleGenerator(n_splits=2, groups=["station", doy], ratio_masked=0.4)
+# generator_holes = missing_patterns.MultiMarkovHoleGenerator(n_splits=2, groups=["station"], ratio_masked=0.1)
 
 comparison = comparator.Comparator(
     dict_models,