EM: handle the single-sample case

Julien Roussel · Julien Roussel · commit 43e8102858d4 · 2023-07-05T20:29:48.000+02:00
diff --git a/examples/benchmark.md b/examples/benchmark.md
@@ -19,6 +19,18 @@ In Qolmat, a few data imputation methods are implemented as well as a way to eva
 
 First, import some useful librairies
 
+```python
+X= np.array([[0], [1], [2]])
+```
+
+```python
+np.cov(X)
+```
+
+```python
+
+```
+
 ```python
 import warnings
 # warnings.filterwarnings('error')
diff --git a/qolmat/imputations/em_sampler.py b/qolmat/imputations/em_sampler.py
@@ -179,6 +179,10 @@ def fit(self, X: NDArray):
 
         # first imputation
         X_sample_last = utils.linear_interpolation(X)
+        print("X_sample_last")
+        print(X_sample_last)
+        print("x")
+        print(X)
         self.fit_distribution(X_sample_last)
 
         for iter_em in range(self.max_iter_em):
@@ -300,7 +304,13 @@ def __init__(
 
     def fit_distribution(self, X):
         self.means = np.mean(X, axis=1)
-        self.cov = np.cov(X).reshape(len(X), -1)
+        n_rows, n_cols = X.shape
+        if n_cols == 1:
+            self.cov = np.eye(n_rows)
+        else:
+            self.cov = np.cov(X).reshape(n_rows, -1)
+        print("cov")
+        print(self.cov)
         self.cov_inv = np.linalg.pinv(self.cov, rcond=1e-2)
 
     def get_loglikelihood(self, X: NDArray) -> float:
diff --git a/qolmat/imputations/imputers.py b/qolmat/imputations/imputers.py
@@ -1222,9 +1222,10 @@ def fit(self, X: pd.DataFrame, y=None):
             Returns self.
         """
         super().fit(X)
+        df = self._check_input(X)
         hyperparameters = self.get_hyperparams()
         self.imputer_ = KNNImputer(metric="nan_euclidean", **hyperparameters)
-        self.imputer_.fit(X)
+        self.imputer_.fit(df)
         return self
 
     def _transform_element(self, df: pd.DataFrame, col: str = "__all__") -> pd.DataFrame:
@@ -1327,10 +1328,9 @@ def fit(self, X: pd.DataFrame, y=None):
         """
         hyperparams = self.get_hyperparams()
         super().fit(X)
-        if not isinstance(X, (pd.DataFrame)):
-            X = pd.DataFrame(np.array(X), columns=[i for i in range(np.array(X).shape[1])])
+        df = self._check_input(X)
         self.imputer_ = IterativeImputer(estimator=self.estimator, **hyperparams)
-        self.imputer_.fit(X)
+        self.imputer_.fit(df)
         self.n_iter_ = self.imputer_.n_iter_
         return self
 
@@ -1444,18 +1444,17 @@ def fit(self, X: pd.DataFrame, y: pd.DataFrame = None) -> _Imputer:
         """
 
         super().fit(X)
-        if not isinstance(X, (pd.DataFrame)):
-            X = pd.DataFrame(np.array(X), columns=[i for i in range(np.array(X).shape[1])])
+        df = self._check_input(X)
 
-        cols_with_nans = X.columns[X.isna().any()]
+        cols_with_nans = df.columns[df.isna().any()]
         self.estimators_ = {}
         for col in cols_with_nans:
             # Define the Train and Test set
-            X_ = X.drop(columns=col, errors="ignore")
-            y_ = X[col]
+            X_ = df.drop(columns=col, errors="ignore")
+            y_ = df[col]
 
             # Selects only the valid values in the Train Set according to the chosen method
-            is_valid = pd.Series(True, index=X.index)
+            is_valid = pd.Series(True, index=df.index)
             if self.handler_nan == "fit":
                 pass
             elif self.handler_nan == "row":
@@ -1787,30 +1786,31 @@ def fit(self, X: pd.DataFrame, y=None):
             Returns self.
         """
         super().fit(X)
+        df = self._check_input(X)
 
-        n_rows, n_cols = X.shape
-        if n_rows == 1:
-            raise ValueError("n_samples=1 is not allowed!")
+        # n_rows, n_cols = df.shape
+        # if n_rows == 1:
+        #     raise ValueError("n_samples=1 is not allowed!")
 
         if self.model not in ["multinormal", "VAR1"]:
             raise ValueError(
                 f"Model argument `{self.model}` is invalid!"
                 " Valid values are `multinormal`and `VAR`."
             )
 
-        cols_with_nans = X.columns[X.isna().any()]
+        cols_with_nans = df.columns[df.isna().any()]
 
         self._models = {}
         if self.columnwise:
             for col in cols_with_nans:
                 hyperparams = self.get_hyperparams(col=col)
                 model = self.get_model(random_state=self.rng_, **hyperparams)
-                model.fit(X[col].values)
+                model.fit(df[col].values)
                 self._models[col] = model
         else:
             hyperparams = self.get_hyperparams()
             model = self.get_model(random_state=self.rng_, **hyperparams)
-            model.fit(X.values.T)
+            model.fit(df.values.T)
             self._models["__all__"] = model
         return self