Skip to content

Commit 7107cba

Browse files
Julien RousselJulien Roussel
authored andcommitted
init completed
1 parent a07c82f commit 7107cba

File tree

5 files changed

+39
-15
lines changed

5 files changed

+39
-15
lines changed

examples/benchmark.md

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,10 @@ This dataset only contains numerical variables.
6464

6565
```python
6666
df_data = data.get_data_corrupted("Beijing", ratio_masked=.2, mean_size=120)
67-
df_data["cat"] = [i % 3 for i in range(len(df_data))]
6867

6968
# cols_to_impute = ["TEMP", "PRES", "DEWP", "NO2", "CO", "O3", "WSPM"]
7069
# cols_to_impute = df_data.columns[df_data.isna().any()]
71-
cols_to_impute = ["TEMP", "PRES", "cat"]
70+
cols_to_impute = ["TEMP", "PRES"]
7271

7372
```
7473

@@ -113,13 +112,9 @@ All presented methods are group-wise: here each station is imputed independently
113112
Some methods require hyperparameters. The user can either specify them directly or determine them through an optimization step using the `search_params` dictionary. Its keys are the names of the imputation methods, and its values are dictionaries specifying the minimum, the maximum or the list of categories, and the type of values (Integer, Real, Category or a dictionary indexed by the variable names) to search.
114113
In practice, we rely on cross-validation to find the hyperparameter values that minimize the reconstruction error.
115114

116-
```python tags=[]
117-
hasattr(imputers.ImputerMean(), "groups")
118-
```
119-
120115
```python
121116
imputer_mean = imputers.ImputerMean(groups=["station"])
122-
imputer_median = imputers.ImputerMedian(groups=["station", "cat"])
117+
imputer_median = imputers.ImputerMedian(groups=["station"])
123118
imputer_mode = imputers.ImputerMode(groups=["station"])
124119
imputer_locf = imputers.ImputerLOCF(groups=["station"])
125120
imputer_nocb = imputers.ImputerNOCB(groups=["station"])
@@ -248,6 +243,8 @@ for col in cols_to_impute:
248243
```
249244

250245
```python
246+
# plot.plot_imputations(df_station, dfs_imputed_station)
247+
251248
n_columns = len(df_plot.columns)
252249
n_imputers = len(dict_imputers)
253250

@@ -272,7 +269,6 @@ for name_imputer in dict_imputers:
272269
ax.xaxis.set_major_locator(loc)
273270
ax.tick_params(axis='both', which='major', labelsize=17)
274271
i_plot += 1
275-
plt.xlim(0, 100)
276272
plt.savefig("figures/imputations_benchmark.png")
277273
plt.show()
278274

461 KB
Loading

qolmat/__init__.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1+
from . import benchmark, imputations, utils
12
from ._version import __version__
23

3-
from . import utils
4-
54
__all__ = ["utils", "__version__"]

qolmat/imputations/imputers.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -105,10 +105,10 @@ def impute_element(self, df: pd.DataFrame) -> pd.DataFrame:
105105
imputation_values = self.fit_transform_element(df)
106106

107107
df = df.fillna(imputation_values)
108-
# # fill na by applying imputation method without groups
109-
# if df.isna().any().any():
110-
# imputation_values = self.fit_transform_fallback(df)
111-
# df = df.fillna(imputation_values)
108+
# fill na by applying imputation method without groups
109+
if df.isna().any().any():
110+
imputation_values = self.fit_transform_fallback(df)
111+
df = df.fillna(imputation_values)
112112

113113
return df
114114

qolmat/utils/plot.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,11 @@
44

55
from __future__ import annotations
66

7-
from typing import List, Optional, Tuple, Union
7+
from typing import Dict, List, Optional, Tuple, Union
88

99
import matplotlib as mpl
1010
import matplotlib.pyplot as plt
11+
import matplotlib.ticker as plticker
1112
import numpy as np
1213
import pandas as pd
1314
import scipy
@@ -256,3 +257,31 @@ def multibar(df, ax=None, orientation="vertical", colors=None, decimals=0):
256257
# ax.bar_label(rects2, padding=3)
257258

258259
# plt.tight_layout()
260+
261+
262+
def plot_imputations(df: pd.DataFrame, dict_df_imputed: Dict[str, pd.DataFrame]):
    """Draw a grid comparing original values with each imputer's filled-in values.

    The figure has one row per entry of ``dict_df_imputed`` and one column
    per column of ``df``. In every cell the original series is plotted as
    black dots, and the imputed series is overlaid after masking every
    position where the original value is present, so only the imputed
    points are drawn for the imputer. The figure is displayed with
    ``plt.show()``; nothing is returned.

    Parameters
    ----------
    df : pd.DataFrame
        Original data; its columns define the columns of the subplot grid.
    dict_df_imputed : Dict[str, pd.DataFrame]
        Imputed dataframes keyed by imputer name; each must provide the
        columns of ``df``. The key is used as the legend label.
    """
    n_imputers = len(dict_df_imputed)
    n_columns = len(df.columns)
    fig = plt.figure(figsize=(8 * n_columns, 6 * n_imputers))

    subplot_index = 0
    for name_imputer, df_imputed in dict_df_imputed.items():
        for col in df:
            # Subplot positions are 1-based and filled row by row.
            subplot_index += 1
            ax = fig.add_subplot(n_imputers, n_columns, subplot_index)

            observed = df[col]
            plt.plot(observed, ".", color="black", label="original")

            # Hide the positions that were already observed so that only
            # the values produced by the imputer remain visible.
            imputed_only = df_imputed[col].copy()
            imputed_only[observed.notna()] = np.nan
            # NOTE(review): `tab10` is not defined in this block; presumably
            # a module-level colormap in this file - confirm.
            plt.plot(imputed_only, ".", color=tab10(0), label=name_imputer, alpha=1)

            plt.ylabel(col, fontsize=16)

            # The legend is only attached to the last subplot of each row.
            is_last_in_row = subplot_index % n_columns == 0
            if is_last_in_row:
                plt.legend(loc=[1, 0], fontsize=18)

            ax.xaxis.set_major_locator(plticker.MultipleLocator(base=2 * 365))
            ax.tick_params(axis="both", which="major", labelsize=17)

    plt.show()

0 commit comments

Comments
 (0)