Skip to content

Commit ed09c3a

Browse files
Julien RousselJulien Roussel
authored and committed
merged
2 parents 43e8102 + 5719c8d commit ed09c3a

File tree

7 files changed

+54
-57
lines changed

7 files changed

+54
-57
lines changed

HISTORY.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ History
99
* The Imputer classes do not possess a dictionary attribute anymore, and all list attributes have
1010
been changed into tuple attributes so that all are now immutable
1111
* All the tests from scikit-learn's check_estimator now pass for the class Imputer
12+
* Fix MLP imputer
1213

1314
0.0.14 (2023-06-14)
1415
-------------------

examples/benchmark.md

Lines changed: 27 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -76,26 +76,12 @@ The dataset `Beijing` is the Beijing Multi-Site Air-Quality Data Set. It consist
7676
This dataset only contains numerical variables.
7777

7878
```python
79-
df_data = data.get_data_corrupted("Beijing", ratio_masked=.2, mean_size=20)
80-
81-
# cols_to_impute = ["TEMP", "PRES", "DEWP", "NO2", "CO", "O3", "WSPM"]
82-
# cols_to_impute = df_data.columns[df_data.isna().any()]
79+
df_data = data.get_data_corrupted("Beijing", ratio_masked=.2, mean_size=120)
8380
cols_to_impute = ["TEMP", "PRES"]
84-
8581
```
8682

8783
The dataset `Artificial` is designed to have a sum of a periodical signal, a white noise and some outliers.
8884

89-
```python
90-
# df_data = data.get_data_corrupted("Artificial", ratio_masked=.2, mean_size=10)
91-
# cols_to_impute = ["signal"]
92-
```
93-
94-
```python
95-
# df_data = data.get_data("SNCF", n_groups_max=2)
96-
# cols_to_impute = ["val_in"]
97-
```
98-
9985
```python
10086
df_data
10187
```
@@ -244,7 +230,7 @@ comparison = comparator.Comparator(
244230
dict_imputers,
245231
cols_to_impute,
246232
generator_holes = generator_holes,
247-
metrics=["mae", "wmape", "KL_columnwise", "ks_test", "energy"],
233+
metrics=["mae", "wmape", "KL_columnwise", "ks_test"],
248234
max_evals=10,
249235
dict_config_opti=dict_config_opti,
250236
)
@@ -253,8 +239,14 @@ results
253239
```
254240

255241
```python
256-
df_plot = results.loc["energy", "All"]
242+
df_plot = results.loc["KL_columnwise",'TEMP']
257243
plt.barh(df_plot.index, df_plot, color=tab10(0))
244+
plt.title('TEMP')
245+
plt.show()
246+
247+
df_plot = results.loc["KL_columnwise",'PRES']
248+
plt.barh(df_plot.index, df_plot, color=tab10(0))
249+
plt.title('PRES')
258250
plt.show()
259251
```
260252

@@ -314,15 +306,19 @@ for col in cols_to_impute:
314306

315307
```
316308

309+
```python
310+
dfs_imputed
311+
```
312+
317313
```python
318314
# plot.plot_imputations(df_station, dfs_imputed_station)
319315

320-
n_columns = len(df_plot.columns)
316+
n_columns = len(cols_to_impute)
321317
n_imputers = len(dict_imputers)
322318

323319
fig = plt.figure(figsize=(12 * n_imputers, 4 * n_columns))
324320
i_plot = 1
325-
for i_col, col in enumerate(df_plot):
321+
for i_col, col in enumerate(cols_to_impute):
326322
for name_imputer, df_imp in dfs_imputed_station.items():
327323

328324
fig.add_subplot(n_columns, n_imputers, i_plot)
@@ -367,7 +363,6 @@ df = data.get_data("Beijing")
367363
cols_to_impute = ["TEMP", "PRES"]
368364
cols_with_nans = list(df.columns[df.isna().any()])
369365
df_data = data.add_datetime_features(df)
370-
df_data = data.add_station_features(df_data)
371366
df_data[cols_with_nans + cols_to_impute] = data.add_holes(pd.DataFrame(df_data[cols_with_nans + cols_to_impute]), ratio_masked=.1, mean_size=120)
372367
df_data
373368
```
@@ -377,46 +372,38 @@ Then we train the model without taking a group on the stations
377372

378373
```python
379374
estimator = tf.keras.models.Sequential([
380-
tf.keras.layers.Dense(256, activation='sigmoid'),
381-
tf.keras.layers.Dense(128, activation='sigmoid'),
382-
tf.keras.layers.Dense(64, activation='sigmoid'),
375+
tf.keras.layers.Dense(256, activation='relu'),
376+
tf.keras.layers.Dense(128, activation='relu'),
377+
tf.keras.layers.Dense(64, activation='relu'),
383378
tf.keras.layers.Dense(1)])
384-
estimator.compile(optimizer='adam', loss='mse')
385-
dict_imputers["MLP"] = imputer_mlp = imputers_keras.ImputerRegressorKeras(estimator=estimator, handler_nan = "column")
379+
estimator.compile(optimizer='adam', loss='mae')
380+
dict_imputers["MLP"] = imputer_mlp = imputers_keras.ImputerRegressorKeras(estimator=estimator, groups=['station'], handler_nan = "column")
386381
```
387382

388383
We can re-run the imputation model benchmark as before.
389-
390-
```python
391-
generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=2, subset = cols_to_impute, ratio_masked=ratio_masked)
384+
```python jupyter={"outputs_hidden": true} tags=[]
385+
generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=2, groups=["station"], subset=cols_to_impute, ratio_masked=ratio_masked)
392386

393387
comparison = comparator.Comparator(
394388
dict_imputers,
395-
df_data.columns,
389+
cols_to_impute,
396390
generator_holes = generator_holes,
397-
n_calls_opt=10,
391+
metrics=["mae", "wmape", "KL_columnwise", "ks_test"],
392+
max_evals=10,
398393
dict_config_opti=dict_config_opti,
399394
)
400395
results = comparison.compare(df_data)
401396
results
402397
```
403-
404-
```python
405-
fig = plt.figure(figsize=(24, 4))
406-
plot.multibar(results.loc["mae"], decimals=1)
407-
plt.ylabel("mae")
408-
plt.show()
409-
```
410-
411-
```python
398+
```python jupyter={"outputs_hidden": true, "source_hidden": true} tags=[]
412399
df_plot = df_data
413400
dfs_imputed = {name: imp.fit_transform(df_plot) for name, imp in dict_imputers.items()}
414401
station = df_plot.index.get_level_values("station")[0]
415402
df_station = df_plot.loc[station]
416403
dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()}
417404
```
418405

419-
```python
406+
```python jupyter={"source_hidden": true} tags=[]
420407
for col in cols_to_impute:
421408
fig, ax = plt.subplots(figsize=(10, 3))
422409
values_orig = df_station[col]

qolmat/imputations/em_sampler.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -179,10 +179,6 @@ def fit(self, X: NDArray):
179179

180180
# first imputation
181181
X_sample_last = utils.linear_interpolation(X)
182-
print("X_sample_last")
183-
print(X_sample_last)
184-
print("x")
185-
print(X)
186182
self.fit_distribution(X_sample_last)
187183

188184
for iter_em in range(self.max_iter_em):
@@ -309,8 +305,6 @@ def fit_distribution(self, X):
309305
self.cov = np.eye(n_rows)
310306
else:
311307
self.cov = np.cov(X).reshape(n_rows, -1)
312-
print("cov")
313-
print(self.cov)
314308
self.cov_inv = np.linalg.pinv(self.cov, rcond=1e-2)
315309

316310
def get_loglikelihood(self, X: NDArray) -> float:

qolmat/imputations/imputers.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
206206
if df_imputed.isna().any().any():
207207
raise AssertionError("Result of imputation contains NaN!")
208208

209+
df_imputed = df_imputed.astype(float)
209210
if isinstance(X, (np.ndarray)):
210211
df_imputed = df_imputed.to_numpy()
211212

@@ -1528,16 +1529,22 @@ def _transform_element(self, df: pd.DataFrame, col: str = "__all__") -> pd.DataF
15281529
if col not in self.estimators_:
15291530
y_imputed = pd.Series(y.mean(), index=y.index)
15301531
else:
1531-
y_imputed = self.estimators_[col].predict(X[is_na & is_valid])
1532-
y_imputed = pd.Series(y_imputed.flatten())
1532+
X_select = X[is_na & is_valid]
1533+
y_imputed = self.estimators_[col].predict(X_select)
1534+
y_imputed = y_imputed.flatten().astype(float)
1535+
print("y_imputed")
1536+
print(y_imputed)
1537+
1538+
y_imputed = pd.Series(y_imputed, index=X_select.index)
15331539

15341540
# Adds the imputed values
1535-
df_imputed.loc[~is_na, col] = y[~is_na]
1541+
# df_imputed.loc[~is_na, col] = y[~is_na]
15361542
# if isinstance(y_imputed, pd.Series):
15371543
# y_reshaped = y_imputed
15381544
# else:
15391545
# y_reshaped = y_imputed.flatten()
1540-
df_imputed.loc[is_na & is_valid, col] = y_imputed.values[: sum(is_na & is_valid)]
1546+
# df_imputed.loc[is_na & is_valid, col] = y_imputed.values[: sum(is_na & is_valid)]
1547+
df_imputed[col] = y_imputed.where(is_valid & is_na, y)
15411548

15421549
return df_imputed
15431550

tests/benchmark/test_hyperparameters.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,4 +86,4 @@ def test_hyperparameters_optimize():
8686
imputer, df, generator, metric, dict_config_opti, max_evals=500
8787
)
8888
assert isinstance(imputer_opti, ImputerTest)
89-
np.testing.assert_almost_equal(imputer_opti.value, 4, decimal=2)
89+
np.testing.assert_almost_equal(imputer_opti.value, 4, decimal=1)

tests/benchmark/test_missing_patterns.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
"unif": mp.UniformHoleGenerator(n_splits=2, ratio_masked=0.1, random_state=42),
2525
"multi": mp.MultiMarkovHoleGenerator(n_splits=2, ratio_masked=0.1, random_state=42),
2626
"group": mp.GroupedHoleGenerator(
27-
n_splits=2, ratio_masked=0.1, random_state=42, groups=["group"]
27+
n_splits=2, ratio_masked=0.1, random_state=42, groups=("group",)
2828
),
2929
}
3030

tests/imputations/test_imputers_keras.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,14 +43,22 @@ def test_ImputerRegressorKeras_fit_transform(df: pd.DataFrame) -> None:
4343
)
4444

4545
result = imputer.fit_transform(df)
46+
# expected = pd.DataFrame(
47+
# {
48+
# "col1": [0.002, 15.0, 0.002, 23.0, 33.0],
49+
# "col2": [69.0, 76.0, 74.0, 80.0, 78.0],
50+
# "col3": [174.0, 166.0, 182.0, 177.0, 0.003714],
51+
# "col4": [9.0, 12.0, 11.0, 12.0, 8.0],
52+
# "col5": [93.0, 75.0, 0.005459, 12.0, 0.005461],
53+
# }
54+
# )
4655
expected = pd.DataFrame(
4756
{
48-
"col1": [38.362286, 15.0, 38.365032, 23.0, 33.0],
57+
"col1": [38.363, 15.0, 38.365, 23.0, 33.0],
4958
"col2": [69.0, 76.0, 74.0, 80.0, 78.0],
50-
"col3": [174.0, 166.0, 182.0, 177.0, 38.365231],
59+
"col3": [174.0, 166.0, 182.0, 177.0, 38.365],
5160
"col4": [9.0, 12.0, 11.0, 12.0, 8.0],
52-
"col5": [93.0, 75.0, 38.365032, 12.0, 38.365269],
61+
"col5": [93.0, 75.0, 38.365, 12.0, 38.365],
5362
}
5463
)
55-
56-
np.testing.assert_allclose(result["col3"], expected["col3"], atol=1e-3)
64+
pd.testing.assert_frame_equal(result, expected, atol=1e-3)

0 commit comments

Comments
 (0)