Skip to content

Commit 018b675

Browse files
Julien Roussel
authored and committed
notebooks moved to root
1 parent 5dfcdb9 commit 018b675

26 files changed

+98
-1053
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ __pycache__/
77
qolmat/notebooks/figures
88
qolmat/notebooks/*.ipynb
99
qolmat/examples/*.ipynb
10+
*.ipynb
1011
*.egg-info
1112
/dist
1213
/build

examples/1_timeSeries.ipynb

Lines changed: 0 additions & 316 deletions
This file was deleted.

examples/2_image_denoising.md

Lines changed: 0 additions & 101 deletions
This file was deleted.

examples/3_background.md

Lines changed: 0 additions & 99 deletions
This file was deleted.

examples/1_timeSeries.md renamed to examples/RPCA.md

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -100,16 +100,3 @@ X, A, errors = temporal_rpca.fit_transform(signal=samples)
100100
drawing.plot_signal([samples, X, A], style="matplotlib")
101101
```
102102

103-
```python
104-
online_temp_rpca = OnlineTemporalRPCA(n_rows=25, tau=1, lam=0.3, list_periods=[20], list_etas=[0.01],
105-
burnin=0.2, online_list_etas=[.3])
106-
X, A = online_temp_rpca.fit_transform(signal=samples)
107-
drawing.plot_signal([samples, X, A], style="matplotlib")
108-
```
109-
110-
```python
111-
online_temp_rpca = OnlineTemporalRPCA(n_rows=25, tau=1, lam=0.3, list_periods=[20], list_etas=[0.01],
112-
burnin=0.2, online_list_etas=[0.3], nwin=20)
113-
X, A = online_temp_rpca.fit_transform(signal=samples)
114-
drawing.plot_signal([samples, X, A], style="matplotlib")
115-
```

qolmat/notebooks/benchmark.md renamed to examples/benchmark.md

Lines changed: 46 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ jupyter:
66
extension: .md
77
format_name: markdown
88
format_version: '1.3'
9-
jupytext_version: 1.14.1
9+
jupytext_version: 1.14.4
1010
kernelspec:
1111
display_name: env_qolmat
1212
language: python
@@ -59,31 +59,32 @@ from qolmat.utils import data, utils, plot
5959
### **I. Load data**
6060
<!-- #endregion -->
6161

62-
The data used in this example is the Beijing Multi-Site Air-Quality Data Set. It consists in hourly air pollutants data from 12 chinese nationally-controlled air-quality monitoring sites and is available at https://archive.ics.uci.edu/ml/machine-learning-databases/00501/.
62+
The dataset `Beijing` is the Beijing Multi-Site Air-Quality Data Set. It consists of hourly air pollutant data from 12 Chinese nationally controlled air-quality monitoring sites and is available at https://archive.ics.uci.edu/ml/machine-learning-databases/00501/.
6363
This dataset only contains numerical variables.
6464

65-
66-
df = pd.read_csv("/Users/hlbotterman/Downloads/m5-daily-sales.csv")
67-
df.head()
68-
df["Date"] = pd.to_datetime(df["Date"])
69-
df = df.rename(columns={"Date": "datetime"})
70-
df.set_index("datetime", inplace = True)
71-
df["Sales"] = df['Sales'].astype(float)
72-
cols_to_impute = ["Sales"]
73-
7465
```python
75-
download = True
76-
df_data = data.get_data_corrupted(download=download, ratio_masked=.2, mean_size=120 , groups=["station"])
66+
df_data = data.get_data_corrupted("Beijing", ratio_masked=.2, mean_size=120)
7767

7868
# cols_to_impute = ["TEMP", "PRES", "DEWP", "NO2", "CO", "O3", "WSPM"]
7969
# cols_to_impute = df_data.columns[df_data.isna().any()]
8070
cols_to_impute = ["TEMP", "PRES"]
8171

8272
```
8373

74+
The dataset `Artificial` is designed as the sum of a periodic signal, white noise, and some outliers.
75+
76+
```python
77+
# df_data = data.get_data_corrupted("Artificial", ratio_masked=.2, mean_size=10)
78+
# cols_to_impute = ["signal"]
79+
```
80+
8481
Let's take a look at the variables to impute. We only consider one station, Aotizhongxin.
8582
The time series display seasonality (with a period of roughly 12 months).
8683

84+
```python
85+
imputer = imputers.ImputerRPCA(groups=["station"], method="temporal", columnwise=True, period=20, max_iter=1000)
86+
df = imputer.fit_transform(df_data)
87+
```
8788

8889
### **II. Imputation methods**
8990

@@ -114,11 +115,10 @@ imputer_spline = imputers.ImputerInterpolation(groups=["station"], method="splin
114115
imputer_shuffle = imputers.ImputerShuffle(groups=["station"])
115116
imputer_residuals = imputers.ImputerResiduals(groups=["station"], period=7, model_tsa="additive", extrapolate_trend="freq", method_interpolation="linear")
116117

117-
# imputer_rpca = imputers.ImputerRPCA(groups=["station"], method="temporal", columnwise=False, n_rows=7*4, max_iter=1000, tau=1, lam=0.7)
118-
dict_tau = {"TEMP": 1, "PRES": 1.1}
119-
dict_lam = {"TEMP": 0.7, "PRES": 0.8}
120-
imputer_rpca = imputers.ImputerRPCA(groups=["station"], method="temporal", columnwise=True, period=365, max_iter=1000, tau=dict_tau, lam=dict_lam)
121-
imputer_rpca_opti = imputers.ImputerRPCA(groups=["station"], method="temporal", columnwise=True, n_rows=365, max_iter=1000)
118+
dict_tau = {"TEMP": 2, "PRES": 1.1}
119+
dict_lam = {"TEMP": 0.3, "PRES": 0.8}
120+
imputer_rpca = imputers.ImputerRPCA(groups=["station"], method="temporal", columnwise=True, period=20, max_iter=1000, tau=2, lam=.3)
121+
imputer_rpca_opti = imputers.ImputerRPCA(groups=["station"], method="temporal", columnwise=True, period=20, max_iter=1000)
122122

123123
imputer_ou = imputers.ImputeEM(groups=["station"], method="multinormal", max_iter_em=34, n_iter_ou=15, strategy="ou")
124124
imputer_tsou = imputers.ImputeEM(groups=["station"], method="VAR1", strategy="ou", max_iter_em=34, n_iter_ou=15)
@@ -146,8 +146,8 @@ dict_imputers = {
146146
"OU": imputer_ou,
147147
"TSOU": imputer_tsou,
148148
"TSMLE": imputer_tsmle,
149-
"RPCA": imputer_rpca,
150-
# "RPCA_opti": imputer_rpca_opti,
149+
"RPCA": imputer_rpca_opti,
150+
"RPCA_opti": imputer_rpca_opti,
151151
# "locf": imputer_locf,
152152
# "nocb": imputer_nocb,
153153
# "knn": imputer_knn,
@@ -156,16 +156,23 @@ dict_imputers = {
156156
n_imputers = len(dict_imputers)
157157

158158

159+
# search_params = {
160+
# "RPCA_opti": {
161+
# "lam": {
162+
# "TEMP": {"min": .1, "max": 10, "type":"Real"},
163+
# "PRES": {"min": .1, "max": 10, "type":"Real"}
164+
# },
165+
# "tau": {
166+
# "TEMP": {"min": .1, "max": 10, "type":"Real"},
167+
# "PRES": {"min": .1, "max": 10, "type":"Real"}
168+
# }
169+
# }
170+
# }
171+
159172
search_params = {
160173
"RPCA_opti": {
161-
"lam": {
162-
"TEMP": {"min": .1, "max": 10, "type":"Real"},
163-
"PRES": {"min": .1, "max": 10, "type":"Real"}
164-
},
165-
"tau": {
166-
"TEMP": {"min": .1, "max": 10, "type":"Real"},
167-
"PRES": {"min": .1, "max": 10, "type":"Real"}
168-
}
174+
"tau": {"min": .5, "max": 5, "type":"Real"},
175+
"lam": {"min": .1, "max": 1, "type":"Real"},
169176
}
170177
}
171178

@@ -209,7 +216,8 @@ results
209216

210217
```python
211218
fig = plt.figure(figsize=(24, 4))
212-
plot.multibar(results.loc["mae"])
219+
plot.multibar(results.loc["mae"], decimals=1)
220+
plt.ylabel("mae")
213221
plt.show()
214222
```
215223

@@ -219,15 +227,20 @@ plt.show()
219227
We now run each algorithm just once on the initial corrupted dataframe and compare their performance through multiple analyses.
220228

221229
```python
222-
df_plot = df_data[["TEMP", "PRES"]]
230+
df_plot = df_data[cols_to_impute]
223231
```
224232

225233
```python
226234
dfs_imputed = {name: imp.fit_transform(df_plot) for name, imp in dict_imputers.items()}
227235
```
228236

229237
```python
230-
station = "Aotizhongxin"
238+
df_station
239+
```
240+
241+
```python
242+
# station = "Aotizhongxin"
243+
station = df_plot.index.get_level_values("station")[0]
231244
df_station = df_plot.loc[station]
232245
dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()}
233246
```
@@ -286,12 +299,13 @@ for name_imputer in dict_imputers:
286299
values_imp[values_orig.notna()] = np.nan
287300
plt.plot(values_imp, ".", color=tab10(0), label=name_imputer, alpha=1)
288301
plt.ylabel(col, fontsize=16)
289-
if i_plot % 2 == 0:
302+
if i_plot % n_columns == 0:
290303
plt.legend(loc=[1, 0], fontsize=18)
291304
loc = plticker.MultipleLocator(base=2*365)
292305
ax.xaxis.set_major_locator(loc)
293306
ax.tick_params(axis='both', which='major', labelsize=17)
294307
i_plot += 1
308+
plt.xlim(0, 100)
295309
plt.savefig("figures/imputations_benchmark.png")
296310
plt.show()
297311

@@ -310,7 +324,6 @@ for i_model, model in enumerate(dict_imputers.keys()):
310324
for i in range(len(cols_to_impute)-1):
311325
plot.compare_covariances(df_station, df_imp, cols_to_impute[i], cols_to_impute[i+1], axs, color=tab10(i_model))
312326
axs.set_title(f"imputation method: {model}", fontsize=20)
313-
sns.despine()
314327
plt.show()
315328
```
316329

0 commit comments

Comments
 (0)