Skip to content

Commit 7ef8e4e

Browse files
author
vm-aifluence-jro
committed
Beijing data set modified to fit online changes
1 parent 1d8ef01 commit 7ef8e4e

File tree

3 files changed

+164
-62
lines changed

3 files changed

+164
-62
lines changed

examples/benchmark.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ dict_imputers = {
149149
# "mode": imputer_mode,
150150
"interpolation": imputer_interpol,
151151
# "spline": imputer_spline,
152-
# "shuffle": imputer_shuffle,
152+
"shuffle": imputer_shuffle,
153153
# "residuals": imputer_residuals,
154154
# "OU": imputer_ou,
155155
# "TSOU": imputer_tsou,
@@ -186,6 +186,12 @@ Concretely, the comparator takes as input a dataframe to impute, a proportion of
186186

187187
Note these metrics compute reconstruction errors; it tells nothing about the distances between the "true" and "imputed" distributions.
188188

189+
```python
190+
df = pd.DataFrame(columns=["a", "b"])
191+
df["a"] = [1, 2]
192+
df
193+
```
194+
189195
```python
190196
generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=2, groups=["station"], ratio_masked=ratio_masked)
191197

qolmat/utils/data.py

Lines changed: 94 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import os
2+
import sys
23
import zipfile
34
from math import pi
45
from typing import List, Optional
@@ -11,23 +12,47 @@
1112

1213

1314
def download_data(zipname: str, urllink: str, datapath: str = "data/") -> List[pd.DataFrame]:
    """Download and extract a zipped archive, then load every CSV it contains.

    The archive is fetched from ``urllink`` only if neither the zip file nor
    the extracted folder already exists under ``datapath``, so repeated calls
    reuse the local copy.

    Parameters
    ----------
    zipname : str
        Name of the archive, without the ``.zip`` extension.
    urllink : str
        Base URL the archive is downloaded from.
    datapath : str
        Local folder where the archive is stored and extracted.

    Returns
    -------
    List[pd.DataFrame]
        One dataframe per ``.csv`` file found (recursively) in the
        extracted folder.
    """
    path_zip = os.path.join(datapath, zipname)
    path_zip_ext = path_zip + ".zip"
    # Build the URL with string concatenation: os.path.join is for filesystem
    # paths and would produce backslashes on Windows.
    url = urllink.rstrip("/") + "/" + zipname + ".zip"
    os.makedirs(datapath, exist_ok=True)
    if not os.path.exists(path_zip_ext) and not os.path.exists(path_zip):
        request.urlretrieve(url, path_zip_ext)
    if not os.path.exists(path_zip):
        with zipfile.ZipFile(path_zip_ext, "r") as zip_ref:
            zip_ref.extractall(path_zip)
    # Walk the extracted tree: some archives nest their CSVs in subfolders.
    list_df = []
    for folder, _, files in os.walk(path_zip):
        for file in files:
            if file.endswith(".csv"):
                list_df.append(pd.read_csv(os.path.join(folder, file)))
    return list_df
2733

2834

35+
def generate_artificial_ts(n_samples, periods, amp_anomalies, ratio_anomalies, amp_noise):
    """Build an artificial time series decomposed into signal, anomalies and noise.

    Returns
    -------
    X : np.ndarray
        Deterministic part: 1 plus one sine wave per period in ``periods``.
    A : np.ndarray
        Sparse anomaly vector: signed exponential spikes at random positions,
        zero elsewhere.
    E : np.ndarray
        Gaussian noise scaled by ``amp_noise``.
    """
    positions = np.arange(n_samples)
    X = np.ones(n_samples) + sum(
        np.sin(2 * pi * positions / period) for period in periods
    )

    # Draw the anomaly magnitudes, their signs, then their locations.
    n_anomalies = int(n_samples * ratio_anomalies)
    magnitudes = np.random.standard_exponential(size=n_anomalies)
    signs = np.random.choice([-1, 1], size=n_anomalies)
    locations = np.random.choice(range(n_samples), size=n_anomalies, replace=False)
    A = np.zeros(n_samples)
    A[locations] = amp_anomalies * magnitudes * signs

    E = amp_noise * np.random.normal(size=n_samples)
    return X, A, E
50+
51+
2952
def get_data(
30-
name_data: str = "Beijing", datapath: str = "data/", download: Optional[bool] = True
53+
name_data: str = "Beijing",
54+
datapath: str = "data/",
55+
n_groups_max: int = sys.maxsize,
3156
) -> pd.DataFrame:
3257
"""Download or generate data
3358
@@ -45,49 +70,78 @@ def get_data(
4570
requested data
4671
"""
4772
if name_data == "Beijing":
73+
urllink = "https://archive.ics.uci.edu/static/public/381/"
74+
zipname = "beijing+pm2+5+data"
75+
76+
list_df = download_data(zipname, urllink, datapath=datapath)
77+
list_df = [preprocess_data_beijing(df) for df in list_df]
78+
df = pd.concat(list_df)
79+
return df
80+
elif name_data == "Beijing_offline":
4881
urllink = "https://archive.ics.uci.edu/dataset/381/beijing+pm2+5+data"
4982
zipname = "PRSA2017_Data_20130301-20170228"
83+
5084
list_df = download_data(zipname, urllink, datapath=datapath)
51-
list_df = [preprocess_data(df) for df in list_df]
85+
list_df = [preprocess_data_beijing_offline(df) for df in list_df]
5286
df = pd.concat(list_df)
5387
return df
5488
elif name_data == "Artificial":
5589
city = "Wonderland"
5690
n_samples = 1000
57-
p1 = 100
58-
p2 = 20
59-
amplitude_A = 0.5
60-
freq_A = 0.05
61-
amplitude_E = 0.1
91+
periods = [100, 20]
92+
amp_anomalies = 0.5
93+
ratio_anomalies = 0.05
94+
amp_noise = 0.1
6295

63-
mesh = np.arange(n_samples)
64-
65-
X_true = 1 + np.sin(2 * pi * mesh / p1) + np.sin(2 * pi * mesh / p2)
66-
67-
noise = np.random.uniform(size=n_samples)
68-
A_true = (
69-
amplitude_A
70-
* np.where(noise < freq_A, -np.log(noise), 0)
71-
* (2 * (np.random.uniform(size=n_samples) > 0.5) - 1)
96+
X, A, E = generate_artificial_ts(
97+
n_samples, periods, amp_anomalies, ratio_anomalies, amp_noise
7298
)
73-
74-
E_true = amplitude_E * np.random.normal(size=n_samples)
75-
76-
signal = X_true + E_true
77-
signal[A_true != 0] = A_true[A_true != 0]
78-
99+
signal = X + A + E
79100
df = pd.DataFrame({"signal": signal, "index": range(n_samples), "station": city})
80101
df.set_index(["station", "index"], inplace=True)
81102

82-
df["X"] = X_true
83-
df["A"] = A_true
84-
df["E"] = E_true
103+
df["X"] = X
104+
df["A"] = A
105+
df["E"] = E
106+
return df
107+
elif name_data == "SNCF":
108+
path_file = os.path.join(datapath, "validations_idfm_std.parq")
109+
df = pd.read_parquet(path_file)
110+
sizes_stations = df.groupby("station")["val_in"].mean().sort_values()
111+
n_groups_max = min(len(sizes_stations), n_groups_max)
112+
stations = sizes_stations.index.get_level_values("station").unique()[-n_groups_max:]
113+
df = df.loc[stations]
85114
return df
86115
else:
87116
raise ValueError(f"Data name {name_data} is unknown!")
88117

89118

90-
def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
119+
def preprocess_data_beijing(df: pd.DataFrame) -> pd.DataFrame:
    """Preprocess data from the "Beijing" dataset.

    Builds a ("station", "datetime") MultiIndex, drops the raw date columns,
    and averages the measurements by day.

    Parameters
    ----------
    df : pd.DataFrame
        Raw dataframe with ``year``/``month``/``day``/``hour``, ``wd`` and
        ``No`` columns plus numeric measurement columns.

    Returns
    -------
    pd.DataFrame
        Daily means of the measurement columns, indexed by
        ("station", "datetime").
    """
    # Work on a copy so the caller's dataframe is not mutated in place.
    df = df.copy()
    df["datetime"] = pd.to_datetime(df[["year", "month", "day", "hour"]])
    # The online archive holds a single station, labelled "Beijing" here.
    df["station"] = "Beijing"
    df.set_index(["station", "datetime"], inplace=True)
    df.drop(columns=["year", "month", "day", "hour", "wd", "No"], inplace=True)
    df.sort_index(inplace=True)
    # Aggregate hourly measurements into daily means per station.
    df = df.groupby(
        ["station", df.index.get_level_values("datetime").floor("d")], group_keys=False
    ).mean()
    return df
142+
143+
144+
def preprocess_data_beijing_offline(df: pd.DataFrame) -> pd.DataFrame:
91145
    """Preprocess data from the "Beijing" dataset
92146
93147
Parameters
@@ -100,6 +154,8 @@ def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
100154
pd.DataFrame
101155
preprocessed dataframe
102156
"""
157+
print("preprocess_data_beijing_offline")
158+
print(df.dtypes)
103159
df["datetime"] = pd.to_datetime(df[["year", "month", "day", "hour"]])
104160
df.set_index(["station", "datetime"], inplace=True)
105161
df.drop(columns=["year", "month", "day", "hour", "wd", "No"], inplace=True)

tests/utils/test_data.py

Lines changed: 63 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,33 @@
33
import numpy as np
44
import pandas as pd
55
import pytest
6+
from pytest_mock.plugin import MockerFixture
67

78
from qolmat.utils import data
8-
from pytest_mock.plugin import MockerFixture
9+
10+
columns = ["No", "year", "month", "day", "hour", "a", "b", "wd"]
# Raw sample mimicking the online Beijing archive: no "station" column;
# the preprocessing is expected to label every row "Beijing".
df_beijing = pd.DataFrame(
    [
        [1, 2013, 3, 1, 0, 1, 2, "NW"],
        [2, 2014, 3, 1, 0, 3, np.nan, "NW"],
        [3, 2015, 3, 1, 0, np.nan, 6, "NW"],
    ],
    columns=columns,
)
# Expected index after preprocessing: one ("Beijing", date) entry per row.
index_preprocess_beijing = pd.MultiIndex.from_tuples(
    [
        ("Beijing", datetime.datetime(2013, 3, 1)),
        ("Beijing", datetime.datetime(2014, 3, 1)),
        ("Beijing", datetime.datetime(2015, 3, 1)),
    ],
    names=["station", "datetime"],
)
# Expected preprocessed frame: only the measurement columns "a" and "b" remain.
df_preprocess_beijing = pd.DataFrame(
    [[1, 2], [3, np.nan], [np.nan, 6]], columns=["a", "b"], index=index_preprocess_beijing
)
930

1031
columns = ["No", "year", "month", "day", "hour", "a", "b", "wd", "station"]
11-
df = pd.DataFrame(
32+
df_offline = pd.DataFrame(
1233
[
1334
[1, 2013, 3, 1, 0, 1, 2, "NW", "Gucheng"],
1435
[2, 2014, 3, 1, 0, 3, np.nan, "NW", "Gucheng"],
@@ -17,18 +38,19 @@
1738
columns=columns,
1839
)
1940

20-
index_preprocess = pd.MultiIndex.from_tuples(
41+
index_preprocess_offline = pd.MultiIndex.from_tuples(
2142
[
2243
("Gucheng", datetime.datetime(2013, 3, 1)),
2344
("Gucheng", datetime.datetime(2014, 3, 1)),
2445
("Gucheng", datetime.datetime(2015, 3, 1)),
2546
],
2647
names=["station", "datetime"],
2748
)
28-
df_preprocess = pd.DataFrame(
29-
[[1, 2], [3, np.nan], [np.nan, 6]], columns=["a", "b"], index=index_preprocess
49+
df_preprocess_offline = pd.DataFrame(
50+
[[1, 2], [3, np.nan], [np.nan, 6]], columns=["a", "b"], index=index_preprocess_offline
3051
)
3152

53+
3254
urllink = "https://archive.ics.uci.edu/ml/machine-learning-databases/00501/"
3355
zipname = "PRSA2017_Data_20130301-20170228"
3456

@@ -40,20 +62,34 @@
4062
# list_df_result = data.download_data(zipname, urllink)
4163

4264

43-
@pytest.mark.parametrize("name_data", ["Beijing", "Artificial", "Bug"])
44-
def test_utils_data_get_data(name_data: str, mocker: MockerFixture) -> None:
65+
@pytest.mark.parametrize(
66+
"name_data, df",
67+
[
68+
("Beijing", df_beijing),
69+
("Beijing_offline", df_offline),
70+
("Artificial", None),
71+
("Bug", None),
72+
],
73+
)
74+
def test_utils_data_get_data(name_data: str, df: pd.DataFrame, mocker: MockerFixture) -> None:
4575
mock_download = mocker.patch("qolmat.utils.data.download_data", return_value=[df])
46-
mocker.patch("qolmat.utils.data.preprocess_data", return_value=df_preprocess)
76+
mocker.patch(
77+
"qolmat.utils.data.preprocess_data_beijing_offline", return_value=df_preprocess_offline
78+
)
79+
mocker.patch("qolmat.utils.data.preprocess_data_beijing", return_value=df_preprocess_beijing)
4780
try:
4881
df_result = data.get_data(name_data=name_data)
4982
except ValueError:
50-
assert name_data not in ["Beijing", "Artificial"]
83+
assert name_data not in ["Beijing", "Beijing_offline", "Artificial"]
5184
np.testing.assert_raises(ValueError, data.get_data, name_data)
5285
return
5386

5487
if name_data == "Beijing":
5588
assert mock_download.call_count == 1
56-
pd.testing.assert_frame_equal(df_result, df_preprocess)
89+
pd.testing.assert_frame_equal(df_result, df_preprocess_beijing)
90+
elif name_data == "Beijing_offline":
91+
assert mock_download.call_count == 1
92+
pd.testing.assert_frame_equal(df_result, df_preprocess_offline)
5793
elif name_data == "Artificial":
5894
expected_columns = ["signal", "X", "A", "E"]
5995
assert isinstance(df_result, pd.DataFrame)
@@ -62,13 +98,17 @@ def test_utils_data_get_data(name_data: str, mocker: MockerFixture) -> None:
6298
assert False
6399

64100

65-
@pytest.mark.parametrize("df", [df])
66-
def test_utils_data_preprocess_data(df: pd.DataFrame) -> None:
67-
result = data.preprocess_data(df)
68-
pd.testing.assert_frame_equal(result, df_preprocess, atol=1e-3)
101+
@pytest.mark.parametrize("df", [df_offline])
def test_utils_data_preprocess_data_beijing_offline(df: pd.DataFrame) -> None:
    """Offline Beijing preprocessing must reproduce the expected frame."""
    # Leftover debug prints removed; assert_frame_equal already reports a
    # detailed diff (values and dtypes) on failure.
    result = data.preprocess_data_beijing_offline(df)
    pd.testing.assert_frame_equal(result, df_preprocess_offline, atol=1e-3)
69109

70110

71-
@pytest.mark.parametrize("df", [df_preprocess])
111+
@pytest.mark.parametrize("df", [df_preprocess_offline])
72112
def test_utils_data_add_holes(df: pd.DataFrame) -> None:
73113
df_out = data.add_holes(df, 0.0, 1)
74114
assert df_out.isna().sum().sum() == 2
@@ -78,33 +118,33 @@ def test_utils_data_add_holes(df: pd.DataFrame) -> None:
78118

79119
@pytest.mark.parametrize("name_data", ["Beijing"])
def test_utils_data_get_data_corrupted(name_data: str, mocker: MockerFixture) -> None:
    """get_data_corrupted downloads once and punches holes into the frame."""
    mock_download = mocker.patch("qolmat.utils.data.download_data", return_value=[df_beijing])
    mocker.patch("qolmat.utils.data.preprocess_data_beijing", return_value=df_preprocess_beijing)

    df_out = data.get_data_corrupted()

    expected = pd.DataFrame(
        [[1, 2], [np.nan, np.nan], [np.nan, 6]],
        columns=["a", "b"],
        index=index_preprocess_beijing,
    )
    assert mock_download.call_count == 1
    pd.testing.assert_frame_equal(expected, df_out)
89129

90130

91-
@pytest.mark.parametrize("df", [df_preprocess])
131+
@pytest.mark.parametrize("df", [df_preprocess_beijing])
92132
def test_utils_data_add_station_features(df: pd.DataFrame) -> None:
93-
columns_out = ["a", "b"] + ["station=Gucheng"]
133+
columns_out = ["a", "b"] + ["station=Beijing"]
94134
expected = pd.DataFrame(
95135
[
96136
[1, 2, 1.0],
97137
[3, np.nan, 1.0],
98138
[np.nan, 6, 1.0],
99139
],
100140
columns=columns_out,
101-
index=index_preprocess,
141+
index=index_preprocess_beijing,
102142
)
103143
result = data.add_station_features(df)
104144
pd.testing.assert_frame_equal(result, expected, atol=1e-3)
105145

106146

107-
@pytest.mark.parametrize("df", [df_preprocess])
147+
@pytest.mark.parametrize("df", [df_preprocess_beijing])
108148
def test_utils_data_add_datetime_features(df: pd.DataFrame) -> None:
109149
columns_out = ["a", "b"] + ["time_cos"]
110150
expected = pd.DataFrame(
@@ -114,7 +154,7 @@ def test_utils_data_add_datetime_features(df: pd.DataFrame) -> None:
114154
[np.nan, 6, 0.512],
115155
],
116156
columns=columns_out,
117-
index=index_preprocess,
157+
index=index_preprocess_beijing,
118158
)
119159
result = data.add_datetime_features(df)
120160
pd.testing.assert_frame_equal(result, expected, atol=1e-3)

0 commit comments

Comments
 (0)