11import os
2+ import sys
23import zipfile
34from math import pi
45from typing import List , Optional
1112
1213
def download_data(zipname: str, urllink: str, datapath: str = "data/") -> List[pd.DataFrame]:
    """Download a zip archive, extract it and load every CSV file it contains.

    The archive is only downloaded (and only extracted) when neither the
    ``.zip`` file nor the extracted folder is already present locally.

    Parameters
    ----------
    zipname : str
        name of the zip archive, without the ``.zip`` extension
    urllink : str
        base URL from which ``<zipname>.zip`` can be downloaded
    datapath : str
        local folder where the archive is downloaded and extracted

    Returns
    -------
    List[pd.DataFrame]
        one dataframe per CSV file found (recursively) under the extracted folder
    """
    path_zip = os.path.join(datapath, zipname)
    path_zip_ext = path_zip + ".zip"
    # Build the URL with plain string formatting: os.path.join is for
    # filesystem paths and would emit backslashes on Windows.
    url = urllink.rstrip("/") + "/" + zipname + ".zip"
    os.makedirs(datapath, exist_ok=True)
    if not os.path.exists(path_zip_ext) and not os.path.exists(path_zip):
        request.urlretrieve(url, path_zip_ext)
    if not os.path.exists(path_zip):
        with zipfile.ZipFile(path_zip_ext, "r") as zip_ref:
            zip_ref.extractall(path_zip)
    list_df = []
    for folder, _, files in os.walk(path_zip):
        for file in files:
            # endswith avoids false positives such as "data.csv.bak"
            if file.endswith(".csv"):
                list_df.append(pd.read_csv(os.path.join(folder, file)))
    return list_df
2730
2831
def generate_artificial_ts(n_samples, periods, amp_anomalies, ratio_anomalies, amp_noise):
    """Generate an artificial time series decomposed as signal + anomalies + noise.

    Returns a tuple ``(X, A, E)`` where ``X`` is one plus a sum of sinusoids
    of the given ``periods``, ``A`` is a sparse vector with
    ``int(n_samples * ratio_anomalies)`` exponentially-distributed anomalies of
    random sign scaled by ``amp_anomalies``, and ``E`` is Gaussian noise scaled
    by ``amp_noise``.
    """
    time_grid = np.arange(n_samples)
    smooth_signal = np.ones(n_samples)
    for period in periods:
        smooth_signal = smooth_signal + np.sin(2 * pi * time_grid / period)

    n_anomalies = int(n_samples * ratio_anomalies)
    magnitudes = np.random.standard_exponential(size=n_anomalies)
    magnitudes *= amp_anomalies * np.random.choice([-1, 1], size=n_anomalies)
    positions = np.random.choice(range(n_samples), size=n_anomalies, replace=False)
    sparse_anomalies = np.zeros(n_samples)
    sparse_anomalies[positions] = magnitudes

    gaussian_noise = amp_noise * np.random.normal(size=n_samples)
    return smooth_signal, sparse_anomalies, gaussian_noise
47+
48+
2949def get_data (
30- name_data : str = "Beijing" , datapath : str = "data/" , download : Optional [bool ] = True
50+ name_data : str = "Beijing" ,
51+ datapath : str = "data/" ,
52+ n_groups_max : int = sys .maxsize ,
3153) -> pd .DataFrame :
3254 """Download or generate data
3355
@@ -45,49 +67,79 @@ def get_data(
4567 requested data
4668 """
4769 if name_data == "Beijing" :
48- urllink = "https://archive.ics.uci.edu/ml/machine-learning-databases/00501/"
70+ urllink = "https://archive.ics.uci.edu/static/public/381/"
71+ zipname = "beijing+pm2+5+data"
72+
73+ list_df = download_data (zipname , urllink , datapath = datapath )
74+ list_df = [preprocess_data_beijing (df ) for df in list_df ]
75+ df = pd .concat (list_df )
76+ return df
77+ elif name_data == "Beijing_offline" :
78+ urllink = "https://archive.ics.uci.edu/dataset/381/beijing+pm2+5+data"
4979 zipname = "PRSA2017_Data_20130301-20170228"
80+
5081 list_df = download_data (zipname , urllink , datapath = datapath )
51- list_df = [preprocess_data (df ) for df in list_df ]
82+ list_df = [preprocess_data_beijing_offline (df ) for df in list_df ]
5283 df = pd .concat (list_df )
5384 return df
5485 elif name_data == "Artificial" :
5586 city = "Wonderland"
5687 n_samples = 1000
57- p1 = 100
58- p2 = 20
59- amplitude_A = 0.5
60- freq_A = 0.05
61- amplitude_E = 0.1
88+ periods = [100 , 20 ]
89+ amp_anomalies = 0.5
90+ ratio_anomalies = 0.05
91+ amp_noise = 0.1
6292
63- mesh = np .arange (n_samples )
64-
65- X_true = 1 + np .sin (2 * pi * mesh / p1 ) + np .sin (2 * pi * mesh / p2 )
66-
67- noise = np .random .uniform (size = n_samples )
68- A_true = (
69- amplitude_A
70- * np .where (noise < freq_A , - np .log (noise ), 0 )
71- * (2 * (np .random .uniform (size = n_samples ) > 0.5 ) - 1 )
93+ X , A , E = generate_artificial_ts (
94+ n_samples , periods , amp_anomalies , ratio_anomalies , amp_noise
7295 )
73-
74- E_true = amplitude_E * np .random .normal (size = n_samples )
75-
76- signal = X_true + E_true
77- signal [A_true != 0 ] = A_true [A_true != 0 ]
78-
96+ signal = X + A + E
7997 df = pd .DataFrame ({"signal" : signal , "index" : range (n_samples ), "station" : city })
8098 df .set_index (["station" , "index" ], inplace = True )
8199
82- df ["X" ] = X_true
83- df ["A" ] = A_true
84- df ["E" ] = E_true
100+ df ["X" ] = X
101+ df ["A" ] = A
102+ df ["E" ] = E
103+ return df
104+ elif name_data == "SNCF" :
105+ path_file = os .path .join (datapath , "validations_idfm_std.parq" )
106+ df = pd .read_parquet (path_file )
107+ sizes_stations = df .groupby ("station" )["val_in" ].mean ().sort_values ()
108+ n_groups_max = min (len (sizes_stations ), n_groups_max )
109+ stations = sizes_stations .index .get_level_values ("station" ).unique ()[- n_groups_max :]
110+ df = df .loc [stations ]
85111 return df
86112 else :
87113 raise ValueError (f"Data name { name_data } is unknown!" )
88114
89115
90- def preprocess_data (df : pd .DataFrame ) -> pd .DataFrame :
def preprocess_data_beijing(df: pd.DataFrame) -> pd.DataFrame:
    """Preprocess data from the "Beijing" dataset.

    Builds a (station, datetime) index, drops identifier and wind/rain
    columns, and averages the remaining measurements per station and per day.
    The input dataframe is left untouched (the previous implementation
    mutated its argument in place via ``inplace=True`` calls).

    Parameters
    ----------
    df : pd.DataFrame
        raw dataframe with at least the columns
        ["year", "month", "day", "hour", "No", "cbwd", "Iws", "Is", "Ir"]

    Returns
    -------
    pd.DataFrame
        daily-averaged dataframe indexed by ("station", "datetime")
    """
    df = df.copy()  # avoid mutating the caller's dataframe
    df["datetime"] = pd.to_datetime(df[["year", "month", "day", "hour"]])
    df["station"] = "Beijing"
    df = df.set_index(["station", "datetime"])
    df = df.drop(columns=["year", "month", "day", "hour", "No", "cbwd", "Iws", "Is", "Ir"])
    df = df.sort_index()
    # aggregate hourly measurements into daily means per station
    df = df.groupby(
        ["station", df.index.get_level_values("datetime").floor("d")], group_keys=False
    ).mean()
    return df
140+
141+
142+ def preprocess_data_beijing_offline (df : pd .DataFrame ) -> pd .DataFrame :
91143 """Preprocess data from the "Beijing" datset
92144
93145 Parameters
0 commit comments