
Commit eaf50eb

Merge pull request #38 from Quantmetry/doc_readme
Readme+test
2 parents d5c42b1 + 3dab35a commit eaf50eb

8 files changed: +246 −75 lines

README.rst

Lines changed: 5 additions & 5 deletions

@@ -23,7 +23,7 @@
 .. |Commits| image:: https://img.shields.io/github/commits-since/Quantmetry/qolmat/latest/main
 .. _Commits: https://github.com/Quantmetry/qolmat/commits/main

-.. image:: https://github.com/Quantmetry/qolmat/tree/main/docs/images/logo.png
+.. image:: https://raw.githubusercontent.com/Quantmetry/qolmat/main/docs/images/logo.png
     :align: center

 Qolmat - The Tool for Data Imputation
@@ -95,7 +95,7 @@ For this demonstration, let us create artificial holes in our dataset.
     plt.savefig('readme1.png')
     plt.show()

-.. image:: https://github.com/Quantmetry/qolmat/tree/main/docs/images/readme1.png
+.. image:: https://raw.githubusercontent.com/Quantmetry/qolmat/main/docs/images/readme1.png
     :align: center

 To impute missing data, there are several methods that can be imported with ``from qolmat.imputations import imputers``.
@@ -191,7 +191,7 @@ We can observe the benchmark results.
     plt.savefig('readme3.png')
     plt.show()

-.. image:: https://github.com/Quantmetry/qolmat/tree/main/docs/images/readme2.png
+.. image:: https://raw.githubusercontent.com/Quantmetry/qolmat/main/docs/images/readme2.png
     :align: center

 Finally, we keep the best ``TSMLE`` imputor we represent.
@@ -206,7 +206,7 @@ Finally, we keep the best ``TSMLE`` imputor we represent.
     plt.plot(df_with_nan['y'],'.b')
     plt.show()

-.. image:: https://github.com/Quantmetry/qolmat/tree/main/docs/images/readme3.png
+.. image:: https://raw.githubusercontent.com/Quantmetry/qolmat/main/docs/images/readme3.png
     :align: center

@@ -231,7 +231,7 @@ Qolmat has been developed by Quantmetry.

 |Quantmetry|_

-.. |Quantmetry| image:: https://www.quantmetry.com/wp-content/uploads/2020/08/08-Logo-quant-Texte-noir.svg
+.. |Quantmetry| image:: https://raw.githubusercontent.com/Quantmetry/qolmat/main/docs/images/quantmetry.png
     :width: 150
 .. _Quantmetry: https://www.quantmetry.com/

docs/images/quantmetry.png

Binary file (34 KB)

examples/benchmark.md

Lines changed: 1 addition & 1 deletion

@@ -149,7 +149,7 @@ dict_imputers = {
     # "mode": imputer_mode,
     "interpolation": imputer_interpol,
     # "spline": imputer_spline,
-    # "shuffle": imputer_shuffle,
+    "shuffle": imputer_shuffle,
     # "residuals": imputer_residuals,
     # "OU": imputer_ou,
     # "TSOU": imputer_tsou,

qolmat/utils/data.py

Lines changed: 91 additions & 39 deletions

@@ -1,4 +1,5 @@
 import os
+import sys
 import zipfile
 from math import pi
 from typing import List, Optional

@@ -11,23 +12,44 @@


 def download_data(zipname: str, urllink: str, datapath: str = "data/") -> List[pd.DataFrame]:
-    path_zip = os.path.join(datapath)
-    if not os.path.exists(path_zip + ".zip"):
-        if not os.path.exists(datapath):
-            os.mkdir(datapath)
-        request.urlretrieve(urllink + zipname + ".zip", path_zip + ".zip")
-
-    with zipfile.ZipFile(path_zip + ".zip", "r") as zip_ref:
-        zip_ref.extractall(path_zip)
-    data_folder = os.listdir(path_zip)
-    subfolder = os.path.join(path_zip, data_folder[0])
-    data_files = os.listdir(subfolder)
-    list_df = [pd.read_csv(os.path.join(subfolder, file)) for file in data_files]
+    path_zip = os.path.join(datapath, zipname)
+    path_zip_ext = path_zip + ".zip"
+    url = os.path.join(urllink, zipname) + ".zip"
+    os.makedirs(datapath, exist_ok=True)
+    if not os.path.exists(path_zip_ext) and not os.path.exists(path_zip):
+        request.urlretrieve(url, path_zip_ext)
+    if not os.path.exists(path_zip):
+        with zipfile.ZipFile(path_zip_ext, "r") as zip_ref:
+            zip_ref.extractall(path_zip)
+    list_df = []
+    for folder, _, files in os.walk(path_zip):
+        for file in files:
+            if ".csv" in file:
+                list_df.append(pd.read_csv(os.path.join(folder, file)))
     return list_df


+def generate_artificial_ts(n_samples, periods, amp_anomalies, ratio_anomalies, amp_noise):
+    mesh = np.arange(n_samples)
+    X = np.ones(n_samples)
+    for p in periods:
+        X += np.sin(2 * pi * mesh / p)
+
+    n_anomalies = int(n_samples * ratio_anomalies)
+    anomalies = np.random.standard_exponential(size=n_anomalies)
+    anomalies *= amp_anomalies * np.random.choice([-1, 1], size=n_anomalies)
+    ind_anomalies = np.random.choice(range(n_samples), size=n_anomalies, replace=False)
+    A = np.zeros(n_samples)
+    A[ind_anomalies] = anomalies
+
+    E = amp_noise * np.random.normal(size=n_samples)
+    return X, A, E
+
+
 def get_data(
-    name_data: str = "Beijing", datapath: str = "data/", download: Optional[bool] = True
+    name_data: str = "Beijing",
+    datapath: str = "data/",
+    n_groups_max: int = sys.maxsize,
 ) -> pd.DataFrame:
     """Download or generate data

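The hunk above rewrites download_data to cache the zip, extract it once, and gather every CSV that os.walk finds under the extracted folder, and it introduces generate_artificial_ts to build the synthetic signal from three components. A minimal usage sketch of the two helpers; the import path follows this file's location (qolmat/utils/data.py) and the argument values mirror the ones get_data passes in the next hunk:

from qolmat.utils import data

# Download and cache the UCI Beijing PM2.5 archive, then load every CSV
# found anywhere under the extracted directory.
list_df = data.download_data(
    zipname="beijing+pm2+5+data",
    urllink="https://archive.ics.uci.edu/static/public/381/",
    datapath="data/",
)

# Build the three components of the artificial series: X is a sum of
# sinusoids, A holds sparse signed exponential anomalies, E is Gaussian noise.
X, A, E = data.generate_artificial_ts(
    n_samples=1000,
    periods=[100, 20],
    amp_anomalies=0.5,
    ratio_anomalies=0.05,
    amp_noise=0.1,
)
signal = X + A + E  # exactly how get_data("Artificial") composes the signal
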
@@ -45,49 +67,79 @@ def get_data(
         requested data
     """
     if name_data == "Beijing":
-        urllink = "https://archive.ics.uci.edu/ml/machine-learning-databases/00501/"
+        urllink = "https://archive.ics.uci.edu/static/public/381/"
+        zipname = "beijing+pm2+5+data"
+
+        list_df = download_data(zipname, urllink, datapath=datapath)
+        list_df = [preprocess_data_beijing(df) for df in list_df]
+        df = pd.concat(list_df)
+        return df
+    elif name_data == "Beijing_offline":
+        urllink = "https://archive.ics.uci.edu/dataset/381/beijing+pm2+5+data"
         zipname = "PRSA2017_Data_20130301-20170228"
+
         list_df = download_data(zipname, urllink, datapath=datapath)
-        list_df = [preprocess_data(df) for df in list_df]
+        list_df = [preprocess_data_beijing_offline(df) for df in list_df]
         df = pd.concat(list_df)
         return df
     elif name_data == "Artificial":
         city = "Wonderland"
         n_samples = 1000
-        p1 = 100
-        p2 = 20
-        amplitude_A = 0.5
-        freq_A = 0.05
-        amplitude_E = 0.1
+        periods = [100, 20]
+        amp_anomalies = 0.5
+        ratio_anomalies = 0.05
+        amp_noise = 0.1

-        mesh = np.arange(n_samples)
-
-        X_true = 1 + np.sin(2 * pi * mesh / p1) + np.sin(2 * pi * mesh / p2)
-
-        noise = np.random.uniform(size=n_samples)
-        A_true = (
-            amplitude_A
-            * np.where(noise < freq_A, -np.log(noise), 0)
-            * (2 * (np.random.uniform(size=n_samples) > 0.5) - 1)
+        X, A, E = generate_artificial_ts(
+            n_samples, periods, amp_anomalies, ratio_anomalies, amp_noise
         )
-
-        E_true = amplitude_E * np.random.normal(size=n_samples)
-
-        signal = X_true + E_true
-        signal[A_true != 0] = A_true[A_true != 0]
-
+        signal = X + A + E
         df = pd.DataFrame({"signal": signal, "index": range(n_samples), "station": city})
         df.set_index(["station", "index"], inplace=True)

-        df["X"] = X_true
-        df["A"] = A_true
-        df["E"] = E_true
+        df["X"] = X
+        df["A"] = A
+        df["E"] = E
+        return df
+    elif name_data == "SNCF":
+        path_file = os.path.join(datapath, "validations_idfm_std.parq")
+        df = pd.read_parquet(path_file)
+        sizes_stations = df.groupby("station")["val_in"].mean().sort_values()
+        n_groups_max = min(len(sizes_stations), n_groups_max)
+        stations = sizes_stations.index.get_level_values("station").unique()[-n_groups_max:]
+        df = df.loc[stations]
         return df
     else:
         raise ValueError(f"Data name {name_data} is unknown!")


-def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
+def preprocess_data_beijing(df: pd.DataFrame) -> pd.DataFrame:
+    """Preprocess data from the "Beijing" datset
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        dataframe with some specific column names
+
+    Returns
+    -------
+    pd.DataFrame
+        preprocessed dataframe
+    """
+    df["datetime"] = pd.to_datetime(df[["year", "month", "day", "hour"]])
+    df["station"] = "Beijing"
+    df.set_index(["station", "datetime"], inplace=True)
+    df.drop(
+        columns=["year", "month", "day", "hour", "No", "cbwd", "Iws", "Is", "Ir"], inplace=True
+    )
+    df.sort_index(inplace=True)
+    df = df.groupby(
+        ["station", df.index.get_level_values("datetime").floor("d")], group_keys=False
+    ).mean()
+    return df
+
+
+def preprocess_data_beijing_offline(df: pd.DataFrame) -> pd.DataFrame:
     """Preprocess data from the "Beijing" datset

     Parameters
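
After these changes, get_data dispatches on four dataset names: "Beijing" (downloaded from the new UCI static URL), "Beijing_offline", "Artificial", and "SNCF". A short usage sketch; the SNCF branch reads a local parquet file, so that call is left commented out:

from qolmat.utils import data

df_air = data.get_data("Beijing")     # daily-averaged PM2.5 data, indexed by (station, datetime)
df_toy = data.get_data("Artificial")  # df_toy["signal"] equals df_toy["X"] + df_toy["A"] + df_toy["E"]
# df_sncf = data.get_data("SNCF", n_groups_max=10)
# requires data/validations_idfm_std.parq and keeps only the 10 busiest stations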

qolmat/utils/utils.py

Lines changed: 28 additions & 7 deletions

@@ -37,7 +37,15 @@ def display_bar_table(data: pd.DataFrame, ylabel: Optional[str] = "", path: Opti
     plt.show()


-def progress_bar(iteration, total, prefix="", suffix="", decimals=1, length=100, fill="█"):
+def progress_bar(
+    iteration: int,
+    total: int,
+    prefix: str = "",
+    suffix: str = "",
+    decimals: int = 1,
+    length: int = 100,
+    fill: str = "█",
+):
     """Call in a loop to create terminal progress bar

     Parameters
@@ -46,27 +54,40 @@ def progress_bar(iteration, total, prefix="", suffix="", decimals=1, length=100,
         current iteration
     total : int
         total iterations
-    prefix : str, optional
+    prefix : str
         prefix string, by default ""
-    suffix : str, optional
+    suffix : str
         suffix string, by default ""
-    decimals : int, optional
+    decimals : int
         positive number of decimals in percent complete, by default 1
-    length : int, optional
+    length : int
         character length of bar, by default 100
-    fill : str, optional
+    fill : str
         bar fill character, by default "█"
     """
     percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
     filled_length = int(length * iteration // total)
     bar = fill * filled_length + "-" * (length - filled_length)
     print(f"\r{prefix} |{bar}| {percent}% {suffix}", end="\r")
-    # Print New Line on Complete
     if iteration == total:
         print()


 def acf(values: pd.Series, lag_max: int = 30) -> pd.Series:
+    """Correlation series of dataseries
+
+    Parameters
+    ----------
+    values : pd.Series
+        dataseries
+    lag_max : int, optional
+        the maximum lag, by default 30
+
+    Returns
+    -------
+    pd.Series
+        correlation series of value
+    """
     acf = pd.Series(0, index=range(lag_max))
     for lag in range(lag_max):
         acf[lag] = values.corr(values.shift(lag))