Skip to content

Commit 7ef8e4e

Browse files
author
vm-aifluence-jro
committed
Beijing data set modified to fit online changes
1 parent 1d8ef01 commit 7ef8e4e

File tree

3 files changed

+164
-62
lines changed

3 files changed

+164
-62
lines changed

examples/benchmark.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ dict_imputers = {
149149
# "mode": imputer_mode,
150150
"interpolation": imputer_interpol,
151151
# "spline": imputer_spline,
152-
# "shuffle": imputer_shuffle,
152+
"shuffle": imputer_shuffle,
153153
# "residuals": imputer_residuals,
154154
# "OU": imputer_ou,
155155
# "TSOU": imputer_tsou,
@@ -186,6 +186,12 @@ Concretely, the comparator takes as input a dataframe to impute, a proportion of
186186

187187
Note these metrics compute reconstruction errors; it tells nothing about the distances between the "true" and "imputed" distributions.
188188

189+
```python
190+
df = pd.DataFrame(columns=["a", "b"])
191+
df["a"] = [1, 2]
192+
df
193+
```
194+
189195
```python
190196
generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=2, groups=["station"], ratio_masked=ratio_masked)
191197

qolmat/utils/data.py

Lines changed: 94 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import os
2+
import sys
23
import zipfile
34
from math import pi
45
from typing import List, Optional
@@ -11,23 +12,47 @@
1112

1213

1314
def download_data(zipname: str, urllink: str, datapath: str = "data/") -> List[pd.DataFrame]:
    """Download and extract a zipped archive, then load every CSV it contains.

    The archive is fetched from ``urllink`` only if neither the zip file nor
    the extracted folder already exists under ``datapath``, so repeated calls
    reuse the local copy.

    Parameters
    ----------
    zipname : str
        Name of the archive, without the ``.zip`` extension.
    urllink : str
        Base URL the archive is downloaded from.
    datapath : str
        Local folder where the archive is stored and extracted.

    Returns
    -------
    List[pd.DataFrame]
        One dataframe per ``.csv`` file found (recursively) in the
        extracted folder.
    """
    path_zip = os.path.join(datapath, zipname)
    path_zip_ext = path_zip + ".zip"
    # Build the URL with string concatenation: os.path.join is for filesystem
    # paths and would produce backslashes on Windows.
    url = urllink.rstrip("/") + "/" + zipname + ".zip"
    os.makedirs(datapath, exist_ok=True)
    if not os.path.exists(path_zip_ext) and not os.path.exists(path_zip):
        request.urlretrieve(url, path_zip_ext)
    if not os.path.exists(path_zip):
        with zipfile.ZipFile(path_zip_ext, "r") as zip_ref:
            zip_ref.extractall(path_zip)
    # Walk the extracted tree: some archives nest their CSVs in subfolders.
    list_df = []
    for folder, _, files in os.walk(path_zip):
        for file in files:
            if file.endswith(".csv"):
                list_df.append(pd.read_csv(os.path.join(folder, file)))
    return list_df
2733

2834

35+
def generate_artificial_ts(n_samples, periods, amp_anomalies, ratio_anomalies, amp_noise):
    """Build an artificial time series decomposed into signal, anomalies and noise.

    Returns
    -------
    X : np.ndarray
        Deterministic part: 1 plus one sine wave per period in ``periods``.
    A : np.ndarray
        Sparse anomaly vector: signed exponential spikes at random positions,
        zero elsewhere.
    E : np.ndarray
        Gaussian noise scaled by ``amp_noise``.
    """
    positions = np.arange(n_samples)
    X = np.ones(n_samples) + sum(
        np.sin(2 * pi * positions / period) for period in periods
    )

    # Draw the anomaly magnitudes, their signs, then their locations.
    n_anomalies = int(n_samples * ratio_anomalies)
    magnitudes = np.random.standard_exponential(size=n_anomalies)
    signs = np.random.choice([-1, 1], size=n_anomalies)
    locations = np.random.choice(range(n_samples), size=n_anomalies, replace=False)
    A = np.zeros(n_samples)
    A[locations] = amp_anomalies * magnitudes * signs

    E = amp_noise * np.random.normal(size=n_samples)
    return X, A, E
50+
51+
2952
def get_data(
30-
name_data: str = "Beijing", datapath: str = "data/", download: Optional[bool] = True
53+
name_data: str = "Beijing",
54+
datapath: str = "data/",
55+
n_groups_max: int = sys.maxsize,
3156
) -> pd.DataFrame:
3257
"""Download or generate data
3358
@@ -45,49 +70,78 @@ def get_data(
4570
requested data
4671
"""
4772
if name_data == "Beijing":
73+
urllink = "https://archive.ics.uci.edu/static/public/381/"
74+
zipname = "beijing+pm2+5+data"
75+
76+
list_df = download_data(zipname, urllink, datapath=datapath)
77+
list_df = [preprocess_data_beijing(df) for df in list_df]
78+
df = pd.concat(list_df)
79+
return df
80+
elif name_data == "Beijing_offline":
4881
urllink = "https://archive.ics.uci.edu/dataset/381/beijing+pm2+5+data"
4982
zipname = "PRSA2017_Data_20130301-20170228"
83+
5084
list_df = download_data(zipname, urllink, datapath=datapath)
51-
list_df = [preprocess_data(df) for df in list_df]
85+
list_df = [preprocess_data_beijing_offline(df) for df in list_df]
5286
df = pd.concat(list_df)
5387
return df
5488
elif name_data == "Artificial":
5589
city = "Wonderland"
5690
n_samples = 1000
57-
p1 = 100
58-
p2 = 20
59-
amplitude_A = 0.5
60-
freq_A = 0.05
61-
amplitude_E = 0.1
91+
periods = [100, 20]
92+
amp_anomalies = 0.5
93+
ratio_anomalies = 0.05
94+
amp_noise = 0.1
6295

63-
mesh = np.arange(n_samples)
64-
65-
X_true = 1 + np.sin(2 * pi * mesh / p1) + np.sin(2 * pi * mesh / p2)
66-
67-
noise = np.random.uniform(size=n_samples)
68-
A_true = (
69-
amplitude_A
70-
* np.where(noise < freq_A, -np.log(noise), 0)
71-
* (2 * (np.random.uniform(size=n_samples) > 0.5) - 1)
96+
X, A, E = generate_artificial_ts(
97+
n_samples, periods, amp_anomalies, ratio_anomalies, amp_noise
7298
)
73-
74-
E_true = amplitude_E * np.random.normal(size=n_samples)
75-
76-
signal = X_true + E_true
77-
signal[A_true != 0] = A_true[A_true != 0]
78-
99+
signal = X + A + E
79100
df = pd.DataFrame({"signal": signal, "index": range(n_samples), "station": city})
80101
df.set_index(["station", "index"], inplace=True)
81102

82-
df["X"] = X_true
83-
df["A"] = A_true
84-
df["E"] = E_true
103+
df["X"] = X
104+
df["A"] = A
105+
df["E"] = E
106+
return df
107+
elif name_data == "SNCF":
108+
path_file = os.path.join(datapath, "validations_idfm_std.parq")
109+
df = pd.read_parquet(path_file)
110+
sizes_stations = df.groupby("station")["val_in"].mean().sort_values()
111+
n_groups_max = min(len(sizes_stations), n_groups_max)
112+
stations = sizes_stations.index.get_level_values("station").unique()[-n_groups_max:]
113+
df = df.loc[stations]
85114
return df
86115
else:
87116
raise ValueError(f"Data name {name_data} is unknown!")
88117

89118

90-
def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
119+
def preprocess_data_beijing(df: pd.DataFrame) -> pd.DataFrame:
    """Preprocess data from the "Beijing" dataset.

    Builds a ("station", "datetime") MultiIndex, drops the raw date columns,
    and averages the measurements by day.

    Parameters
    ----------
    df : pd.DataFrame
        Raw dataframe with ``year``/``month``/``day``/``hour``, ``wd`` and
        ``No`` columns plus numeric measurement columns.

    Returns
    -------
    pd.DataFrame
        Daily means of the measurement columns, indexed by
        ("station", "datetime").
    """
    # Work on a copy so the caller's dataframe is not mutated in place.
    df = df.copy()
    df["datetime"] = pd.to_datetime(df[["year", "month", "day", "hour"]])
    # The online archive holds a single station, labelled "Beijing" here.
    df["station"] = "Beijing"
    df.set_index(["station", "datetime"], inplace=True)
    df.drop(columns=["year", "month", "day", "hour", "wd", "No"], inplace=True)
    df.sort_index(inplace=True)
    # Aggregate hourly measurements into daily means per station.
    df = df.groupby(
        ["station", df.index.get_level_values("datetime").floor("d")], group_keys=False
    ).mean()
    return df
142+
143+
144+
def preprocess_data_beijing_offline(df: pd.DataFrame) -> pd.DataFrame:
91145
    """Preprocess data from the "Beijing" dataset
92146
93147
Parameters
@@ -100,6 +154,8 @@ def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
100154
pd.DataFrame
101155
preprocessed dataframe
102156
"""
157+
print("preprocess_data_beijing_offline")
158+
print(df.dtypes)
103159
df["datetime"] = pd.to_datetime(df[["year", "month", "day", "hour"]])
104160
df.set_index(["station", "datetime"], inplace=True)
105161
df.drop(columns=["year", "month", "day", "hour", "wd", "No"], inplace=True)

tests/utils/test_data.py

Lines changed: 63 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,33 @@
33
import numpy as np
44
import pandas as pd
55
import pytest
6+
from pytest_mock.plugin import MockerFixture
67

78
from qolmat.utils import data
8-
from pytest_mock.plugin import MockerFixture
9+
10+
columns = ["No", "year", "month", "day", "hour", "a", "b", "wd"]
# Raw sample mimicking the online Beijing archive: no "station" column;
# the preprocessing is expected to label every row "Beijing".
df_beijing = pd.DataFrame(
    [
        [1, 2013, 3, 1, 0, 1, 2, "NW"],
        [2, 2014, 3, 1, 0, 3, np.nan, "NW"],
        [3, 2015, 3, 1, 0, np.nan, 6, "NW"],
    ],
    columns=columns,
)
# Expected index after preprocessing: one ("Beijing", date) entry per row.
index_preprocess_beijing = pd.MultiIndex.from_tuples(
    [
        ("Beijing", datetime.datetime(2013, 3, 1)),
        ("Beijing", datetime.datetime(2014, 3, 1)),
        ("Beijing", datetime.datetime(2015, 3, 1)),
    ],
    names=["station", "datetime"],
)
# Expected preprocessed frame: only the measurement columns "a" and "b" remain.
df_preprocess_beijing = pd.DataFrame(
    [[1, 2], [3, np.nan], [np.nan, 6]], columns=["a", "b"], index=index_preprocess_beijing
)
930

1031
columns = ["No", "year", "month", "day", "hour", "a", "b", "wd", "station"]
11-
df = pd.DataFrame(
32+
df_offline = pd.DataFrame(
1233
[
1334
[1, 2013, 3, 1, 0, 1, 2, "NW", "Gucheng"],
1435
[2, 2014, 3, 1, 0, 3, np.nan, "NW", "Gucheng"],
@@ -17,18 +38,19 @@
1738
columns=columns,
1839
)
1940

20-
index_preprocess = pd.MultiIndex.from_tuples(
41+
index_preprocess_offline = pd.MultiIndex.from_tuples(
2142
[
2243
("Gucheng", datetime.datetime(2013, 3, 1)),
2344
("Gucheng", datetime.datetime(2014, 3, 1)),
2445
("Gucheng", datetime.datetime(2015, 3, 1)),
2546
],
2647
names=["station", "datetime"],
2748
)
28-
df_preprocess = pd.DataFrame(
29-
[[1, 2], [3, np.nan], [np.nan, 6]], columns=["a", "b"], index=index_preprocess
49+
df_preprocess_offline = pd.DataFrame(
50+
[[1, 2], [3, np.nan], [np.nan, 6]], columns=["a", "b"], index=index_preprocess_offline
3051
)
3152

53+
3254
urllink = "https://archive.ics.uci.edu/ml/machine-learning-databases/00501/"
3355
zipname = "PRSA2017_Data_20130301-20170228"
3456

@@ -40,20 +62,34 @@
4062
# list_df_result = data.download_data(zipname, urllink)
4163

4264

43-
@pytest.mark.parametrize("name_data", ["Beijing", "Artificial", "Bug"])
44-
def test_utils_data_get_data(name_data: str, mocker: MockerFixture) -> None:
65+
@pytest.mark.parametrize(
66+
"name_data, df",
67+
[
68+
("Beijing", df_beijing),
69+
("Beijing_offline", df_offline),
70+
("Artificial", None),
71+
("Bug", None),
72+
],
73+
)
74+
def test_utils_data_get_data(name_data: str, df: pd.DataFrame, mocker: MockerFixture) -> None:
4575
mock_download = mocker.patch("qolmat.utils.data.download_data", return_value=[df])
46-
mocker.patch("qolmat.utils.data.preprocess_data", return_value=df_preprocess)
76+
mocker.patch(
77+
"qolmat.utils.data.preprocess_data_beijing_offline", return_value=df_preprocess_offline
78+
)
79+
mocker.patch("qolmat.utils.data.preprocess_data_beijing", return_value=df_preprocess_beijing)
4780
try:
4881
df_result = data.get_data(name_data=name_data)
4982
except ValueError:
50-
assert name_data not in ["Beijing", "Artificial"]
83+
assert name_data not in ["Beijing", "Beijing_offline", "Artificial"]
5184
np.testing.assert_raises(ValueError, data.get_data, name_data)
5285
return
5386

5487
if name_data == "Beijing":
5588
assert mock_download.call_count == 1
56-
pd.testing.assert_frame_equal(df_result, df_preprocess)
89+
pd.testing.assert_frame_equal(df_result, df_preprocess_beijing)
90+
elif name_data == "Beijing_offline":
91+
assert mock_download.call_count == 1
92+
pd.testing.assert_frame_equal(df_result, df_preprocess_offline)
5793
elif name_data == "Artificial":
5894
expected_columns = ["signal", "X", "A", "E"]
5995
assert isinstance(df_result, pd.DataFrame)
@@ -62,13 +98,17 @@ def test_utils_data_get_data(name_data: str, mocker: MockerFixture) -> None:
6298
assert False
6399

64100

65-
@pytest.mark.parametrize("df", [df])
66-
def test_utils_data_preprocess_data(df: pd.DataFrame) -> None:
67-
result = data.preprocess_data(df)
68-
pd.testing.assert_frame_equal(result, df_preprocess, atol=1e-3)
101+
@pytest.mark.parametrize("df", [df_offline])
def test_utils_data_preprocess_data_beijing_offline(df: pd.DataFrame) -> None:
    """Offline Beijing preprocessing must reproduce the expected frame."""
    # Leftover debug prints removed; assert_frame_equal already reports a
    # detailed diff (values and dtypes) on failure.
    result = data.preprocess_data_beijing_offline(df)
    pd.testing.assert_frame_equal(result, df_preprocess_offline, atol=1e-3)
69109

70110

71-
@pytest.mark.parametrize("df", [df_preprocess])
111+
@pytest.mark.parametrize("df", [df_preprocess_offline])
72112
def test_utils_data_add_holes(df: pd.DataFrame) -> None:
73113
df_out = data.add_holes(df, 0.0, 1)
74114
assert df_out.isna().sum().sum() == 2
@@ -78,33 +118,33 @@ def test_utils_data_add_holes(df: pd.DataFrame) -> None:
78118

79119
@pytest.mark.parametrize("name_data", ["Beijing"])
def test_utils_data_get_data_corrupted(name_data: str, mocker: MockerFixture) -> None:
    """get_data_corrupted downloads once and punches holes into the frame."""
    mock_download = mocker.patch("qolmat.utils.data.download_data", return_value=[df_beijing])
    mocker.patch("qolmat.utils.data.preprocess_data_beijing", return_value=df_preprocess_beijing)

    df_out = data.get_data_corrupted()

    expected = pd.DataFrame(
        [[1, 2], [np.nan, np.nan], [np.nan, 6]],
        columns=["a", "b"],
        index=index_preprocess_beijing,
    )
    assert mock_download.call_count == 1
    pd.testing.assert_frame_equal(expected, df_out)
89129

90130

91-
@pytest.mark.parametrize("df", [df_preprocess])
131+
@pytest.mark.parametrize("df", [df_preprocess_beijing])
92132
def test_utils_data_add_station_features(df: pd.DataFrame) -> None:
93-
columns_out = ["a", "b"] + ["station=Gucheng"]
133+
columns_out = ["a", "b"] + ["station=Beijing"]
94134
expected = pd.DataFrame(
95135
[
96136
[1, 2, 1.0],
97137
[3, np.nan, 1.0],
98138
[np.nan, 6, 1.0],
99139
],
100140
columns=columns_out,
101-
index=index_preprocess,
141+
index=index_preprocess_beijing,
102142
)
103143
result = data.add_station_features(df)
104144
pd.testing.assert_frame_equal(result, expected, atol=1e-3)
105145

106146

107-
@pytest.mark.parametrize("df", [df_preprocess])
147+
@pytest.mark.parametrize("df", [df_preprocess_beijing])
108148
def test_utils_data_add_datetime_features(df: pd.DataFrame) -> None:
109149
columns_out = ["a", "b"] + ["time_cos"]
110150
expected = pd.DataFrame(
@@ -114,7 +154,7 @@ def test_utils_data_add_datetime_features(df: pd.DataFrame) -> None:
114154
[np.nan, 6, 0.512],
115155
],
116156
columns=columns_out,
117-
index=index_preprocess,
157+
index=index_preprocess_beijing,
118158
)
119159
result = data.add_datetime_features(df)
120160
pd.testing.assert_frame_equal(result, expected, atol=1e-3)

0 commit comments

Comments
 (0)