Merge branch 'dev' of https://github.com/Quantmetry/qolmat into dev

Julien Roussel · Julien Roussel · commit e59e1e082e83 · 2024-04-17T00:07:57.000+02:00
diff --git a/.coveragerc b/.coveragerc
@@ -0,0 +1,2 @@
+[run]
+omit = qolmat/_version.py
diff --git a/.gitignore b/.gitignore
@@ -59,7 +59,7 @@ examples/*.ipynb
 examples/figures/*
 examples/data/*
 examples/local
-
+data/data_local/*
 
 # VSCode
 .vscode
diff --git a/pytest.ini b/pytest.ini
@@ -0,0 +1,2 @@
+[pytest]
+addopts = --cov=qolmat
diff --git a/qolmat/benchmark/comparator.py b/qolmat/benchmark/comparator.py
@@ -52,7 +52,7 @@ def get_errors(
         df_origin: pd.DataFrame,
         df_imputed: pd.DataFrame,
         df_mask: pd.DataFrame,
-    ) -> pd.Series:
+    ) -> pd.DataFrame:
         """Functions evaluating the reconstruction's quality
 
         Parameters
@@ -64,15 +64,15 @@ def get_errors(
 
         Returns
         -------
-        dictionary
-            dictionay of results obtained via different metrics
+        pd.Series
+            Series of results obtained via different metrics, indexed by metric and variable
         """
         dict_errors = {}
         for name_metric in self.metrics:
             fun_metric = metrics.get_metric(name_metric)
             dict_errors[name_metric] = fun_metric(df_origin, df_imputed, df_mask)
-        errors = pd.concat(dict_errors.values(), keys=dict_errors.keys())
-        return errors
+        df_errors = pd.concat(dict_errors.values(), keys=dict_errors.keys())
+        return df_errors
 
     def evaluate_errors_sample(
         self,
@@ -96,8 +96,8 @@ def evaluate_errors_sample(
 
         Returns
         -------
-        pd.DataFrame
-            DataFrame with the errors for each metric (in column) and at each fold (in index)
+        pd.Series
+            Series with the errors for each metric and each variable
         """
         list_errors = []
         df_origin = df[self.selected_columns].copy()
@@ -115,8 +115,12 @@ def evaluate_errors_sample(
             )
             df_imputed = imputer_opti.fit_transform(df_corrupted)
             subset = self.generator_holes.subset
-            errors = self.get_errors(df_origin[subset], df_imputed[subset], df_mask[subset])
-            list_errors.append(errors)
+            if subset is None:
+                raise ValueError(
+                    "HoleGenerator `subset` should be overwritten in split but it is none!"
+                )
+            df_errors = self.get_errors(df_origin[subset], df_imputed[subset], df_mask[subset])
+            list_errors.append(df_errors)
         df_errors = pd.DataFrame(list_errors)
         errors_mean = df_errors.mean(axis=0)
 
@@ -136,7 +140,8 @@ def compare(
         Returns
         -------
         pd.DataFrame
-            dataframe with imputation
+            DataFrame with the metric results: imputers are in columns, and the index
+            holds metrics and variables.
         """
 
         dict_errors = {}
diff --git a/qolmat/benchmark/metrics.py b/qolmat/benchmark/metrics.py
@@ -1,5 +1,5 @@
 from functools import partial
-from typing import Callable, Dict, List, Optional
+from typing import Callable, Dict, List
 
 import numpy as np
 import pandas as pd
@@ -1030,7 +1030,9 @@ def pattern_based_weighted_mean_metric(
     return pd.Series(sum([s * w for s, w in zip(scores, weights)]), index=["All"])
 
 
-def get_metric(name: str) -> Callable:
+def get_metric(
+    name: str,
+) -> Callable[[pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.Series]:
     dict_metrics: Dict[str, Callable] = {
         "mse": mean_squared_error,
         "rmse": root_mean_squared_error,
diff --git a/qolmat/utils/data.py b/qolmat/utils/data.py
@@ -3,7 +3,7 @@
 import zipfile
 from datetime import datetime
 from math import pi
-from typing import List
+from typing import List, Tuple, Union
 from urllib import request
 
 import numpy as np
@@ -36,6 +36,24 @@ def read_csv_local(data_file_name: str, **kwargs) -> pd.DataFrame:
 def download_data_from_zip(
     zipname: str, urllink: str, datapath: str = "data/"
 ) -> List[pd.DataFrame]:
+    """
+    Downloads and extracts ZIP files from a URL, then loads DataFrames from CSV files.
+
+    Parameters
+    ----------
+    zipname : str
+        Name of the ZIP file to download, without the '.zip' extension.
+    urllink : str
+        Base URL where the ZIP file is hosted.
+    datapath : str, optional
+        Path to the directory where the ZIP will be downloaded and extracted.
+        Defaults to 'data/'.
+
+    Returns
+    -------
+    List[pd.DataFrame]
+        A list of DataFrames loaded from the CSV files within the extracted directory.
+    """
     path_zip = os.path.join(datapath, zipname)
     path_zip_ext = path_zip + ".zip"
     url = os.path.join(urllink, zipname) + ".zip"
@@ -50,6 +68,23 @@ def download_data_from_zip(
 
 
 def get_dataframes_in_folder(path: str, extension: str) -> List[pd.DataFrame]:
+    """
+    Loads all dataframes from files with a specified extension within a directory, including
+    subdirectories. Special handling for '.tsf' files which are converted and immediately returned.
+
+    Parameters
+    ----------
+    path : str
+        Path to the directory to search for files.
+    extension : str
+        File extension to filter files by, e.g., '.csv'.
+
+    Returns
+    -------
+    List[pd.DataFrame]
+        A list of pandas DataFrames loaded from the files matching the extension.
+        If a '.tsf' file is found, its converted DataFrame is returned immediately.
+    """
     list_df = []
     for folder, _, files in os.walk(path):
         for file in files:
@@ -61,7 +96,37 @@ def get_dataframes_in_folder(path: str, extension: str) -> List[pd.DataFrame]:
     return list_df
 
 
-def generate_artificial_ts(n_samples, periods, amp_anomalies, ratio_anomalies, amp_noise):
+def generate_artificial_ts(
+    n_samples: int,
+    periods: List[int],
+    amp_anomalies: float,
+    ratio_anomalies: float,
+    amp_noise: float,
+) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """
+    Generates time series data, anomalies, and noise based on given parameters.
+
+    Parameters
+    ----------
+    n_samples : int
+        Number of samples in the time series.
+    periods : List[int]
+        List of periods that are added to the time series.
+    amp_anomalies : float
+        Amplitude multiplier for anomalies.
+    ratio_anomalies : float
+        Ratio of total samples that will be anomalies.
+    amp_noise : float
+        Standard deviation of Gaussian noise.
+
+    Returns
+    -------
+    Tuple[np.ndarray, np.ndarray, np.ndarray]
+        Time series data with sine waves (X).
+        Anomaly data with specified amplitudes at random positions (A).
+        Gaussian noise added to the time series (E).
+    """
+
     mesh = np.arange(n_samples)
     X = np.ones(n_samples)
     for p in periods:
@@ -83,7 +148,8 @@ def get_data(
     datapath: str = "data/",
     n_groups_max: int = sys.maxsize,
 ) -> pd.DataFrame:
-    """Download or generate data
+    """
+    Download or generate data
 
     Parameters
     ----------
@@ -102,39 +168,16 @@ def get_data(
     if name_data == "Beijing":
         df = read_csv_local("beijing")
         df["date"] = pd.to_datetime(df["date"])
-
-        # df["date"] = pd.to_datetime(
-        #     {
-        #         "year": df["year"],
-        #         "month": df["month"],
-        #         "day": df["day"],
-        #         "hour": df["hour"],
-        #     }
-        # )
         df = df.drop(columns=["year", "month", "day", "hour", "wd"])
-        # df = df.set_index(["station", "date"])
         df = df.groupby(["station", "date"]).mean()
         return df
     elif name_data == "Superconductor":
         df = read_csv_local("conductors")
         return df
     elif name_data == "Titanic":
-        # df = read_csv_local("titanic", sep=";")
         path = "https://gist.githubusercontent.com/fyyying/4aa5b471860321d7b47fd881898162b7/raw/"
         "6907bb3a38bfbb6fccf3a8b1edfb90e39714d14f/titanic_dataset.csv"
         df = pd.read_csv(path)
-        # df = df.dropna(how="all")
-        # df = df.drop(
-        #     columns=[
-        #         "pclass",
-        #         "name",
-        #         "home.dest",
-        #         "cabin",
-        #         "ticket",
-        #         "boat",
-        #         "body",
-        #     ]
-        # )
         df = df[["Survived", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]]
         df["Age"] = pd.to_numeric(df["Age"], errors="coerce")
         df["Fare"] = pd.to_numeric(df["Fare"], errors="coerce")
@@ -276,22 +319,16 @@ def add_holes(df: pd.DataFrame, ratio_masked: float, mean_size: int) -> pd.DataF
 
     ratio_masked : float
         Targeted global proportion of nans added in the returned dataset
-
-    groups: list of strings
-        List of the column names used as groups
-
     Returns
     -------
     pd.DataFrame
         dataframe with missing values
     """
-    try:
-        groups = df.index.names.difference(["datetime", "date", "index"])
+    groups = df.index.names.difference(["datetime", "date", "index"])
+    if groups != []:
         generator = missing_patterns.GeometricHoleGenerator(
             1, ratio_masked=ratio_masked, subset=df.columns, groups=groups
         )
-    except ValueError:
-        print("No group")
     else:
         generator = missing_patterns.GeometricHoleGenerator(
             1, ratio_masked=ratio_masked, subset=df.columns
@@ -392,42 +429,27 @@ def convert_tsf_to_dataframe(
     col_types = []
     all_data = {}
     line_count = 0
-    # frequency = None
-    # forecast_horizon = None
-    # contain_missing_values = None
-    # contain_equal_length = None
     found_data_tag = False
     found_data_section = False
     started_reading_data_section = False
 
     with open(full_file_path_and_name, "r", encoding="cp1252") as file:
         for line in file:
-            # Strip white space from start/end of line
             line = line.strip()
 
             if line:
-                if line.startswith("@"):  # Read meta-data
+                if line.startswith("@"):
                     if not line.startswith("@data"):
                         line_content = line.split(" ")
                         if line.startswith("@attribute"):
-                            if len(line_content) != 3:  # Attributes have both name and type
+                            if len(line_content) != 3:
                                 raise Exception("Invalid meta-data specification.")
 
                             col_names.append(line_content[1])
                             col_types.append(line_content[2])
                         else:
-                            if len(line_content) != 2:  # Other meta-data have only values
+                            if len(line_content) != 2:
                                 raise Exception("Invalid meta-data specification.")
-
-                            # if line.startswith("@frequency"):
-                            #     frequency = line_content[1]
-                            # elif line.startswith("@horizon"):
-                            #     forecast_horizon = int(line_content[1])
-                            # elif line.startswith("@missing"):
-                            #     contain_missing_values = bool(strtobool(line_content[1]))
-                            # elif line.startswith("@equallength"):
-                            #     contain_equal_length = bool(strtobool(line_content[1]))
-
                     else:
                         if len(col_names) == 0:
                             raise Exception("Attribute section must come before data.")
diff --git a/tests/benchmark/test_comparator.py b/tests/benchmark/test_comparator.py
@@ -0,0 +1,78 @@
+import pytest
+import numpy as np
+import pandas as pd
+
+from unittest.mock import patch, MagicMock
+from qolmat.benchmark.comparator import Comparator
+
+# Hole generator stub: ``split`` returns a list holding one boolean mask over columns A and B.
+generator_holes_mock = MagicMock()
+generator_holes_mock.split.return_value = [
+    pd.DataFrame({"A": [False, False, True], "B": [True, False, False]})
+]
+
+# Module-level Comparator shared by every test below; it is built with no imputer.
+comparator = Comparator(
+    dict_models={},
+    selected_columns=["A", "B"],
+    generator_holes=generator_holes_mock,
+    metrics=["mae", "mse"],
+)
+
+# Stand-in imputer, also used as the return value of the patched ``optimize``.
+imputer_mock = MagicMock()
+# Reference result: one value per (metric, column) pair, stored in a MultiIndexed Series.
+expected_get_errors = pd.Series(
+    [1.0, 1.0, 1.0, 1.0],
+    index=pd.MultiIndex.from_tuples([("mae", "A"), ("mae", "B"), ("mse", "A"), ("mse", "B")]),
+)
+
+
+@patch("qolmat.benchmark.metrics.get_metric")
+def test_get_errors(mock_get_metric):
+    """Check that ``get_errors`` stacks the per-metric results into a single Series.
+
+    ``get_metric`` is patched so that every metric returns 1.0 for columns A and B; the
+    result is expected to be indexed by (metric, column) for the metrics "mae" and "mse".
+    """
+    df_origin = pd.DataFrame({"A": [1, np.nan, 3], "B": [np.nan, 5, 6]})
+    df_imputed = pd.DataFrame({"A": [1, 2, 4], "B": [4, 5, 7]})
+    df_mask = pd.DataFrame({"A": [False, False, True], "B": [False, False, True]})
+
+    # The fake metric ignores its inputs and always returns the same per-column Series.
+    mock_get_metric.return_value = lambda df_origin, df_imputed, df_mask: pd.Series(
+        [1.0, 1.0], index=["A", "B"]
+    )
+    errors = comparator.get_errors(df_origin, df_imputed, df_mask)
+    pd.testing.assert_series_equal(errors, expected_get_errors)
+
+
+@patch("qolmat.benchmark.hyperparameters.optimize", return_value=imputer_mock)
+@patch(
+    "qolmat.benchmark.comparator.Comparator.get_errors",
+    return_value=expected_get_errors,
+)
+def test_evaluate_errors_sample(mock_get_errors, mock_optimize):
+    """Check ``evaluate_errors_sample`` with ``optimize`` and ``get_errors`` patched.
+
+    Decorators are applied bottom-up, so the ``get_errors`` mock is the first argument.
+    """
+    errors_mean = comparator.evaluate_errors_sample(
+        imputer_mock, pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, np.nan]})
+    )
+    # Presumably a single split is evaluated, so the mean equals the patched
+    # ``get_errors`` output -- confirm against ``evaluate_errors_sample``.
+    expected_errors_mean = expected_get_errors
+    pd.testing.assert_series_equal(errors_mean, expected_errors_mean)
+    mock_optimize.assert_called_once()
+    mock_get_errors.assert_called()
+
+
+@patch(
+    "qolmat.benchmark.comparator.Comparator.evaluate_errors_sample",
+    return_value=expected_get_errors,
+)
+def test_compare(mock_evaluate_errors_sample):
+    """Check that ``compare`` evaluates each imputer and gathers the results column-wise."""
+    df_test = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+
+    imputer1 = MagicMock(name="Imputer1")
+    imputer2 = MagicMock(name="Imputer2")
+    # NOTE: this rebinds ``dict_imputers`` on the shared module-level ``comparator``.
+    comparator.dict_imputers = {"imputer1": imputer1, "imputer2": imputer2}
+
+    errors_imputer1 = pd.Series([0.1, 0.2], index=["mae", "mse"])
+    errors_imputer2 = pd.Series([0.3, 0.4], index=["mae", "mse"])
+    # ``side_effect`` takes precedence over the ``return_value`` given in the decorator:
+    # successive calls return these two Series in order.
+    mock_evaluate_errors_sample.side_effect = [errors_imputer1, errors_imputer2]
+
+    df_errors = comparator.compare(df_test)
+    assert mock_evaluate_errors_sample.call_count == 2
+
+    # Presumably the trailing ``{}`` and "mse" are the Comparator's default optimisation
+    # config and metric -- they are not set in this file; confirm in ``Comparator``.
+    mock_evaluate_errors_sample.assert_any_call(imputer1, df_test, {}, "mse")
+    mock_evaluate_errors_sample.assert_any_call(imputer2, df_test, {}, "mse")
+    expected_df_errors = pd.DataFrame(
+        {"imputer1": [0.1, 0.2], "imputer2": [0.3, 0.4]}, index=["mae", "mse"]
+    )
+    pd.testing.assert_frame_equal(df_errors, expected_df_errors)
diff --git a/tests/imputations/test_preprocessing.py b/tests/imputations/test_preprocessing.py
diff --git a/tests/utils/test_data.py b/tests/utils/test_data.py
diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py