Merge pull request #136 from scikit-learn-contrib/dev

JulienRoussel77 · web-flow · commit 5683e66f6f1b · 2024-04-17T10:45:12.000+02:00
Dev
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.1.4
+current_version = 0.1.5
 commit = True
 tag = True
 
diff --git a/.coveragerc b/.coveragerc
@@ -0,0 +1,2 @@
+[run]
+omit = qolmat/_version.py
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -11,9 +11,9 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
     - name: Set up Python
-      uses: actions/setup-python@v3
+      uses: actions/setup-python@v3.12.0
       with:
         python-version: '3.10'
     - name: Install dependencies
diff --git a/.gitignore b/.gitignore
@@ -59,7 +59,7 @@ examples/*.ipynb
 examples/figures/*
 examples/data/*
 examples/local
-
+data/data_local/*
 
 # VSCode
 .vscode
diff --git a/AUTHORS.rst b/AUTHORS.rst
@@ -5,10 +5,10 @@ Credits
 Development Team
 ----------------
 
-* Julien Roussel <jroussel@quantmetry.com>
-* Anh Khoa Ngo Ho <angoho@quantmetry.com>
-* Charles-Henri Prat <chprat@quantmetry.com>
-* Guillaume Saës <gsaes@quantmetry.com>
+* Julien Roussel <julien.a.roussel@capgemini.com>
+* Anh Khoa Ngo Ho <anh-khoa.ngo-ho@capgemini.com>
+* Guillaume Saës <guillaume.saes@capgemini.com>
+* Yasser Zidani <yasser.zidani@capgemini.com>
 
 Past Contributors
 -----------------
@@ -19,3 +19,4 @@ Past Contributors
 * Mikaïl Duran
 * Rima Hajou
 * Thomas Morzadec
+* Charles-Henri Prat
diff --git a/docs/conf.py b/docs/conf.py
@@ -27,7 +27,7 @@
 author = "Quantmetry"
 
 # The full version, including alpha/beta/rc tags
-version = "0.1.4"
+version = "0.1.5"
 release = version
 
 # -- General configuration ---------------------------------------------------
diff --git a/pytest.ini b/pytest.ini
@@ -0,0 +1,2 @@
+[pytest]
+addopts = --cov=qolmat
diff --git a/qolmat/_version.py b/qolmat/_version.py
@@ -1 +1 @@
-__version__ = "0.1.4"
+__version__ = "0.1.5"
diff --git a/qolmat/benchmark/comparator.py b/qolmat/benchmark/comparator.py
@@ -52,7 +52,7 @@ def get_errors(
         df_origin: pd.DataFrame,
         df_imputed: pd.DataFrame,
         df_mask: pd.DataFrame,
-    ) -> pd.Series:
+    ) -> pd.DataFrame:
         """Functions evaluating the reconstruction's quality
 
         Parameters
@@ -64,15 +64,15 @@ def get_errors(
 
         Returns
         -------
-        dictionary
-            dictionay of results obtained via different metrics
+        pd.DataFrame
+            DataFrame of results obtained via different metrics
         """
         dict_errors = {}
         for name_metric in self.metrics:
             fun_metric = metrics.get_metric(name_metric)
             dict_errors[name_metric] = fun_metric(df_origin, df_imputed, df_mask)
-        errors = pd.concat(dict_errors.values(), keys=dict_errors.keys())
-        return errors
+        df_errors = pd.concat(dict_errors.values(), keys=dict_errors.keys())
+        return df_errors
 
     def evaluate_errors_sample(
         self,
@@ -96,8 +96,8 @@ def evaluate_errors_sample(
 
         Returns
         -------
-        pd.DataFrame
-            DataFrame with the errors for each metric (in column) and at each fold (in index)
+        pd.Series
+            Series with the errors for each metric and each variable
         """
         list_errors = []
         df_origin = df[self.selected_columns].copy()
@@ -115,8 +115,12 @@ def evaluate_errors_sample(
             )
             df_imputed = imputer_opti.fit_transform(df_corrupted)
             subset = self.generator_holes.subset
-            errors = self.get_errors(df_origin[subset], df_imputed[subset], df_mask[subset])
-            list_errors.append(errors)
+            if subset is None:
+                raise ValueError(
+                    "HoleGenerator `subset` should be overwritten in split but it is none!"
+                )
+            df_errors = self.get_errors(df_origin[subset], df_imputed[subset], df_mask[subset])
+            list_errors.append(df_errors)
         df_errors = pd.DataFrame(list_errors)
         errors_mean = df_errors.mean(axis=0)
 
@@ -136,7 +140,8 @@ def compare(
         Returns
         -------
         pd.DataFrame
-            dataframe with imputation
+            DataFrame of metric results: imputers are in columns, and the index holds the
+            metrics and variables.
         """
 
         dict_errors = {}
diff --git a/qolmat/benchmark/metrics.py b/qolmat/benchmark/metrics.py
@@ -1,5 +1,5 @@
 from functools import partial
-from typing import Callable, Dict, List, Optional
+from typing import Callable, Dict, List
 
 import numpy as np
 import pandas as pd
@@ -1030,7 +1030,9 @@ def pattern_based_weighted_mean_metric(
     return pd.Series(sum([s * w for s, w in zip(scores, weights)]), index=["All"])
 
 
-def get_metric(name: str) -> Callable:
+def get_metric(
+    name: str,
+) -> Callable[[pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.Series]:
     dict_metrics: Dict[str, Callable] = {
         "mse": mean_squared_error,
         "rmse": root_mean_squared_error,
diff --git a/qolmat/data/titanic.csv b/qolmat/data/titanic.csv
diff --git a/qolmat/imputations/preprocessing.py b/qolmat/imputations/preprocessing.py
@@ -320,7 +320,6 @@ def make_pipeline_mixte_preprocessing(
 
     if avoid_new:
         preprocessor.steps.append(("bins", BinTransformer()))
-    print(preprocessor)
     return preprocessor
 
 
diff --git a/qolmat/utils/data.py b/qolmat/utils/data.py
@@ -3,7 +3,7 @@
 import zipfile
 from datetime import datetime
 from math import pi
-from typing import List
+from typing import List, Tuple, Union
 from urllib import request
 
 import numpy as np
@@ -36,6 +36,24 @@ def read_csv_local(data_file_name: str, **kwargs) -> pd.DataFrame:
 def download_data_from_zip(
     zipname: str, urllink: str, datapath: str = "data/"
 ) -> List[pd.DataFrame]:
+    """
+    Downloads and extracts ZIP files from a URL, then loads DataFrames from CSV files.
+
+    Parameters
+    ----------
+    zipname : str
+        Name of the ZIP file to download, without the '.zip' extension.
+    urllink : str
+        Base URL where the ZIP file is hosted.
+    datapath : str, optional
+        Path to the directory where the ZIP will be downloaded and extracted.
+        Defaults to 'data/'.
+
+    Returns
+    -------
+    List[pd.DataFrame]
+        A list of DataFrames loaded from the CSV files within the extracted directory.
+    """
     path_zip = os.path.join(datapath, zipname)
     path_zip_ext = path_zip + ".zip"
     url = os.path.join(urllink, zipname) + ".zip"
@@ -50,6 +68,23 @@ def download_data_from_zip(
 
 
 def get_dataframes_in_folder(path: str, extension: str) -> List[pd.DataFrame]:
+    """
+    Loads all dataframes from files with a specified extension within a directory, including
+    subdirectories. Special handling for '.tsf' files which are converted and immediately returned.
+
+    Parameters
+    ----------
+    path : str
+        Path to the directory to search for files.
+    extension : str
+        File extension to filter files by, e.g., '.csv'.
+
+    Returns
+    -------
+    List[pd.DataFrame]
+        A list of pandas DataFrames loaded from the files matching the extension.
+        If a '.tsf' file is found, its converted DataFrame is returned immediately.
+    """
     list_df = []
     for folder, _, files in os.walk(path):
         for file in files:
@@ -61,7 +96,37 @@ def get_dataframes_in_folder(path: str, extension: str) -> List[pd.DataFrame]:
     return list_df
 
 
-def generate_artificial_ts(n_samples, periods, amp_anomalies, ratio_anomalies, amp_noise):
+def generate_artificial_ts(
+    n_samples: int,
+    periods: List[int],
+    amp_anomalies: float,
+    ratio_anomalies: float,
+    amp_noise: float,
+) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """
+    Generates time series data, anomalies, and noise based on given parameters.
+
+    Parameters
+    ----------
+    n_samples : int
+        Number of samples in the time series.
+    periods : List[int]
+        List of periods that are added to the time series.
+    amp_anomalies : float
+        Amplitude multiplier for anomalies.
+    ratio_anomalies : float
+        Ratio of total samples that will be anomalies.
+    amp_noise : float
+        Standard deviation of Gaussian noise.
+
+    Returns
+    -------
+    Tuple[np.ndarray, np.ndarray, np.ndarray]
+        Time series data with sine waves (X).
+        Anomaly data with specified amplitudes at random positions (A).
+        Gaussian noise added to the time series (E).
+    """
+
     mesh = np.arange(n_samples)
     X = np.ones(n_samples)
     for p in periods:
@@ -83,7 +148,8 @@ def get_data(
     datapath: str = "data/",
     n_groups_max: int = sys.maxsize,
 ) -> pd.DataFrame:
-    """Download or generate data
+    """
+    Download or generate data
 
     Parameters
     ----------
@@ -102,38 +168,19 @@ def get_data(
     if name_data == "Beijing":
         df = read_csv_local("beijing")
         df["date"] = pd.to_datetime(df["date"])
-
-        # df["date"] = pd.to_datetime(
-        #     {
-        #         "year": df["year"],
-        #         "month": df["month"],
-        #         "day": df["day"],
-        #         "hour": df["hour"],
-        #     }
-        # )
         df = df.drop(columns=["year", "month", "day", "hour", "wd"])
-        # df = df.set_index(["station", "date"])
         df = df.groupby(["station", "date"]).mean()
         return df
     elif name_data == "Superconductor":
         df = read_csv_local("conductors")
         return df
     elif name_data == "Titanic":
-        df = read_csv_local("titanic", sep=";")
-        df = df.dropna(how="all")
-        df = df.drop(
-            columns=[
-                "pclass",
-                "name",
-                "home.dest",
-                "cabin",
-                "ticket",
-                "boat",
-                "body",
-            ]
-        )
-        df["age"] = pd.to_numeric(df["age"], errors="coerce")
-        df["fare"] = pd.to_numeric(df["fare"].str.replace(",", ""), errors="coerce")
+        path = ("https://gist.githubusercontent.com/fyyying/4aa5b471860321d7b47fd881898162b7/raw/"
+                "6907bb3a38bfbb6fccf3a8b1edfb90e39714d14f/titanic_dataset.csv")
+        df = pd.read_csv(path)
+        df = df[["Survived", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]]
+        df["Age"] = pd.to_numeric(df["Age"], errors="coerce")
+        df["Fare"] = pd.to_numeric(df["Fare"], errors="coerce")
         return df
     elif name_data == "Artificial":
         city = "Wonderland"
@@ -272,22 +319,16 @@ def add_holes(df: pd.DataFrame, ratio_masked: float, mean_size: int) -> pd.DataF
 
     ratio_masked : float
         Targeted global proportion of nans added in the returned dataset
-
-    groups: list of strings
-        List of the column names used as groups
-
     Returns
     -------
     pd.DataFrame
         dataframe with missing values
     """
-    try:
-        groups = df.index.names.difference(["datetime", "date", "index"])
+    groups = df.index.names.difference(["datetime", "date", "index"])
+    if groups != []:
         generator = missing_patterns.GeometricHoleGenerator(
             1, ratio_masked=ratio_masked, subset=df.columns, groups=groups
         )
-    except ValueError:
-        print("No group")
     else:
         generator = missing_patterns.GeometricHoleGenerator(
             1, ratio_masked=ratio_masked, subset=df.columns
@@ -388,42 +429,27 @@ def convert_tsf_to_dataframe(
     col_types = []
     all_data = {}
     line_count = 0
-    # frequency = None
-    # forecast_horizon = None
-    # contain_missing_values = None
-    # contain_equal_length = None
     found_data_tag = False
     found_data_section = False
     started_reading_data_section = False
 
     with open(full_file_path_and_name, "r", encoding="cp1252") as file:
         for line in file:
-            # Strip white space from start/end of line
             line = line.strip()
 
             if line:
-                if line.startswith("@"):  # Read meta-data
+                if line.startswith("@"):
                     if not line.startswith("@data"):
                         line_content = line.split(" ")
                         if line.startswith("@attribute"):
-                            if len(line_content) != 3:  # Attributes have both name and type
+                            if len(line_content) != 3:
                                 raise Exception("Invalid meta-data specification.")
 
                             col_names.append(line_content[1])
                             col_types.append(line_content[2])
                         else:
-                            if len(line_content) != 2:  # Other meta-data have only values
+                            if len(line_content) != 2:
                                 raise Exception("Invalid meta-data specification.")
-
-                            # if line.startswith("@frequency"):
-                            #     frequency = line_content[1]
-                            # elif line.startswith("@horizon"):
-                            #     forecast_horizon = int(line_content[1])
-                            # elif line.startswith("@missing"):
-                            #     contain_missing_values = bool(strtobool(line_content[1]))
-                            # elif line.startswith("@equallength"):
-                            #     contain_equal_length = bool(strtobool(line_content[1]))
-
                     else:
                         if len(col_names) == 0:
                             raise Exception("Attribute section must come before data.")
diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 from setuptools import find_packages, setup
 
 DISTNAME = "qolmat"
-VERSION = "0.1.4"
+VERSION = "0.1.5"
 DESCRIPTION = "A Python library for optimal data imputation."
 LONG_DESCRIPTION_CONTENT_TYPE = "text/x-rst"
 with codecs.open("README.rst", encoding="utf-8-sig") as f:
diff --git a/tests/benchmark/test_comparator.py b/tests/benchmark/test_comparator.py
diff --git a/tests/imputations/test_preprocessing.py b/tests/imputations/test_preprocessing.py
diff --git a/tests/utils/test_data.py b/tests/utils/test_data.py
diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.1.4"`
	`1`	`+__version__ = "0.1.5"`