diff --git a/.flake8 b/.flake8 deleted file mode 100644 index 33a40614..00000000 --- a/.flake8 +++ /dev/null @@ -1,9 +0,0 @@ -[flake8] -exclude = .git,__pycache__,.vscode -max-line-length=99 -ignore=E302,E305,W503,E203,E731,E402,E266,E712,F401,F821 -indent-size = 4 -per-file-ignores= - qolmat/imputations/imputers.py:F401 - */__init__.py:F401 - examples/test.py:F401 diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index b3177c72..39ba5324 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -1,13 +1,11 @@ -name: Publish Package on PYPI +name: Publish Package on PyPI on: release: types: [published] - jobs: deploy: - runs-on: ubuntu-latest steps: @@ -16,14 +14,19 @@ jobs: uses: actions/setup-python@v4 with: python-version: '3.10' + - name: Install Poetry + run: | + curl -sSL https://install.python-poetry.org | python3 - + echo "$HOME/.local/bin" >> $GITHUB_PATH - name: Install dependencies run: | - python -m pip install --upgrade pip - pip install setuptools wheel twine + poetry install - name: Build package - run: python setup.py sdist bdist_wheel + run: | + poetry build - name: Publish package - uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 - with: - user: __token__ - password: ${{ secrets.PYPI_API_TOKEN }} + env: + PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }} + run: | + poetry config pypi-token.pypi $PYPI_TOKEN + poetry publish diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 9081326e..d737fc7f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -3,14 +3,13 @@ name: Unit tests on: push: branches: - -dev - -main + - "**" pull_request: types: [opened, synchronize, reopened, ready_for_review] workflow_dispatch: jobs: - build-linux: + check: if: github.event.pull_request.draft == false runs-on: ${{matrix.os}} strategy: @@ -22,24 +21,23 @@ jobs: shell: bash -l {0} steps: - - name: Git clone + - name: Checkout uses: actions/checkout@v3 - - name: Set up venv for ci - uses: conda-incubator/setup-miniconda@v2 + - name: Python + uses: actions/setup-python@v4 with: - python-version: ${{matrix.python-version}} - environment-file: environment.ci.yml - - name: Lint with flake8 - run: | - flake8 - - name: Test with pytest - run: | - make coverage - - name: typing with mypy - run: | - mypy qolmat - echo you should uncomment mypy qolmat and delete this line - - name: Upload coverage reports to Codecov + python-version: ${{ matrix.python-version }} + - name: Poetry + uses: snok/install-poetry@v1 + with: + version: 1.8.3 + - name: Lock + run: poetry lock --no-update + - name: Install + run: poetry install + - name: Checkers + run: make checkers + - name: Codecov uses: codecov/codecov-action@v3 env: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} diff --git a/.github/workflows/test_quick.yml b/.github/workflows/test_quick.yml deleted file mode 100644 index 40f58f5a..00000000 --- a/.github/workflows/test_quick.yml +++ /dev/null @@ -1,68 +0,0 @@ -name: Unit tests fast - -on: - push: - branches-ignore: - - dev - - main - workflow_dispatch: - -jobs: - basic-testing: - runs-on: ${{matrix.os}} - strategy: - matrix: - os: [ubuntu-latest] - python-version: [3.8] - defaults: - run: - shell: bash -l {0} - - steps: - - name: Git clone - uses: actions/checkout@v3 - - # See caching environments - # https://github.com/conda-incubator/setup-miniconda#caching-environments - - name: Setup Mambaforge - uses: conda-incubator/setup-miniconda@v2 - with: - miniforge-variant: Mambaforge - miniforge-version: 
latest - activate-environment: env_qolmat_ci - use-mamba: true - - - name: Get Date - id: get-date - run: echo "today=$(/bin/date -u '+%Y%m%d')" >> $GITHUB_OUTPUT - - - name: Cache Conda env - uses: actions/cache@v2 - with: - path: ${{ env.CONDA }}/envs - key: - conda-${{ runner.os }}--${{ runner.arch }}--${{ - steps.get-date.outputs.today }}-${{ - hashFiles('environment.ci.yml') }}-${{ env.CACHE_NUMBER - }} - env: - # Increase this value to reset cache if environment.ci.yml has not changed - CACHE_NUMBER: 0 - id: cache - - - name: Update environment - run: mamba env update -n env_qolmat_ci -f environment.ci.yml - if: steps.cache.outputs.cache-hit != 'true' - - - name: Lint with flake8 - run: | - flake8 - - name: Test with pytest - run: | - make coverage - - name: Test docstrings - run: make doctest - - name: typing with mypy - run: | - mypy qolmat - echo you should uncomment mypy qolmat and delete this line diff --git a/.gitignore b/.gitignore index e385a1ee..970a7e3e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,7 @@ +*.ipynb +poetry.lock # Byte-compiled / optimized / DLL files +data/ __pycache__/ *.py[cod] *$py.class diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 62948350..68c1acf3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,20 +8,8 @@ repos: exclude: (docs/) - id: trailing-whitespace exclude: (docs/) - - repo: https://github.com/psf/black - rev: 22.8.0 + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.3.3 hooks: - - id: black - args: - - "-l 99" - # Flake8 - - repo: https://github.com/PyCQA/flake8 - rev: 4.0.1 - hooks: - - id: flake8 - - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.1.1 - hooks: - - id: mypy - args: [--ignore-missing-imports] - additional_dependencies: [types-requests] + - id: ruff + - id: ruff-format diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 84b6f589..d5f1e3bc 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -29,14 +29,11 @@ You can create a virtual environment via `conda`: .. code:: sh - $ conda env create -f environment.dev.yml - $ conda activate env_qolmat_dev - -If you need to use pytorch, enter the command: - -.. code:: sh - - $ pip install -e .[pytorch] + $ pip install poetry + $ poetry config virtualenvs.in-project true + $ poetry lock + $ poetry install + $ poetry shell Once the environment is installed, pre-commit is installed, but need to be activated using the following command: @@ -78,7 +75,7 @@ These tests absolutely have to pass. .. code:: sh - $ mypy qolmat + $ make check-types Unit test ^^^^^^^^^ @@ -88,4 +85,4 @@ The coverage should on new features must be above 95%. .. 
code:: sh - $ pytest -vs --cov-branch --cov=qolmat --pyargs tests --cov-report term-missing + $ make check-coverage diff --git a/Makefile b/Makefile index c08e0d40..e0ca5828 100644 --- a/Makefile +++ b/Makefile @@ -1,13 +1,34 @@ -coverage: - pytest --cov-branch --cov=qolmat --cov-report=xml tests -doctest: - pytest --doctest-modules --pyargs qolmat +check-coverage: + poetry run pytest --cov-branch --cov=qolmat/ --cov-report=xml tests/ -doc: - make html -C docs +check-poetry: + poetry check --lock + +check-quality: + poetry run ruff check qolmat/ tests/ + +check-security: + poetry run bandit --recursive --configfile=pyproject.toml qolmat/ + +check-tests: + poetry run pytest tests/ + +check-types: + poetry run mypy qolmat/ tests/ + +checkers: check-coverage check-types clean: rm -rf .mypy_cache .pytest_cache .coverage* rm -rf **__pycache__ make clean -C docs + +coverage: + poetry run pytest --cov-branch --cov=qolmat --cov-report=xml tests + +doc: + make html -C docs + +doctest: + poetry run pytest --doctest-modules --pyargs qolmat diff --git a/docs/conf.py b/docs/conf.py index 2429e591..6e080268 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -12,7 +12,7 @@ import os import sys -import sphinx_gallery + import sphinx_rtd_theme # If extensions (or modules to document with autodoc) are in another directory, @@ -53,7 +53,6 @@ # see https://github.com/numpy/numpydoc/issues/69 numpydoc_show_class_members = False -from distutils.version import LooseVersion # pngmath / imgmath compatibility layer for different sphinx versions # import sphinx diff --git a/environment.ci.yml b/environment.ci.yml deleted file mode 100644 index 86949837..00000000 --- a/environment.ci.yml +++ /dev/null @@ -1,18 +0,0 @@ -name: env_qolmat_ci -channels: - - defaults - - conda-forge -dependencies: - - codecov - - flake8 - - matplotlib - - mypy - - numpy - - numpydoc - - pytest - - pytest-cov - - pytest-mock - - pip - - pip: - - torch - - -e . diff --git a/environment.dev.yml b/environment.dev.yml deleted file mode 100644 index e2dfbed9..00000000 --- a/environment.dev.yml +++ /dev/null @@ -1,37 +0,0 @@ -name: env_qolmat_dev -channels: - - conda-forge - - defaults -dependencies: - - bump2version=1.0.1 - - dcor=0.6 - - ipykernel=6.21.0 - - jupyter=1.0.0 - - jupyterlab=1.2.6 - - jupytext=1.14.4 - - hyperopt=0.2.7 - - numpy=1.24.4 - - packaging=23.1 - - pandas=2.0.1 - - python=3.8 - - pip=23.0.1 - - scipy=1.10.1 - - scikit-learn=1.3.2 - - sphinx=4.3.2 - - sphinx-gallery=0.10.1 - - sphinx_rtd_theme=1.0.0 - - statsmodels=0.14.0 - - twine=3.7.1 - - wheel=0.37.1 - - pip: - - flake8==6.0.0 - - jupytext==1.14.4 - - matplotlib==3.6.2 - - mypy==1.1.1 - - numpydoc==1.5.0 - - pre-commit==2.21.0 - - pytest==7.2.0 - - pytest-cov==4.0.0 - - pytest-mock==3.10.0 - - sphinx_markdown_tables==0.0.17 - - -e . 
diff --git a/environment.doc.yml b/environment.doc.yml deleted file mode 100644 index 30458c93..00000000 --- a/environment.doc.yml +++ /dev/null @@ -1,14 +0,0 @@ -name: env_qolmat_doc -channels: - - conda-forge - - defaults -dependencies: - - numpydoc=1.1.0 - - python=3.8 - - sphinx=4.3.2 - - sphinx-gallery=0.10.1 - - sphinx_rtd_theme=1.0.0 - - typing_extensions=4.0.1 - - pip - - pip: - - sphinx-markdown-tables==0.0.17 diff --git a/examples/RPCA.md b/examples/RPCA.md index 05f8b755..0a4fbe8e 100644 --- a/examples/RPCA.md +++ b/examples/RPCA.md @@ -34,6 +34,10 @@ from qolmat.imputations.rpca import rpca_utils from qolmat.utils.data import generate_artificial_ts ``` +```python +from qolmat.imputations.imputers import ImputerRpcaNoisy, ImputerRpcaPcp +``` + **Generate synthetic data** ```python tags=[] @@ -46,16 +50,33 @@ amp_noise = 0.1 X_true, A_true, E_true = generate_artificial_ts(n_samples, periods, amp_anomalies, ratio_anomalies, amp_noise) signal = X_true + A_true + E_true +signal = 10 + signal * 40 # Adding missing data signal[120:180] = np.nan signal[:20] = np.nan +for i in range(10): + signal[i::365] = np.nan # signal[80:220] = np.nan # mask = np.random.choice(len(signal), round(len(signal) / 20)) # signal[mask] = np.nan ``` +```python +import pandas as pd +df = pd.DataFrame({"signal": signal}) +irn = ImputerRpcaPcp(period=100) +df_imp = irn.fit_transform(df) +``` + +```python +plt.plot(df_imp["signal"]) +plt.plot(df["signal"]) + +plt.xlim(0, 200) +``` + ```python tags=[] fig = plt.figure(figsize=(15, 8)) ax = fig.add_subplot(4, 1, 1) diff --git a/examples/tutorials/plot_tuto_benchmark_TS.py b/examples/tutorials/plot_tuto_benchmark_TS.py index f205d08a..1fdbbddb 100644 --- a/examples/tutorials/plot_tuto_benchmark_TS.py +++ b/examples/tutorials/plot_tuto_benchmark_TS.py @@ -1,5 +1,4 @@ -""" -========================= +"""========================= Benchmark for time series ========================= @@ -14,18 +13,18 @@ # First import some libraries import numpy as np -import pandas as pd np.random.seed(1234) -from matplotlib import pyplot as plt import matplotlib.ticker as plticker +from matplotlib import pyplot as plt tab10 = plt.get_cmap("tab10") +from sklearn.linear_model import LinearRegression + from qolmat.benchmark import comparator, missing_patterns from qolmat.imputations import imputers from qolmat.utils import data, plot -from sklearn.linear_model import LinearRegression # %% # 1. Data diff --git a/examples/tutorials/plot_tuto_categorical.py b/examples/tutorials/plot_tuto_categorical.py index b6e993fb..f0491ac8 100644 --- a/examples/tutorials/plot_tuto_categorical.py +++ b/examples/tutorials/plot_tuto_categorical.py @@ -1,5 +1,4 @@ -""" -============================== +"""============================== Benchmark for categorical data ============================== @@ -8,14 +7,13 @@ It comprehends passengers features as well as if they survived the accident. """ -from qolmat.imputations import preprocessing, imputers +from sklearn.pipeline import Pipeline + +from qolmat.benchmark import comparator, missing_patterns +from qolmat.imputations import imputers, preprocessing from qolmat.imputations.imputers import ImputerRegressor -from qolmat.benchmark import missing_patterns -from qolmat.benchmark import comparator from qolmat.utils import data -from sklearn.pipeline import Pipeline - # %% # 1. 
Titanic dataset # --------------------------------------------------------------- diff --git a/examples/tutorials/plot_tuto_diffusion_models.py b/examples/tutorials/plot_tuto_diffusion_models.py index 317128db..0ff0d80a 100644 --- a/examples/tutorials/plot_tuto_diffusion_models.py +++ b/examples/tutorials/plot_tuto_diffusion_models.py @@ -1,5 +1,4 @@ -""" -=============================================== +"""=============================================== Tutorial for imputers based on diffusion models =============================================== @@ -7,15 +6,14 @@ and :class:`~qolmat.imputations.diffusions.ddpms.TsDDPM` classes. """ -import pandas as pd -import numpy as np import matplotlib.pyplot as plt +import numpy as np +import pandas as pd -from qolmat.utils import data from qolmat.benchmark import comparator, missing_patterns - -from qolmat.imputations.imputers_pytorch import ImputerDiffusion from qolmat.imputations.diffusions.ddpms import TabDDPM, TsDDPM +from qolmat.imputations.imputers_pytorch import ImputerDiffusion +from qolmat.utils import data # %% # 1. Time-series data diff --git a/examples/tutorials/plot_tuto_hole_generator.py b/examples/tutorials/plot_tuto_hole_generator.py index 07594591..07ea6348 100644 --- a/examples/tutorials/plot_tuto_hole_generator.py +++ b/examples/tutorials/plot_tuto_hole_generator.py @@ -1,5 +1,4 @@ -""" -============================================ +"""============================================ Tutorial for hole generation in tabular data ============================================ @@ -17,13 +16,10 @@ """ from typing import List -from io import BytesIO import matplotlib import matplotlib.pyplot as plt import numpy as np import pandas as pd -import requests -import zipfile from qolmat.benchmark import missing_patterns from qolmat.utils import data @@ -90,6 +86,7 @@ def visualise_missing_values(df_init: pd.DataFrame, df_mask: pd.DataFrame): initial dataframe df_mask : pd.DataFrame masked dataframe + """ df_tot = df_init.copy() df_tot[df_init.notna()] = 0 @@ -117,6 +114,7 @@ def get_holes_sizes_column_wise(data: np.ndarray) -> List[List[int]]: ------- List[List[int]] List of hole size for each column. 
+ """ hole_sizes = [] for col in range(data.shape[1]): @@ -153,6 +151,7 @@ def plot_cdf( list of labels colors : List[str] list of colors + """ _, axs = plt.subplots(1, df.shape[1], sharey=True, figsize=(15, 3)) diff --git a/examples/tutorials/plot_tuto_mcar.py b/examples/tutorials/plot_tuto_mcar.py index c43d1217..a9bddb7f 100644 --- a/examples/tutorials/plot_tuto_mcar.py +++ b/examples/tutorials/plot_tuto_mcar.py @@ -1,5 +1,4 @@ -""" -============================================ +"""============================================ Tutorial for Testing the MCAR Case ============================================ @@ -8,10 +7,9 @@ # %% # First import some libraries -from matplotlib import pyplot as plt - import numpy as np import pandas as pd +from matplotlib import pyplot as plt from scipy.stats import norm from qolmat.analysis.holes_characterization import LittleTest diff --git a/examples/tutorials/plot_tuto_mean_median.py b/examples/tutorials/plot_tuto_mean_median.py index 403b4407..33c36db2 100644 --- a/examples/tutorials/plot_tuto_mean_median.py +++ b/examples/tutorials/plot_tuto_mean_median.py @@ -1,5 +1,4 @@ -""" -======================================================================================== +"""======================================================================================== Comparison of basic imputers ======================================================================================== @@ -21,7 +20,6 @@ from qolmat.imputations import imputers from qolmat.utils import data, plot - # %% # 1. Data # --------------------------------------------------------------- @@ -29,11 +27,14 @@ # Originally, the first 81 columns contain extracted features and # the 82nd column contains the critical temperature which is used as the # target variable. -# The data does not contain missing values; so for the purpose of this notebook, +# The data does not contain missing values; +# so for the purpose of this notebook, # we corrupt the data, with the :func:`qolmat.utils.data.add_holes` function. # In this way, each column has missing values. -df = data.add_holes(data.get_data("Superconductor"), ratio_masked=0.2, mean_size=120) +df = data.add_holes( + data.get_data("Superconductor"), ratio_masked=0.2, mean_size=120 +) # %% # The dataset contains 82 columns. For simplicity, @@ -55,7 +56,9 @@ # a missing (resp. observed) value. plt.figure(figsize=(15, 4)) -plt.imshow(df.notna().values.T, aspect="auto", cmap="binary", interpolation="none") +plt.imshow( + df.notna().values.T, aspect="auto", cmap="binary", interpolation="none" +) plt.yticks(range(len(df.columns)), df.columns) plt.xlabel("Samples", fontsize=12) plt.grid(False) @@ -102,7 +105,9 @@ custom_cmap = matplotlib.colors.ListedColormap(colorsList) plt.figure(figsize=(15, 4)) -plt.imshow(df_tot.values.T, aspect="auto", cmap=custom_cmap, interpolation="none") +plt.imshow( + df_tot.values.T, aspect="auto", cmap=custom_cmap, interpolation="none" +) plt.yticks(range(len(df_tot.columns)), df_tot.columns) plt.xlabel("Samples", fontsize=12) plt.grid(False) @@ -147,7 +152,9 @@ # are relatively poor. Other imputation methods are therefore # necessary (see folder `imputations`). 
-dfs_imputed = {name: imp.fit_transform(df) for name, imp in dict_imputers.items()} +dfs_imputed = { + name: imp.fit_transform(df) for name, imp in dict_imputers.items() +} for col in cols_to_impute: fig, ax = plt.subplots(figsize=(10, 3)) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..a0f87501 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,126 @@ +# PACKAGE + +[tool.poetry] +name = "qolmat" +version = "0.1.8" +description = "A Python library for optimal data imputation." +authors = [ + "Julien ROUSSEL ", + "Anh Khoa NGO HO ", + "Hong-Lan BOTTERMAN ", + "Guillaume SAËS ", +] +license = "BSD-3-Clause" +readme = "README.rst" +homepage = "https://github.com/Quantmetry/qolmat" +repository = "https://github.com/Quantmetry/qolmat" +documentation = "https://qolmat.readthedocs.io/en/latest/" +keywords = ["imputation"] +classifiers = [ + "Intended Audience :: Science/Research", + "Intended Audience :: Developers", + "License :: OSI Approved", + "Topic :: Software Development", + "Topic :: Scientific/Engineering", + "Operating System :: Microsoft :: Windows", + "Operating System :: POSIX", + "Operating System :: Unix", + "Operating System :: MacOS", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", +] + +# DEPENDENCIES + +[tool.poetry.dependencies] +python = ">=3.8.1,<3.12" +bump2version = "1.0.1" +dcor = "0.6" +jupyter = "1.0.0" +jupyterlab = "1.2.6" +jupytext = "1.14.4" +hyperopt = "0.2.7" +numpy = "1.24.4" +packaging = "23.1" +pandas = "2.0.1" +scipy = "1.10.1" +scikit-learn = "1.3.2" +sphinx-markdown-tables = { version = "*", optional = true } +statsmodels = "0.14.0" +typed-ast = { version = "*", optional = true } +twine = "3.7.1" +wheel = "0.37.1" +category-encoders = "^2.6.3" +ipykernel = "^6.29.5" +torch = "^2.4.0" + +[tool.poetry.dev-dependencies] +matplotlib = "3.6.2" +pre-commit = "2.21.0" + +[tool.poetry.group.checkers.dependencies] +bandit = "^1.7.9" +mypy = "1.1.1" +ruff = "^0.6.3" +pytest = "7.2.0" +pytest-cov = "4.0.0" +pytest-mock = "3.10.0" + +[tool.poetry.group.ci.dependencies] +codecov = "^2.1.13" + +[tool.poetry.group.docs.dependencies] +numpydoc = "1.1.0" +sphinx = "4.3.2" +sphinx-gallery = "0.10.1" +sphinx_rtd_theme = "1.0.0" + +[tool.poetry.extras] +tests = ["typed-ast"] +docs = ["sphinx-markdown-tables"] + +[tool.poetry.urls] +"Bug Tracker" = "https://github.com/Quantmetry/qolmat" +"Source Code" = "https://github.com/Quantmetry/qolmat" + +[[tool.poetry.source]] +name = "pytorch_cpu" +url = "https://download.pytorch.org/whl/cpu" +priority = "explicit" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + + +# CONFIGURATION +[tool.bandit] +targets = ["qolmat"] + +[tool.mypy] +pretty = true +strict = false +python_version = ">=3.8.1,<3.12" +ignore_missing_imports = true + +[tool.ruff] +line-length = 79 +fix = true +indent-width = 4 +target-version = "py310" +exclude = ["examples/", "docs/"] + +[tool.ruff.format] +docstring-code-format = true + +[tool.ruff.lint] +select = ["C", "D", "E", "F", "I", "Q", "W"] +ignore = ["C901", "D107"] + +[tool.ruff.lint.isort] +known-first-party = ["qolmat"] + +[tool.ruff.lint.per-file-ignores] +"tests/**/*.py" = ["D100", "D103"] +"__init__.py" = ["D104"] diff --git a/qolmat/analysis/holes_characterization.py b/qolmat/analysis/holes_characterization.py index 5669ac7b..aa511052 100644 --- a/qolmat/analysis/holes_characterization.py +++ b/qolmat/analysis/holes_characterization.py @@ -1,3 +1,5 @@ 
+"""Script for characterising the holes.""" + from abc import ABC, abstractmethod from typing import Optional, Union @@ -9,34 +11,37 @@ class McarTest(ABC): - """ - Astract class for MCAR tests. - """ + """Astract class for MCAR tests.""" @abstractmethod def test(self, df: pd.DataFrame) -> float: + """Test function.""" pass class LittleTest(McarTest): - """ - This class implements the Little's test, which is designed to detect the heterogeneity accross - the missing patterns. The null hypothesis is "The missing data mechanism is MCAR". The - shortcoming of this test is that it won't detect the heterogeneity of covariance. + """Little Test class. + + This class implements the Little's test, which is designed to detect the + heterogeneity accross the missing patterns. The null hypothesis is + "The missing data mechanism is MCAR". The shortcoming of this test is + that it won't detect the heterogeneity of covariance. References ---------- - Little. "A Test of Missing Completely at Random for Multivariate Data with Missing Values." - Journal of the American Statistical Association, Volume 83, 1988 - Issue 404 + Little. "A Test of Missing Completely at Random for Multivariate Data with + Missing Values." Journal of the American Statistical Association, + Volume 83, 1988 - Issue 404 Parameters ---------- imputer : Optional[ImputerEM] - Imputer based on the EM algorithm. The 'model' attribute must be equal to 'multinormal'. - If None, the default ImputerEM is taken. + Imputer based on the EM algorithm. The 'model' attribute must be + equal to 'multinormal'. If None, the default ImputerEM is taken. random_state : int, RandomState instance or None, default=None Controls the randomness. Pass an int for reproducible output across multiple function calls. + """ def __init__( @@ -47,15 +52,14 @@ def __init__( super().__init__() if imputer and imputer.model != "multinormal": raise AttributeError( - "The ImputerEM model must be 'multinormal' to use the Little's test" + "The ImputerEM model must be 'multinormal' " + "to use the Little's test" ) self.imputer = imputer self.random_state = random_state def test(self, df: pd.DataFrame) -> float: - """ - Apply the Little's test over a real dataframe. - + """Apply the Little's test over a real dataframe. Parameters ---------- @@ -66,6 +70,7 @@ def test(self, df: pd.DataFrame) -> float: ------- float The p-value of the test. 
+ """ imputer = self.imputer or ImputerEM(random_state=self.random_state) imputer = imputer._fit_element(df) @@ -79,16 +84,22 @@ def test(self, df: pd.DataFrame) -> float: # Iterate over the patterns df_nan = df.notna() - for tup_pattern, df_nan_pattern in df_nan.groupby(df_nan.columns.tolist()): + for tup_pattern, df_nan_pattern in df_nan.groupby( + df_nan.columns.tolist() + ): n_rows_pattern, _ = df_nan_pattern.shape ind_pattern = df_nan_pattern.index df_pattern = df.loc[ind_pattern, list(tup_pattern)] obs_mean = df_pattern.mean().to_numpy() diff_means = obs_mean - ml_means[list(tup_pattern)] - inv_sigma_pattern = np.linalg.inv(ml_cov[:, tup_pattern][tup_pattern, :]) + inv_sigma_pattern = np.linalg.inv( + ml_cov[:, tup_pattern][tup_pattern, :] + ) - d0 += n_rows_pattern * np.dot(np.dot(diff_means, inv_sigma_pattern), diff_means.T) + d0 += n_rows_pattern * np.dot( + np.dot(diff_means, inv_sigma_pattern), diff_means.T + ) degree_f += tup_pattern.count(True) return 1 - float(chi2.cdf(d0, degree_f)) diff --git a/qolmat/benchmark/comparator.py b/qolmat/benchmark/comparator.py index 5a60c6f5..4fed2e9e 100644 --- a/qolmat/benchmark/comparator.py +++ b/qolmat/benchmark/comparator.py @@ -1,3 +1,5 @@ +"""Script for comparator.""" + from typing import Any, Dict, List, Optional import numpy as np @@ -8,23 +10,28 @@ class Comparator: - """ - This class implements a comparator for evaluating different imputation methods. + """Comparator class. + + This class implements a comparator for evaluating different + imputation methods. Parameters ---------- dict_models: Dict[str, any] dictionary of imputation methods selected_columns: List[str]Œ - list of column's names selected (all with at least one null value will be imputed) + list of column's names selected (all with at least one null value will + be imputed) columnwise_evaluation : Optional[bool], optional - whether the metric should be calculated column-wise or not, by default False - dict_config_opti: Optional[Dict[str, Dict[str, Union[str, float, int]]]] = {} - dictionary of search space for each implementation method. By default, the value is set to - {}. + whether the metric should be calculated column-wise or not, + by default False + dict_config_opti: Optional[Dict[str, Dict[str, Union[str, float, int]]]] + dictionary of search space for each implementation method. + By default, the value is set to {}. max_evals: int = 10 number of calls of the optimization algorithm 10. + """ def __init__( @@ -53,24 +60,29 @@ def get_errors( df_imputed: pd.DataFrame, df_mask: pd.DataFrame, ) -> pd.DataFrame: - """Functions evaluating the reconstruction's quality + """Get errors - estimate the reconstruction's quality. 
Parameters ---------- - signal_ref : pd.DataFrame + df_origin : pd.DataFrame reference/orginal signal - signal_imputed : pd.DataFrame + df_imputed : pd.DataFrame imputed signal + df_mask : pd.DataFrame + masked dataframe (NA) Returns ------- pd.DataFrame DataFrame of results obtained via different metrics + """ dict_errors = {} for name_metric in self.metrics: fun_metric = metrics.get_metric(name_metric) - dict_errors[name_metric] = fun_metric(df_origin, df_imputed, df_mask) + dict_errors[name_metric] = fun_metric( + df_origin, df_imputed, df_mask + ) df_errors = pd.concat(dict_errors.values(), keys=dict_errors.keys()) return df_errors @@ -81,23 +93,25 @@ def evaluate_errors_sample( dict_config_opti_imputer: Dict[str, Any] = {}, metric_optim: str = "mse", ) -> pd.Series: - """Evaluate the errors in the cross-validation + """Evaluate the errors in the cross-validation. Parameters ---------- - tested_model : any + imputer : Any imputation model df : pd.DataFrame dataframe to impute dict_config_opti_imputer : Dict search space for tested_model's hyperparameters metric_optim : str - Loss function used when imputers undergo hyperparameter optimization + Loss function used when imputers undergo hyperparameter + optimization Returns ------- pd.Series Series with the errors for each metric and each variable + """ list_errors = [] df_origin = df[self.selected_columns].copy() @@ -117,9 +131,12 @@ def evaluate_errors_sample( subset = self.generator_holes.subset if subset is None: raise ValueError( - "HoleGenerator `subset` should be overwritten in split but it is none!" + "HoleGenerator `subset` should be overwritten in split " + "but it is none!" ) - df_errors = self.get_errors(df_origin[subset], df_imputed[subset], df_mask[subset]) + df_errors = self.get_errors( + df_origin[subset], df_imputed[subset], df_mask[subset] + ) list_errors.append(df_errors) df_errors = pd.DataFrame(list_errors) errors_mean = df_errors.mean(axis=0) @@ -130,20 +147,20 @@ def compare( self, df: pd.DataFrame, ): - """Function to compare different imputation methods on dataframe df + """Compure different imputation methods on dataframe df. Parameters ---------- df : pd.DataFrame - verbose : bool, optional - _description_, by default True + input dataframe (for comparison) + Returns ------- pd.DataFrame - Dataframe with the metrics results, imputers are in columns and indices represent - metrics and variables. - """ + Dataframe with the metrics results, imputers are in columns + and indices represent metrics and variables. + """ dict_errors = {} for name, imputer in self.dict_imputers.items(): @@ -156,7 +173,10 @@ def compare( ) print("done.") except Exception as excp: - print(f"Error while testing {name} of type {type(imputer).__name__}!") + print( + f"Error while testing {name} of type " + f"{type(imputer).__name__}!" 
+ ) raise excp df_errors = pd.DataFrame(dict_errors) diff --git a/qolmat/benchmark/hyperparameters.py b/qolmat/benchmark/hyperparameters.py index eaf6efc4..7aa4a24b 100644 --- a/qolmat/benchmark/hyperparameters.py +++ b/qolmat/benchmark/hyperparameters.py @@ -1,15 +1,15 @@ -import copy -from typing import Any, Callable, Dict, List, Union +"""Script for hyperparameter optimisation.""" -import numpy as np -import pandas as pd +import copy +from typing import Callable, Dict, List # import skopt # from skopt.space import Categorical, Dimension, Integer, Real import hyperopt as ho -from hyperopt.pyll.base import Apply as hoApply -from qolmat.benchmark import metrics +import numpy as np +import pandas as pd +from qolmat.benchmark import metrics from qolmat.benchmark.missing_patterns import _HoleGenerator from qolmat.imputations.imputers import _Imputer from qolmat.utils.utils import HyperValue @@ -22,18 +22,21 @@ def get_objective( metric: str, names_hyperparams: List[str], ) -> Callable: - """ - Define the objective function, which is the average metric computed over the folds provided by + """Define the objective function. + + This is the average metric computed over the folds provided by the hole generator, using a cross-validation. Parameters ---------- imputer: _Imputer - Imputer that should be optimized, it should at least have a fit_transform method and an - imputer_params attribute + Imputer that should be optimized, it should at least have a + fit_transform method and an imputer_params attribute + df : pd.DataFrame + input dataframe generator: _HoleGenerator - Generator creating the masked values in the nested cross validation allowing to measure the - imputer performance + Generator creating the masked values in the nested cross validation + allowing to measure the imputer performance metric: str Metric used as perfomance indicator, common values are `mse` and `mae` names_hyperparams: List[str] @@ -43,6 +46,7 @@ def get_objective( ------- Callable[List[HyperValue], float] Objective function + """ def fun_obf(args: List[HyperValue]) -> float: @@ -58,7 +62,9 @@ def fun_obf(args: List[HyperValue]) -> float: df_imputed = imputer.fit_transform(df_corrupted) subset = generator.subset fun_metric = metrics.get_metric(metric) - errors = fun_metric(df_origin[subset], df_imputed[subset], df_mask[subset]) + errors = fun_metric( + df_origin[subset], df_imputed[subset], df_mask[subset] + ) list_errors.append(errors) mean_errors = np.mean(errors) @@ -76,44 +82,55 @@ def optimize( max_evals: int = 100, verbose: bool = False, ): - """Return the provided imputer with hyperparameters optimized in the provided range in order to - minimize the provided metric. + """Optimisation function. + + Return the provided imputer with hyperparameters optimized in the provided + range in order to minimize the provided metric. 
Parameters ---------- imputer: _Imputer - Imputer that should be optimized, it should at least have a fit_transform method and an - imputer_params attribute + Imputer that should be optimized, it should at least have a + fit_transform method and an imputer_params attribute + df : pd.DataFrame + input dataframe generator: _HoleGenerator - Generator creating the masked values in the nested cross validation allowing to measure the - imputer performance + Generator creating the masked values in the nested cross validation + allowing to measure the imputer performance metric: str Metric used as perfomance indicator, common values are `mse` and `mae` dict_config: Dict[str, HyperValue] Search space for the tested hyperparameters max_evals: int - Maximum number of evaluation of the performance of the algorithm. Each estimation involves - one call to fit_transform per fold returned by the generator. See the n_fold attribute. + Maximum number of evaluation of the performance of the algorithm. + Each estimation involves one call to fit_transform per fold returned + by the generator. See the n_fold attribute. verbose: bool - Verbosity switch, usefull for imputers that can have unstable behavior for some - hyperparameters values + Verbosity switch, usefull for imputers that can have unstable + behavior for some hyperparameters values Returns ------- _Imputer Optimized imputer + """ imputer = copy.deepcopy(imputer) if dict_config == {}: return imputer names_hyperparams = list(dict_config.keys()) values_hyperparams = list(dict_config.values()) - imputer.imputer_params = tuple(set(imputer.imputer_params) | set(dict_config.keys())) + imputer.imputer_params = tuple( + set(imputer.imputer_params) | set(dict_config.keys()) + ) if verbose and hasattr(imputer, "verbose"): setattr(imputer, "verbose", False) fun_obj = get_objective(imputer, df, generator, metric, names_hyperparams) hyperparams = ho.fmin( - fn=fun_obj, space=values_hyperparams, algo=ho.tpe.suggest, max_evals=max_evals + fn=fun_obj, + space=values_hyperparams, + algo=ho.tpe.suggest, + max_evals=max_evals, ) for key, value in hyperparams.items(): diff --git a/qolmat/benchmark/metrics.py b/qolmat/benchmark/metrics.py index f8f87441..b8af8667 100644 --- a/qolmat/benchmark/metrics.py +++ b/qolmat/benchmark/metrics.py @@ -1,15 +1,17 @@ +"""Script for metrics.""" + from functools import partial from typing import Callable, Dict, List +import dcor import numpy as np import pandas as pd import scipy +from numpy.linalg import LinAlgError from sklearn import metrics as skm -import dcor from qolmat.utils import algebra, utils from qolmat.utils.exceptions import NotEnoughSamples -from numpy.linalg import LinAlgError EPS = np.finfo(float).eps @@ -26,7 +28,9 @@ def columnwise_metric( type_cols: str = "all", **kwargs, ) -> pd.Series: - """For each column, compute a metric score based on the true dataframe + """Compute column-wise metrics. + + For each column, compute a metric score based on the true dataframe and the predicted dataframe Parameters @@ -44,17 +48,21 @@ def columnwise_metric( - `all` to apply the metric to all columns - `numerical` to apply the metric to numerical columns only - `categorical` to apply the metric to categorical columns only + **kwargs: dict + additional arguments Returns ------- pd.Series Series of scores for all columns + """ try: pd.testing.assert_index_equal(df1.columns, df2.columns) except AssertionError: raise ValueError( - f"Input dataframes do not have the same columns! 
({df1.columns} != {df2.columns})" + "Input dataframes do not have the same columns! " + f"({df1.columns} != {df2.columns})" ) if type_cols == "all": cols = df1.columns @@ -63,19 +71,23 @@ def columnwise_metric( elif type_cols == "categorical": cols = utils._get_categorical_features(df1) else: - raise ValueError(f"Value {type_cols} is not valid for parameter `type_cols`!") + raise ValueError( + f"Value {type_cols} is not valid for parameter `type_cols`!" + ) values = {} for col in cols: df1_col = df1.loc[df_mask[col], col] df2_col = df2.loc[df_mask[col], col] - assert df1_col.notna().all() - assert df2_col.notna().all() + if df1_col.isna().any() or df2_col.isna().any(): + raise ValueError(f"Column {col} contains NaN.") values[col] = metric(df1_col, df2_col, **kwargs) return pd.Series(values) -def mean_squared_error(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> pd.Series: +def mean_squared_error( + df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame +) -> pd.Series: """Mean squared error between two dataframes. Parameters @@ -90,14 +102,17 @@ def mean_squared_error(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFra Returns ------- pd.Series + """ - return columnwise_metric(df1, df2, df_mask, skm.mean_squared_error, type_cols="numerical") + return columnwise_metric( + df1, df2, df_mask, skm.mean_squared_error, type_cols="numerical" + ) def root_mean_squared_error( df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame ) -> pd.Series: - """Root mean squared error between two dataframes. + """Compute the root mean squared error between two dataframes. Parameters ---------- @@ -111,14 +126,22 @@ def root_mean_squared_error( Returns ------- pd.Series + """ return columnwise_metric( - df1, df2, df_mask, skm.mean_squared_error, type_cols="numerical", squared=False + df1, + df2, + df_mask, + skm.mean_squared_error, + type_cols="numerical", + squared=False, ) -def mean_absolute_error(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> pd.Series: - """Mean absolute error between two dataframes. +def mean_absolute_error( + df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame +) -> pd.Series: + """Compute the mean absolute error between two dataframes. Parameters ---------- @@ -132,14 +155,17 @@ def mean_absolute_error(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFr Returns ------- pd.Series + """ - return columnwise_metric(df1, df2, df_mask, skm.mean_absolute_error, type_cols="numerical") + return columnwise_metric( + df1, df2, df_mask, skm.mean_absolute_error, type_cols="numerical" + ) def mean_absolute_percentage_error( df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame ) -> pd.Series: - """Mean absolute percentage error between two dataframes. + """Compute the mean absolute percentage error between two dataframes. Parameters ---------- @@ -153,14 +179,22 @@ def mean_absolute_percentage_error( Returns ------- pd.Series + """ return columnwise_metric( - df1, df2, df_mask, skm.mean_absolute_percentage_error, type_cols="numerical" + df1, + df2, + df_mask, + skm.mean_absolute_percentage_error, + type_cols="numerical", ) -def _weighted_mean_absolute_percentage_error_1D(values1: pd.Series, values2: pd.Series) -> float: - """Weighted mean absolute percentage error between two series. +def _weighted_mean_absolute_percentage_error_1D( + values1: pd.Series, values2: pd.Series +) -> float: + """Compute the weighted mean absolute perc. error between 2 series. 
+ Based on https://en.wikipedia.org/wiki/Mean_absolute_percentage_error Parameters @@ -174,6 +208,7 @@ def _weighted_mean_absolute_percentage_error_1D(values1: pd.Series, values2: pd. ------- float Weighted mean absolute percentage error + """ return (values1 - values2).abs().sum() / values1.abs().sum() @@ -181,7 +216,7 @@ def _weighted_mean_absolute_percentage_error_1D(values1: pd.Series, values2: pd. def weighted_mean_absolute_percentage_error( df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame ) -> pd.Series: - """Weighted mean absolute percentage error between two dataframes. + """Compute the weighted mean absolute percentage error between 2 df. Parameters ---------- @@ -195,6 +230,7 @@ def weighted_mean_absolute_percentage_error( Returns ------- pd.Series + """ return columnwise_metric( df1, @@ -205,9 +241,10 @@ def weighted_mean_absolute_percentage_error( ) -def accuracy(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> pd.Series: - """ - Matching ratio beetween the two datasets. +def accuracy( + df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame +) -> pd.Series: + """Compute the matching ratio beetween the two datasets. Parameters ---------- @@ -221,6 +258,7 @@ def accuracy(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> pd. Returns ------- pd.Series + """ return columnwise_metric( df1, @@ -232,8 +270,7 @@ def accuracy(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> pd. def accuracy_1D(values1: pd.Series, values2: pd.Series) -> float: - """ - Matching ratio beetween the set of values. + """Compute the matching ratio beetween the set of values. Parameters ---------- @@ -246,6 +283,7 @@ def accuracy_1D(values1: pd.Series, values2: pd.Series) -> float: ------- float accuracy + """ return (values1 == values2).mean() @@ -256,8 +294,9 @@ def dist_wasserstein( df_mask: pd.DataFrame, method: str = "columnwise", ) -> pd.Series: - """Wasserstein distances between columns of 2 dataframes. - Wasserstein distance can only be computed columnwise + """Compute the Wasserstein distances between columns of 2 dataframes. + + Wasserstein distance can only be computed columnwise. Parameters ---------- @@ -267,24 +306,34 @@ def dist_wasserstein( Predicted dataframe df_mask : pd.DataFrame Elements of the dataframes to compute on + method : str, optional + columnwise or not Returns ------- pd.Series wasserstein distances + """ if method == "columnwise": - return columnwise_metric(df1, df2, df_mask, scipy.stats.wasserstein_distance) + return columnwise_metric( + df1, df2, df_mask, scipy.stats.wasserstein_distance + ) else: raise AssertionError( - f"The parameter of the function wasserstein_distance should be one of" - f"the following: [`columnwise`], not `{method}`!" + f"The parameter of the function wasserstein_distance should " + "be one of the following: " + f"[`columnwise`], not `{method}`!" ) def kolmogorov_smirnov_test_1D(df1: pd.Series, df2: pd.Series) -> float: - """Compute KS test statistic of the two-sample Kolmogorov-Smirnov test for goodness of fit. - See more in https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ks_2samp.html. + """Compute KS test statistic. + + Compute KS test stat. of the two-sample Kolmogorov-Smirnov test + for goodness of fit. + See more in + https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ks_2samp.html. 
Parameters ---------- @@ -297,6 +346,7 @@ def kolmogorov_smirnov_test_1D(df1: pd.Series, df2: pd.Series) -> float: ------- float KS test statistic + """ return scipy.stats.ks_2samp(df1, df2)[0] @@ -304,7 +354,8 @@ def kolmogorov_smirnov_test_1D(df1: pd.Series, df2: pd.Series) -> float: def kolmogorov_smirnov_test( df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame ) -> pd.Series: - """Kolmogorov Smirnov Test for numerical features. + """Compute the Kolmogorov Smirnov Test for numerical features. + Lower score means better performance. Parameters @@ -320,12 +371,16 @@ def kolmogorov_smirnov_test( ------- pd.Series KS test statistic + """ - return columnwise_metric(df1, df2, df_mask, kolmogorov_smirnov_test_1D, type_cols="numerical") + return columnwise_metric( + df1, df2, df_mask, kolmogorov_smirnov_test_1D, type_cols="numerical" + ) def _total_variance_distance_1D(df1: pd.Series, df2: pd.Series) -> float: - """Compute Total Variance Distance for a categorical feature + """Compute Total Variance Distance for a categorical feature. + It is based on TVComplement in https://github.com/sdv-dev/SDMetrics Parameters @@ -339,6 +394,7 @@ def _total_variance_distance_1D(df1: pd.Series, df2: pd.Series) -> float: ------- float Total variance distance + """ list_categories = list(set(df1.unique()).union(set(df2.unique()))) freqs1 = df1.value_counts() / len(df1) @@ -351,7 +407,8 @@ def _total_variance_distance_1D(df1: pd.Series, df2: pd.Series) -> float: def total_variance_distance( df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame ) -> pd.Series: - """Total variance distance for categorical features + """Compute the total variance distance for categorical features. + It is based on TVComplement in https://github.com/sdv-dev/SDMetrics Parameters @@ -367,6 +424,7 @@ def total_variance_distance( ------- pd.Series Total variance distance + """ return columnwise_metric( df1, @@ -382,9 +440,13 @@ def _check_same_number_columns(df1: pd.DataFrame, df2: pd.DataFrame): raise Exception("inputs have to have the same number of columns.") -def _get_correlation_pearson_matrix(df: pd.DataFrame, use_p_value: bool = True) -> pd.DataFrame: - """Get matrix of correlation values for numerical features - based on Pearson correlation coefficient or p-value for testing non-correlation. +def _get_correlation_pearson_matrix( + df: pd.DataFrame, use_p_value: bool = True +) -> pd.DataFrame: + """Get matrix of correlation values for numerical features. + + Based on Pearson correlation coefficient or p-value for + testing non-correlation. Parameters ---------- @@ -397,12 +459,15 @@ def _get_correlation_pearson_matrix(df: pd.DataFrame, use_p_value: bool = True) ------- pd.DataFrame Correlation matrix + """ cols = df.columns.tolist() matrix = np.zeros((len(df.columns), len(df.columns))) for idx_1, col_1 in enumerate(cols): for idx_2, col_2 in enumerate(cols): - res = scipy.stats.mstats.pearsonr(df[[col_1]].values, df[[col_2]].values) + res = scipy.stats.mstats.pearsonr( + df[[col_1]].values, df[[col_2]].values + ) if use_p_value: matrix[idx_1, idx_2] = res[1] else: @@ -417,8 +482,11 @@ def mean_difference_correlation_matrix_numerical_features( df_mask: pd.DataFrame, use_p_value: bool = True, ) -> pd.Series: - """Mean absolute of differences between the correlation matrices of df1 and df2. - based on Pearson correlation coefficient or p-value for testing non-correlation. + """Compute the mean absolute of differences. + + Computed between the correlation matrices of df1 and df2. 
+ based on Pearson correlation coefficient or p-value for + testing non-correlation. Parameters ---------- @@ -435,6 +503,7 @@ def mean_difference_correlation_matrix_numerical_features( ------- pd.Series Mean absolute of differences for each feature + """ df1 = df1[df_mask].dropna(axis=0) df2 = df2[df_mask].dropna(axis=0) @@ -442,28 +511,38 @@ def mean_difference_correlation_matrix_numerical_features( _check_same_number_columns(df1, df2) cols_numerical = utils._get_numerical_features(df1) - df_corr1 = _get_correlation_pearson_matrix(df1[cols_numerical], use_p_value=use_p_value) - df_corr2 = _get_correlation_pearson_matrix(df2[cols_numerical], use_p_value=use_p_value) + df_corr1 = _get_correlation_pearson_matrix( + df1[cols_numerical], use_p_value=use_p_value + ) + df_corr2 = _get_correlation_pearson_matrix( + df2[cols_numerical], use_p_value=use_p_value + ) diff_corr = (df_corr1 - df_corr2).abs().mean(axis=1) return pd.Series(diff_corr, index=cols_numerical) -def _get_correlation_chi2_matrix(data: pd.DataFrame, use_p_value: bool = True) -> pd.DataFrame: - """Get matrix of correlation values for categorical features - based on Chi-square test of independence of variables (the test statistic or the p-value). +def _get_correlation_chi2_matrix( + data: pd.DataFrame, use_p_value: bool = True +) -> pd.DataFrame: + """Get matrix of correlation values for categorical features. + + Based on Chi-square test of independence of variables + (the test statistic or the p-value). Parameters ---------- - df : pd.DataFrame + data : pd.DataFrame dataframe use_p_value : bool, optional - use the p-value of the test instead of the test statistic, by default True + use the p-value of the test instead of the test statistic, + by default True Returns ------- pd.DataFrame Correlation matrix + """ cols = data.columns.tolist() matrix = np.zeros((len(data.columns), len(data.columns))) @@ -486,8 +565,11 @@ def mean_difference_correlation_matrix_categorical_features( df_mask: pd.DataFrame, use_p_value: bool = True, ) -> pd.Series: - """Mean absolute of differences between the correlation matrix of df1 and df2 - based on Chi-square test of independence of variables (the test statistic or the p-value) + """Compute the mean absolute of differences. 
+ + Computed between the correlation matrix of df1 and df2 + based on Chi-square test of independence of variables + (the test statistic or the p-value) Parameters ---------- @@ -498,12 +580,14 @@ def mean_difference_correlation_matrix_categorical_features( df_mask : pd.DataFrame Elements of the dataframes to compute on use_p_value : bool, optional - use the p-value of the test instead of the test statistic, by default True + use the p-value of the test instead of the test statistic, + by default True Returns ------- pd.Series Mean absolute of differences for each feature + """ df1 = df1[df_mask].dropna(axis=0) df2 = df2[df_mask].dropna(axis=0) @@ -511,8 +595,12 @@ def mean_difference_correlation_matrix_categorical_features( _check_same_number_columns(df1, df2) cols_categorical = utils._get_categorical_features(df1) - df_corr1 = _get_correlation_chi2_matrix(df1[cols_categorical], use_p_value=use_p_value) - df_corr2 = _get_correlation_chi2_matrix(df2[cols_categorical], use_p_value=use_p_value) + df_corr1 = _get_correlation_chi2_matrix( + df1[cols_categorical], use_p_value=use_p_value + ) + df_corr2 = _get_correlation_chi2_matrix( + df2[cols_categorical], use_p_value=use_p_value + ) diff_corr = (df_corr1 - df_corr2).abs().mean(axis=1) return pd.Series(diff_corr, index=cols_categorical) @@ -524,7 +612,9 @@ def _get_correlation_f_oneway_matrix( cols_numerical: List[str], use_p_value: bool = True, ) -> pd.DataFrame: - """Get matrix of correlation values between categorical and numerical features + """Get matrix of correlation values. + + Computed between categorical and numerical features based on the one-way ANOVA. Parameters @@ -536,12 +626,14 @@ def _get_correlation_f_oneway_matrix( cols_numerical : List[str] list numerical columns use_p_value : bool, optional - use the p-value of the test instead of the test statistic, by default True + use the p-value of the test instead of the test statistic, + by default True Returns ------- pd.DataFrame Correlation matrix + """ matrix = np.zeros((len(cols_categorical), len(cols_numerical))) for idx_cat, col_cat in enumerate(cols_categorical): @@ -561,7 +653,9 @@ def mean_diff_corr_matrix_categorical_vs_numerical_features( df_mask: pd.DataFrame, use_p_value: bool = True, ) -> pd.Series: - """Mean absolute of differences between the correlation matrix of df1 and df2 + """Compute the mean absolute of differences. + + Computation between the correlation matrix of df1 and df2 based on the one-way ANOVA. Parameters @@ -573,12 +667,14 @@ def mean_diff_corr_matrix_categorical_vs_numerical_features( df_mask : pd.DataFrame Elements of the dataframes to compute on use_p_value : bool, optional - use the p-value of the test instead of the test statistic, by default True + use the p-value of the test instead of the test statistic, + by default True Returns ------- pd.Series Mean absolute of differences for each feature + """ df1 = df1[df_mask].dropna(axis=0) df2 = df2[df_mask].dropna(axis=0) @@ -603,7 +699,8 @@ def mean_diff_corr_matrix_categorical_vs_numerical_features( def _sum_manhattan_distances_1D(values: pd.Series) -> float: - """Sum of Manhattan distances computed for one column + """Compute the sum of Manhattan distances computed for one column. 
+ It is based on https://www.geeksforgeeks.org/sum-manhattan-distances-pairs-points/ Parameters @@ -615,6 +712,7 @@ def _sum_manhattan_distances_1D(values: pd.Series) -> float: ------- float Sum of Manhattan distances + """ values = values.sort_values(ascending=True) sums_partial = values.shift().fillna(0.0).cumsum() @@ -624,25 +722,31 @@ def _sum_manhattan_distances_1D(values: pd.Series) -> float: def _sum_manhattan_distances(df1: pd.DataFrame) -> float: - """Sum Manhattan distances between all pairs of rows. + """Compute the sum Manhattan distances between all pairs of rows. + It is based on https://www.geeksforgeeks.org/sum-manhattan-distances-pairs-points/ Parameters ---------- df1 : pd.DataFrame + input dataframe Returns ------- float Sum of Manhattan distances for all pairs of rows. + """ cols = df1.columns.tolist() result = sum([_sum_manhattan_distances_1D(df1[col]) for col in cols]) return result -def sum_energy_distances(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> pd.Series: - """Sum of energy distances between df1 and df2. +def sum_energy_distances( + df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame +) -> pd.Series: + """Compute the sum of energy distances between df1 and df2. + It is based on https://dcor.readthedocs.io/en/latest/theory.html# Parameters @@ -658,8 +762,8 @@ def sum_energy_distances(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataF ------- pd.Series Sum of energy distances between df1 and df2. - """ + """ # Replace nan in dataframe df1 = df1[df_mask].fillna(0.0) df2 = df2[df_mask].fillna(0.0) @@ -670,7 +774,11 @@ def sum_energy_distances(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataF df = pd.concat([df1, df2]) sum_distances_df1_df2 = _sum_manhattan_distances(df) - sum_distance = 2 * sum_distances_df1_df2 - 4 * sum_distances_df1 - 4 * sum_distances_df2 + sum_distance = ( + 2 * sum_distances_df1_df2 + - 4 * sum_distances_df1 + - 4 * sum_distances_df2 + ) return pd.Series(sum_distance, index=["All"]) @@ -681,7 +789,8 @@ def sum_pairwise_distances( df_mask: pd.DataFrame, metric: str = "cityblock", ) -> float: - """Sum of pairwise distances based on a predefined metric. + """Compute the sum of pairwise distances based on a predefined metric. + Metrics are found in this link https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cdist.html @@ -700,6 +809,7 @@ def sum_pairwise_distances( ------- float Sum of pairwise distances based on a predefined metric + """ df1 = df1[df_mask.any(axis=1)] df2 = df2[df_mask.any(axis=1)] @@ -717,12 +827,16 @@ def frechet_distance_base( df1: pd.DataFrame, df2: pd.DataFrame, ) -> pd.Series: - """Compute the Fréchet distance between two dataframes df1 and df2 - Frechet_distance = || mu_1 - mu_2 ||_2^2 + Tr(Sigma_1 + Sigma_2 - 2(Sigma_1 . Sigma_2)^(1/2)) - It is normalized, df1 and df2 are first scaled by a factor (std(df1) + std(df2)) / 2 - and then centered around (mean(df1) + mean(df2)) / 2 - Based on: Dowson, D. C., and BV666017 Landau. "The Fréchet distance between multivariate normal - distributions." Journal of multivariate analysis 12.3 (1982): 450-455. + """Compute the Fréchet distance between two dataframes df1 and df2. + + Frechet_distance = || mu_1 - mu_2 ||_2^2 + + Tr(Sigma_1 + Sigma_2 - 2(Sigma_1 . Sigma_2)^(1/2)) + It is normalized, df1 and df2 are first scaled by a factor + (std(df1) + std(df2)) / 2 and then centered around + (mean(df1) + mean(df2)) / 2 + Based on: Dowson, D. C., and BV666017 Landau. 
+ "The Fréchet distance between multivariate normal distributions." + Journal of multivariate analysis 12.3 (1982): 450-455. Parameters ---------- @@ -735,8 +849,8 @@ def frechet_distance_base( ------- pd.Series Frechet distance in a Series object - """ + """ if df1.shape != df2.shape: raise Exception("inputs have to be of same dimensions.") @@ -759,12 +873,13 @@ def frechet_distance( method: str = "single", min_n_rows: int = 10, ) -> pd.Series: - """ - Frechet distance computed using a pattern decomposition. Several variant are implemented: - - the `single` method relies on a single estimation of the means and covariance matrix. It is - relevent for MCAR data. - - the `pattern`method relies on the aggregation of the estimated distance between each - pattern. It is relevent for MAR data. + """Compute Frechet distance computed using a pattern decomposition. + + Several variant are implemented: + - the `single` method relies on a single estimation of the means and + covariance matrix. It is relevent for MCAR data. + - the `pattern`method relies on the aggregation of the estimated distance + between each pattern. It is relevent for MAR data. Parameters ---------- @@ -775,8 +890,8 @@ def frechet_distance( df_mask : pd.DataFrame Mask indicating on which values the distance has to computed on method: str - Method used to compute the distance on multivariate datasets with missing values. - Possible values are `robust` and `pattern`. + Method used to compute the distance on multivariate datasets with + missing values. Possible values are `robust` and `pattern`. min_n_rows: int Minimum number of rows for a KL estimation @@ -784,8 +899,8 @@ def frechet_distance( ------- pd.Series Series of computed metrics - """ + """ if method == "single": return frechet_distance_base(df1, df2) return pattern_based_weighted_mean_metric( @@ -799,9 +914,12 @@ def frechet_distance( def kl_divergence_1D(df1: pd.Series, df2: pd.Series) -> float: - """Estimation of the Kullback-Leibler divergence between the two 1D empirical distributions - given by `df1`and `df2`. The samples are binarized using a uniform spacing with 20 bins from - the smallest to the largest value. Not that this may be a coarse estimation. + """Estimate the the Kullback-Leibler divergence for 1D. + + Computation between the two 1D empirical distributions + given by `df1`and `df2`. The samples are binarized using a uniform spacing + with 20 bins from the smallest to the largest value. Not that this may be + a coarse estimation. Parameters ---------- @@ -814,6 +932,7 @@ def kl_divergence_1D(df1: pd.Series, df2: pd.Series) -> float: ------- float Kullback-Leibler divergence between the two empirical distributions. + """ min_val = min(df1.min(), df2.min()) max_val = max(df1.max(), df2.max()) @@ -824,7 +943,9 @@ def kl_divergence_1D(df1: pd.Series, df2: pd.Series) -> float: def kl_divergence_gaussian(df1: pd.DataFrame, df2: pd.DataFrame) -> float: - """Kullback-Leibler divergence estimation based on a Gaussian approximation of both empirical + """Compute Kullback-Leibler divergence estimation. 
+ + Computation based on a Gaussian approximation of both empirical distributions Parameters @@ -838,16 +959,20 @@ def kl_divergence_gaussian(df1: pd.DataFrame, df2: pd.DataFrame) -> float: ------- pd.Series Series of estimated metrics + """ cov1 = df1.cov().values cov2 = df2.cov().values means1 = np.array(df1.mean()) means2 = np.array(df2.mean()) try: - div_kl = algebra.kl_divergence_gaussian_exact(means1, cov1, means2, cov2) + div_kl = algebra.kl_divergence_gaussian_exact( + means1, cov1, means2, cov2 + ) except LinAlgError: raise ValueError( - "Provided datasets have degenerate colinearities, KL-divergence cannot be computed!" + "Provided datasets have degenerate collinearities, KL-divergence " + "cannot be computed!" ) return div_kl @@ -859,11 +984,12 @@ def kl_divergence( method: str = "columnwise", min_n_rows: int = 10, ) -> pd.Series: - """ - Estimation of the Kullback-Leibler divergence between too empirical distributions. Three - methods are implemented: - - columnwise, relying on a uniform binarization and only taking marginals into account - (https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence), + """Estimate the KL divergence. + + Estimation of the Kullback-Leibler divergence between two empirical + distributions. Three methods are implemented: + - columnwise, relying on a uniform binning and only taking marginals + into account (https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence), - gaussian, relying on a Gaussian approximation, Parameters ---------- @@ -875,8 +1001,8 @@ def kl_divergence( df_mask: pd.DataFrame Mask indicating on what values the divergence should be computed method: str - Method used to compute the divergence on multivariate datasets with missing values. - Possible values are `columnwise` and `gaussian`. + Method used to compute the divergence on multivariate datasets with + missing values. Possible values are `columnwise` and `gaussian`. min_n_rows: int Minimum number of rows for a KL estimation @@ -888,11 +1014,15 @@ def kl_divergence( Raises ------ AssertionError - If the empirical distributions do not have enough samples to estimate a KL divergence. - Consider using a larger dataset of lowering the parameter `min_n_rows`. + If the empirical distributions do not have enough samples to estimate + a KL divergence. Consider using a larger dataset or lowering + the parameter `min_n_rows`. + """ if method == "columnwise": - return columnwise_metric(df1, df2, df_mask, kl_divergence_1D, type_cols="numerical") + return columnwise_metric( + df1, df2, df_mask, kl_divergence_1D, type_cols="numerical" + ) elif method == "gaussian": return pattern_based_weighted_mean_metric( df1, @@ -904,13 +1034,17 @@ def kl_divergence( ) else: raise AssertionError( - f"The parameter of the function wasserstein_distance should be one of" - f"the following: [`columnwise`, `gaussian`], not `{method}`!" + f"The parameter of the function kl_divergence " + "should be one of the following: " + f"[`columnwise`, `gaussian`], not `{method}`!" ) def distance_anticorr(df1: pd.DataFrame, df2: pd.DataFrame) -> float: - """Score based on the distance anticorrelation between two empirical distributions. + """Compute distance anticorr. + + Score based on the distance anticorrelation between + two empirical distributions.
The theoretical basis can be found on dcor documentation: https://dcor.readthedocs.io/en/latest/theory.html @@ -925,6 +1059,7 @@ def distance_anticorr(df1: pd.DataFrame, df2: pd.DataFrame) -> float: ------- float Distance correlation score + """ return (1 - dcor.distance_correlation(df1.values, df2.values)) / 2 @@ -935,7 +1070,7 @@ def distance_anticorr_pattern( df_mask: pd.DataFrame, min_n_rows: int = 10, ) -> pd.Series: - """Correlation distance computed using a pattern decomposition + """Compute correlation distance computed using a pattern decomposition. Parameters ---------- @@ -952,8 +1087,8 @@ def distance_anticorr_pattern( ------- pd.Series Series of computed metrics - """ + """ return pattern_based_weighted_mean_metric( df1, df2, @@ -974,6 +1109,7 @@ def pattern_based_weighted_mean_metric( **kwargs, ) -> pd.Series: """Compute a mean score based on missing patterns. + Note that for each pattern, a score is returned by the function metric. This code is based on https://www.statsmodels.org/ @@ -989,11 +1125,16 @@ def pattern_based_weighted_mean_metric( metric function min_n_rows : int, optional minimum number of row allowed for a pattern without nan, by default 10 + type_cols : str, optional + type of the columns ("all", "numerical", "categorical") + **kwargs : dict + additional arguments Returns ------- pd.Series _description_ + """ if type_cols == "all": cols = df1.columns @@ -1002,7 +1143,9 @@ def pattern_based_weighted_mean_metric( elif type_cols == "categorical": cols = df1.select_dtypes(exclude=["number"]).columns else: - raise ValueError(f"Value {type_cols} is not valid for parameter `type_cols`!") + raise ValueError( + f"Value {type_cols} is not valid for parameter `type_cols`!" + ) if np.any(df_mask & df1.isna()): raise ValueError("The argument df1 has missing values on the mask!") @@ -1016,7 +1159,9 @@ def pattern_based_weighted_mean_metric( df2 = df2[cols].loc[rows_mask] df_mask = df_mask[cols].loc[rows_mask] max_num_row = 0 - for tup_pattern, df_mask_pattern in df_mask.groupby(df_mask.columns.tolist()): + for tup_pattern, df_mask_pattern in df_mask.groupby( + df_mask.columns.tolist() + ): ind_pattern = df_mask_pattern.index df1_pattern = df1.loc[ind_pattern, list(tup_pattern)] max_num_row = max(max_num_row, len(df1_pattern)) @@ -1027,12 +1172,27 @@ def pattern_based_weighted_mean_metric( scores.append(metric(df1_pattern, df2_pattern, **kwargs)) if len(scores) == 0: raise NotEnoughSamples(max_num_row, min_n_rows) - return pd.Series(sum([s * w for s, w in zip(scores, weights)]), index=["All"]) + return pd.Series( + sum([s * w for s, w in zip(scores, weights)]), index=["All"] + ) def get_metric( name: str, ) -> Callable[[pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.Series]: + """Get metric. 
+ + Parameters + ---------- + name : str + name of the metric to compute + + Returns + ------- + Callable[[pd.DataFrame, pd.DataFrame, pd.DataFrame], pd.Series] + metric + + """ dict_metrics: Dict[str, Callable] = { "mse": mean_squared_error, "rmse": root_mean_squared_error, @@ -1043,7 +1203,9 @@ def get_metric( "KL_columnwise": partial(kl_divergence, method="columnwise"), "KL_gaussian": partial(kl_divergence, method="gaussian"), "KS_test": kolmogorov_smirnov_test, - "correlation_diff": mean_difference_correlation_matrix_numerical_features, + "correlation_diff": ( + mean_difference_correlation_matrix_numerical_features + ), "energy": sum_energy_distances, "frechet": partial(frechet_distance, method="single"), "frechet_pattern": partial(frechet_distance, method="pattern"), diff --git a/qolmat/benchmark/missing_patterns.py b/qolmat/benchmark/missing_patterns.py index 65b6d6ea..2f317a4a 100644 --- a/qolmat/benchmark/missing_patterns.py +++ b/qolmat/benchmark/missing_patterns.py @@ -1,19 +1,33 @@ +"""Script for missing patterns.""" + from __future__ import annotations import functools -from typing import Callable, List, Optional, Tuple, Union +import math import warnings +from typing import Callable, List, Optional, Tuple, Union import numpy as np import pandas as pd from sklearn import utils as sku -from sklearn.utils import resample -import math -from qolmat.utils.exceptions import NoMissingValue, SubsetIsAString +from qolmat.utils.exceptions import SubsetIsAString def compute_transition_counts_matrix(states: pd.Series): + """Compute transition counts matrix. + + Parameters + ---------- + states : pd.Series + possible states (masks) + + Returns + ------- + pd.Series | pd.DataFrame + transition counts matrix + + """ if isinstance(states.iloc[0], tuple): n_variables = len(states.iloc[0]) state_nonan = pd.Series([tuple([False] * n_variables)]) @@ -28,18 +42,48 @@ def compute_transition_counts_matrix(states: pd.Series): return df_counts -def compute_transition_matrix(states: pd.Series, ngroups: Optional[List] = None): +def compute_transition_matrix( + states: pd.Series, ngroups: Optional[List] = None +): + """Compute the transition matrix. + + Parameters + ---------- + states : pd.Series + series of possible states (masks) + ngroups : Optional[List], optional + groups, by default None + + Returns + ------- + pd.DataFrame | pd.Series + transition matrix + + """ if ngroups is None: df_counts = compute_transition_counts_matrix(states) else: - list_counts = [compute_transition_counts_matrix(df) for _, df in states.groupby(ngroups)] - df_counts = functools.reduce(lambda a, b: a.add(b, fill_value=0), list_counts) + list_counts = [ + compute_transition_counts_matrix(df) + for _, df in states.groupby(ngroups) + ] + df_counts = functools.reduce( + lambda a, b: a.add(b, fill_value=0), list_counts + ) df_transition = df_counts.div(df_counts.sum(axis=1), axis=0) return df_transition def get_sizes_max(values_isna: pd.Series) -> pd.Series[int]: + """Get max sizes. + + Parameters + ---------- + values_isna : pd.Series + pandas series indicating if value is missing. + + """ ids_hole = (values_isna.diff() != 0).cumsum() sizes_max = values_isna.groupby(ids_hole, group_keys=True).apply( lambda x: (~x) * np.arange(len(x)) @@ -51,14 +95,16 @@ def get_sizes_max(values_isna: pd.Series) -> pd.Series[int]: class _HoleGenerator: - """ - This abstract class implements the generic method to generate masks according to law of missing - values. + """Abstract HoleGenerator class.
+ + This abstract class implements the generic method to generate masks + according to law of missing values. Parameters ---------- n_splits : int - number of dataframes with missing additional missing values to be created + number of dataframes with missing additional missing values to be + created subset : Optional[List[str]] Names of the columns for which holes must be created, by default None ratio_masked : Optional[float] @@ -68,6 +114,7 @@ class _HoleGenerator: Pass an int for reproducible output across multiple function calls. groups: Tuple[str, ...] Column names used to group the data + """ generate_mask: Callable @@ -88,20 +135,22 @@ def __init__( self.groups = groups def fit(self, X: pd.DataFrame) -> _HoleGenerator: - """ - Fits the generator. + """Fit the generator. Parameters ---------- X : pd.DataFrame Initial dataframe with a missing pattern to be imitated. + """ self._check_subset(X) self.dict_ratios = {} missing_per_col = X[self.subset].isna().sum() self.dict_ratios = (missing_per_col / missing_per_col.sum()).to_dict() if self.groups: - self.ngroups = X.groupby(list(self.groups)).ngroup().rename("_ngroup") + self.ngroups = ( + X.groupby(list(self.groups)).ngroup().rename("_ngroup") + ) else: self.ngroups = None @@ -109,6 +158,7 @@ def fit(self, X: pd.DataFrame) -> _HoleGenerator: def split(self, X: pd.DataFrame) -> List[pd.DataFrame]: """Create a list of boolean masks representing the data to mask. + Parameters ---------- X : pd.DataFrame @@ -117,17 +167,19 @@ def split(self, X: pd.DataFrame) -> List[pd.DataFrame]: Returns ------- Dict[str, pd.DataFrame] - the initial dataframe, the dataframe with additional missing entries and the created - mask - """ + the initial dataframe, the dataframe with additional missing + entries and the created mask + """ self.fit(X) list_masks = [] for _ in range(self.n_splits): if self.ngroups is None: mask = self.generate_mask(X) else: - mask = X.groupby(self.ngroups, group_keys=False).apply(self.generate_mask) + mask = X.groupby(self.ngroups, group_keys=False).apply( + self.generate_mask + ) list_masks.append(mask) return list_masks @@ -140,8 +192,10 @@ def _check_subset(self, X: pd.DataFrame): class UniformHoleGenerator(_HoleGenerator): - """This class implements a way to generate holes in a dataframe. - The holes are generated randomly, using the resample method of scikit learn. + """UniformHoleGenerator class. + + This class implements a way to generate holes in a dataframe. + The holes are generated randomly, using the resample method of sklearn. Parameters ---------- @@ -157,6 +211,7 @@ class UniformHoleGenerator(_HoleGenerator): sample_proportional: bool, optional If True, generates holes in target columns with same equal frequency. If False, reproduces the empirical proportions between the variables. + """ def __init__( @@ -177,15 +232,14 @@ def __init__( self.sample_proportional = sample_proportional def generate_mask(self, X: pd.DataFrame) -> pd.DataFrame: - """ - Returns a mask for the dataframe at hand. + """Return a mask for the dataframe at hand. Parameters ---------- X : pd.DataFrame Initial dataframe with a missing pattern to be imitated. - """ + """ self.random_state = sku.check_random_state(self.random_state) df_mask = pd.DataFrame(False, index=X.index, columns=X.columns) @@ -206,8 +260,10 @@ def generate_mask(self, X: pd.DataFrame) -> pd.DataFrame: class _SamplerHoleGenerator(_HoleGenerator): - """This abstract class implements a generic way to generate holes in a dataframe by sampling 1D - hole size distributions. 
+ """Abstract SamplerHoleGenerator class. + + This abstract class implements a generic way to generate holes in a + dataframe by sampling 1D hole size distributions. Parameters ---------- @@ -222,6 +278,7 @@ class _SamplerHoleGenerator(_HoleGenerator): Pass an int for reproducible output across multiple function calls. groups: Tuple[str, ...] Column names used to group the data + """ sample_sizes: Callable @@ -242,18 +299,27 @@ def __init__( groups=groups, ) - def generate_hole_sizes(self, column: str, n_masked: int, sort: bool = True) -> List[int]: - """Generate a sequence of states "states" of size "size" from - a transition matrix "df_transition" + def generate_hole_sizes( + self, column: str, n_masked: int, sort: bool = True + ) -> List[int]: + """Generate a sequence of states "states" of size "size". + + Generated from a transition matrix "df_transition" Parameters ---------- - size : int - length of the output sequence + column : str + column name + n_masked: int + number of masks + sort: bool, optional + true if sort, by default True Returns ------- - List[float] + List[int] + list of hole sizes + """ sizes_sampled = self.sample_sizes(column, n_masked) sizes_sampled = sizes_sampled[sizes_sampled.cumsum() < n_masked] @@ -265,6 +331,7 @@ def generate_hole_sizes(self, column: str, n_masked: int, sort: bool = True) -> def generate_mask(self, X: pd.DataFrame) -> pd.DataFrame: """Create missing data in an arraylike object based on a markov chain. + States of the MC are the different masks of missing values: there are at most pow(2,X.shape[1]) possible states. @@ -277,6 +344,7 @@ def generate_mask(self, X: pd.DataFrame) -> pd.DataFrame: ------- mask : pd.DataFrame masked dataframe with additional missing entries + """ mask = pd.DataFrame(False, columns=X.columns, index=X.index) n_masked_col = round(self.ratio_masked * len(X)) @@ -288,14 +356,29 @@ def generate_mask(self, X: pd.DataFrame) -> pd.DataFrame: sizes_max = get_sizes_max(states) n_masked_left = n_masked_col - sizes_sampled = self.generate_hole_sizes(column, n_masked_col, sort=True) - assert sum(sizes_sampled) == n_masked_col - sizes_sampled += self.generate_hole_sizes(column, n_masked_col, sort=False) + sizes_sampled = self.generate_hole_sizes( + column, n_masked_col, sort=True + ) + if sum(sizes_sampled) != n_masked_col: + raise ValueError( + "sum of sizes_sampled is different from n_masked_col: " + f"{sum(sizes_sampled)} != {n_masked_col}." + ) + sizes_sampled += self.generate_hole_sizes( + column, n_masked_col, sort=False + ) for sample in sizes_sampled: sample = min(min(sample, sizes_max.max()), n_masked_left) i_hole = self.rng.choice(np.where(sample <= sizes_max)[0]) - assert (~mask[column].iloc[i_hole - sample : i_hole]).all() + if not (~mask[column].iloc[i_hole - sample : i_hole]).all(): + raise ValueError( + "The mask condition is not satisfied for " + f"column={column}, " + f"sample={sample}, " + f"and i_hole={i_hole}." + ) + mask[column].iloc[i_hole - sample : i_hole] = True n_masked_left -= sample @@ -308,12 +391,16 @@ def generate_mask(self, X: pd.DataFrame) -> pd.DataFrame: break if list_failed: - warnings.warn(f"No place to introduce sampled holes of size {list_failed}!") + warnings.warn( + f"No place to introduce sampled holes of size {list_failed}!" + ) return mask class GeometricHoleGenerator(_SamplerHoleGenerator): - """This class implements a way to generate holes in a dataframe. + """GeometricHoleGenerator class. + + This class implements a way to generate holes in a dataframe. 
The holes are generated following a Markov 1D process. Parameters @@ -329,6 +416,7 @@ class GeometricHoleGenerator(_SamplerHoleGenerator): Pass an int for reproducible output across multiple function calls. groups: Tuple[str, ...] Column names used to group the data + """ def __init__( @@ -348,14 +436,13 @@ def __init__( ) def fit(self, X: pd.DataFrame) -> GeometricHoleGenerator: - """ - Get the transition matrix from a list of states + """Get the transition matrix from a list of states. Parameters ---------- X : pd.DataFrame - transition matrix (stochastic matrix) current in index, next in columns - 1 is missing + transition matrix (stochastic matrix) current in index, + next in columns 1 is missing Returns @@ -373,16 +460,35 @@ def fit(self, X: pd.DataFrame) -> GeometricHoleGenerator: return self - def sample_sizes(self, column, n_masked): + def sample_sizes(self, column: str, n_masked: int): + """Sample sizes. + + Parameters + ---------- + column : str + column name + n_masked : int + number of masks + + Returns + ------- + pd.Series + sizes sampled + + """ proba_out = self.dict_probas_out[column] mean_size = 1 / proba_out n_holes = 2 * round(n_masked / mean_size) - sizes_sampled = pd.Series(self.rng.geometric(p=proba_out, size=n_holes)) + sizes_sampled = pd.Series( + self.rng.geometric(p=proba_out, size=n_holes) + ) return sizes_sampled class EmpiricalHoleGenerator(_SamplerHoleGenerator): - """This class implements a way to generate holes in a dataframe. + """EmpiricalHoleGenerator class. + + This class implements a way to generate holes in a dataframe. The distribution of holes is learned from the data. The distributions are learned column by column. @@ -399,6 +505,7 @@ class EmpiricalHoleGenerator(_SamplerHoleGenerator): Pass an int for reproducible output across multiple function calls. groups: Tuple[str, ...] Column names used to group the data + """ def __init__( @@ -418,6 +525,19 @@ def __init__( ) def compute_distribution_holes(self, states: pd.Series) -> pd.Series: + """Compute the hole distribution. + + Parameters + ---------- + states : pd.Series + Series of states. + + Returns + ------- + pd.Series + hole distribution + + """ series_id = (states.diff() != 0).cumsum() series_id = series_id[states] distribution_holes = series_id.value_counts().value_counts() @@ -427,7 +547,8 @@ def compute_distribution_holes(self, states: pd.Series) -> pd.Series: def fit(self, X: pd.DataFrame) -> EmpiricalHoleGenerator: """Compute the holes sizes of a dataframe. - Dataframe df has only one column + + Dataframe df has only one column. Parameters ---------- @@ -438,6 +559,7 @@ def fit(self, X: pd.DataFrame) -> EmpiricalHoleGenerator: ------- EmpiricalTimeHoleGenerator The model itself + """ super().fit(X) @@ -445,42 +567,54 @@ def fit(self, X: pd.DataFrame) -> EmpiricalHoleGenerator: for column in self.subset: states = X[column].isna() if self.ngroups is None: - self.dict_distributions_holes[column] = self.compute_distribution_holes(states) + self.dict_distributions_holes[column] = ( + self.compute_distribution_holes(states) + ) else: distributions_holes = states.groupby(self.ngroups).apply( self.compute_distribution_holes ) - distributions_holes = distributions_holes.groupby(by="_size_hole").sum() + distributions_holes = distributions_holes.groupby( + by="_size_hole" + ).sum() self.dict_distributions_holes[column] = distributions_holes return self def sample_sizes(self, column, n_masked): - """Create missing data in an arraylike object based on the holes size distribution. 
+ """Create missing data based on the holes size distribution. Parameters ---------- column : str name of the column to fill with holes - nb_holes : Optional[int], optional - number of holes to create, by default 10 + n_masked :int + number of masks Returns ------- samples_sizes : List[int] + """ distribution_holes = self.dict_distributions_holes[column] distribution_holes /= distribution_holes.sum() - mean_size = (distribution_holes.values * distribution_holes.index.values).sum() + mean_size = ( + distribution_holes.values * distribution_holes.index.values + ).sum() n_samples = 2 * round(n_masked / mean_size) - sizes_sampled = self.rng.choice(distribution_holes.index, n_samples, p=distribution_holes) + sizes_sampled = self.rng.choice( + distribution_holes.index, n_samples, p=distribution_holes + ) return sizes_sampled class MultiMarkovHoleGenerator(_HoleGenerator): - """This class implements a way to generate holes in a dataframe. + """MultiMarkovHoleGenerator class. + + This class implements a way to generate holes in a dataframe. The holes are generated according to a Markov process. - Each line of the dataframe mask (np.nan) represents a state of the Markov chain. + Each line of the dataframe mask (np.nan) represents a state of the + Markov chain. Parameters ---------- @@ -495,6 +629,7 @@ class MultiMarkovHoleGenerator(_HoleGenerator): Pass an int for reproducible output across multiple function calls. groups: Tuple[str, ...] Column names used to group the data + """ def __init__( @@ -514,7 +649,8 @@ def __init__( ) def fit(self, X: pd.DataFrame) -> MultiMarkovHoleGenerator: - """ + """Get the transition matrix. + Get the transition matrix from a list of states transition matrix (stochastic matrix) current in index, next in columns 1 is missing @@ -522,6 +658,7 @@ def fit(self, X: pd.DataFrame) -> MultiMarkovHoleGenerator: Parameters ---------- X : pd.DataFrame + input dataframe Returns ------- @@ -533,28 +670,34 @@ def fit(self, X: pd.DataFrame) -> MultiMarkovHoleGenerator: states = X[self.subset].isna().apply(lambda x: tuple(x), axis=1) self.df_transition = compute_transition_matrix(states, self.ngroups) - self.df_transition.index = pd.MultiIndex.from_tuples(self.df_transition.index) - self.df_transition.columns = pd.MultiIndex.from_tuples(self.df_transition.columns) + self.df_transition.index = pd.MultiIndex.from_tuples( + self.df_transition.index + ) + self.df_transition.columns = pd.MultiIndex.from_tuples( + self.df_transition.columns + ) return self - def generate_multi_realisation(self, n_masked: int) -> List[List[Tuple[bool, ...]]]: - """Generate a sequence of states "states" of size "size" - from a transition matrix "df_transition" + def generate_multi_realisation( + self, n_masked: int + ) -> List[List[Tuple[bool, ...]]]: + """Generate a sequence of states "states" of size "size". + + Generated from a transition matrix "df_transition" Parameters ---------- - df_transition : pd.DataFrame - transition matrix (stochastic matrix) - size : int - length of the output sequence + n_masked : int + number of masks. Returns ------- realisation ; List[int] sequence of states + """ - states = sorted(list(self.df_transition.index)) + states = sorted(self.df_transition.index) state_nona = tuple([False] * len(states[0])) state = state_nona @@ -564,7 +707,9 @@ def generate_multi_realisation(self, n_masked: int) -> List[List[Tuple[bool, ... 
realisation = [] while True: probas = self.df_transition.loc[state, :].values - state = np.random.choice(self.df_transition.columns, 1, p=probas)[0] + state = np.random.choice( + self.df_transition.columns, 1, p=probas + )[0] if state == state_nona: break else: @@ -576,6 +721,7 @@ def generate_multi_realisation(self, n_masked: int) -> List[List[Tuple[bool, ... def generate_mask(self, X: pd.DataFrame) -> List[pd.DataFrame]: """Create missing data in an arraylike object based on a markov chain. + States of the MC are the different masks of missing values: there are at most pow(2,X.shape[1]) possible states. @@ -587,13 +733,15 @@ def generate_mask(self, X: pd.DataFrame) -> List[pd.DataFrame]: Returns ------- Dict[str, pd.DataFrame] - the initial dataframe, the dataframe with additional missing entries and the created - mask - """ + the initial dataframe, the dataframe with additional missing + entries and the created mask + """ self.rng = sku.check_random_state(self.random_state) X_subset = X[self.subset] - mask = pd.DataFrame(False, columns=X_subset.columns, index=X_subset.index) + mask = pd.DataFrame( + False, columns=X_subset.columns, index=X_subset.index + ) values_hasna = X_subset.isna().any(axis=1) @@ -608,7 +756,11 @@ def generate_mask(self, X: pd.DataFrame) -> List[pd.DataFrame]: size_hole = min(size_hole, sizes_max.max()) realisation = realisation[:size_hole] i_hole = self.rng.choice(np.where(size_hole <= sizes_max)[0]) - assert (~mask.iloc[i_hole - size_hole : i_hole]).all().all() + if not (~mask.iloc[i_hole - size_hole : i_hole]).all().all(): + raise ValueError( + f"The mask condition is not satisfied for i_hole={i_hole} " + f"and size_hole={size_hole}." + ) if size_hole != 0: mask.iloc[i_hole - size_hole : i_hole] = mask.iloc[ i_hole - size_hole : i_hole @@ -629,7 +781,9 @@ def generate_mask(self, X: pd.DataFrame) -> List[pd.DataFrame]: class GroupedHoleGenerator(_HoleGenerator): - """This class implements a way to generate holes in a dataframe. + """GroupedHoleGenerator class. + + This class implements a way to generate holes in a dataframe. The holes are generated from groups, specified by the user. Parameters @@ -645,6 +799,7 @@ class GroupedHoleGenerator(_HoleGenerator): Pass an int for reproducible output across multiple function calls. groups : Tuple[str, ...] Names of the columns forming the groups, by default [] + """ def __init__( @@ -667,11 +822,12 @@ def __init__( raise Exception("Argument groups is an empty tuple!") def fit(self, X: pd.DataFrame) -> GroupedHoleGenerator: - """Create the groups based on the column names (groups attribute) + """Create the groups based on the column names (groups attribute). Parameters ---------- X : pd.DataFrame + input dataframe Returns ------- @@ -681,33 +837,41 @@ def fit(self, X: pd.DataFrame) -> GroupedHoleGenerator: Raises ------ if the number of samples/splits is greater than the number of groups. - """ + """ super().fit(X) if self.n_splits > self.ngroups.nunique(): - raise ValueError("n_samples has to be smaller than the number of groups.") + raise ValueError( + "n_samples has to be smaller than the number of groups." + ) return self def split(self, X: pd.DataFrame) -> List[pd.DataFrame]: - """creates masked dataframes + """Create masked dataframes. 
Parameters ---------- X : pd.DataFrame + input dataframe Returns ------- List[pd.DataFrame] list of masks + """ self.fit(X) - group_sizes = X.groupby(self.ngroups, group_keys=False).count().mean(axis=1) + group_sizes = ( + X.groupby(self.ngroups, group_keys=False).count().mean(axis=1) + ) list_masks = [] for _ in range(self.n_splits): - shuffled_group_sizes = group_sizes.sample(frac=1, random_state=self.random_state) + shuffled_group_sizes = group_sizes.sample( + frac=1, random_state=self.random_state + ) ratio_masks = shuffled_group_sizes.cumsum() / len(X) ratio_masks = ratio_masks.reset_index(name="ratio") @@ -715,7 +879,9 @@ def split(self, X: pd.DataFrame) -> List[pd.DataFrame]: closest_ratio_mask = ratio_masks.iloc[ (ratio_masks["ratio"] - self.ratio_masked).abs().argsort()[:1] ] - groups_masked = ratio_masks.iloc[: closest_ratio_mask.index[0], :]["_ngroup"].values + groups_masked = ratio_masks.iloc[: closest_ratio_mask.index[0], :][ + "_ngroup" + ].values if closest_ratio_mask.index[0] == 0: groups_masked = ratio_masks.iloc[:1, :]["_ngroup"].values diff --git a/qolmat/imputations/diffusions/base.py b/qolmat/imputations/diffusions/base.py index 84fe339d..1b6a9abd 100644 --- a/qolmat/imputations/diffusions/base.py +++ b/qolmat/imputations/diffusions/base.py @@ -1,19 +1,24 @@ +"""Script for base classes.""" + +import math from typing import Tuple + import torch -import math class ResidualBlock(torch.nn.Module): - """Residual block based on the work of Gorishniy et al., 2023 + """ResidualBlock. + + Based on the work of Gorishniy et al., 2023 (https://arxiv.org/abs/2106.11959). We follow the implementation found in - https://github.com/Yura52/rtdl/blob/main/rtdl/nn/_backbones.py""" + https://github.com/Yura52/rtdl/blob/main/rtdl/nn/_backbones.py + """ - def __init__(self, dim_input: int, dim_embedding: int = 128, p_dropout: float = 0.0): - """Residual block based on the work of Gorishniy et al., 2023 - (https://arxiv.org/abs/2106.11959). - We follow the implementation found in - https://github.com/Yura52/rtdl/blob/main/rtdl/nn/_backbones.py + def __init__( + self, dim_input: int, dim_embedding: int = 128, p_dropout: float = 0.0 + ): + """Init funciton. Parameters ---------- @@ -23,8 +28,8 @@ def __init__(self, dim_input: int, dim_embedding: int = 128, p_dropout: float = Embedding dimension, by default 128 p_dropout : float, optional Dropout probability, by default 0.1 - """ + """ super().__init__() self.layer_norm = torch.nn.LayerNorm(dim_input) @@ -34,8 +39,10 @@ def __init__(self, dim_input: int, dim_embedding: int = 128, p_dropout: float = self.linear_out = torch.nn.Linear(dim_embedding, dim_input) - def forward(self, x: torch.Tensor, t: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - """Return an output of a residual block + def forward( + self, x: torch.Tensor, t: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Return an output of a residual block. Parameters ---------- @@ -48,8 +55,8 @@ def forward(self, x: torch.Tensor, t: torch.Tensor) -> Tuple[torch.Tensor, torch ------- Tuple[torch.Tensor, torch.Tensor] Output data at noise step t - """ + """ x_t = self.layer_norm(x + t) x_t_emb = torch.nn.functional.relu(self.linear_in(x_t)) x_t_emb = self.dropout(x_t_emb) @@ -59,12 +66,15 @@ def forward(self, x: torch.Tensor, t: torch.Tensor) -> Tuple[torch.Tensor, torch class ResidualBlockTS(torch.nn.Module): - """Residual block based on the work of Gorishniy et al., 2023 + """Residual block time series. 
+ + Residual block based on the work of Gorishniy et al., 2023 (https://arxiv.org/abs/2106.11959). We follow the implementation found in https://github.com/Yura52/rtdl/blob/main/rtdl/nn/_backbones.py This class is for Time-Series data where we add Tranformers to - encode time-based/feature-based context.""" + encode time-based/feature-based context. + """ def __init__( self, @@ -76,12 +86,7 @@ def __init__( nheads_time: int = 8, num_layers_transformer: int = 1, ): - """Residual block based on the work of Gorishniy et al., 2023 - (https://arxiv.org/abs/2106.11959). - We follow the implementation found in - https://github.com/Yura52/rtdl/blob/main/rtdl/nn/_backbones.py - This class is for Time-Series data where we add Tranformers to - encode time-based/feature-based context. + """Init function. Parameters ---------- @@ -99,6 +104,7 @@ def __init__( Number of heads to encode time-based context, by default 8 num_layers_transformer : int, optional Number of transformer layer, by default 1 + """ super().__init__() @@ -118,8 +124,10 @@ def __init__( self.linear_out = torch.nn.Linear(dim_embedding, dim_input) - def forward(self, x: torch.Tensor, t: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - """Return an output of a residual block + def forward( + self, x: torch.Tensor, t: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Return an output of a residual block. Parameters ---------- @@ -132,12 +140,15 @@ def forward(self, x: torch.Tensor, t: torch.Tensor) -> Tuple[torch.Tensor, torch ------- torch.Tensor Data output, noise predicted + """ batch_size, size_window, dim_emb = x.shape x_emb = self.layer_norm(x) x_emb_time = self.time_layer(x_emb) - t_emb = t.repeat(1, size_window).reshape(batch_size, size_window, dim_emb) + t_emb = t.repeat(1, size_window).reshape( + batch_size, size_window, dim_emb + ) x_t = x + x_emb_time + t_emb x_t = self.linear_out(x_t) @@ -146,11 +157,14 @@ def forward(self, x: torch.Tensor, t: torch.Tensor) -> Tuple[torch.Tensor, torch class AutoEncoder(torch.nn.Module): - """Epsilon_theta model of the Algorithm 1 in + """Auto encoder class. + + Epsilon_theta model of the Algorithm 1 in Ho et al., 2020 (https://arxiv.org/abs/2006.11239). This implementation is based on the work of Tashiro et al., 2021 (https://arxiv.org/abs/2107.03502). - Their code: https://github.com/ermongroup/CSDI/blob/main/diff_models.py""" + Their code: https://github.com/ermongroup/CSDI/blob/main/diff_models.py + """ def __init__( self, @@ -161,8 +175,7 @@ def __init__( num_blocks: int = 1, p_dropout: float = 0.0, ): - """Epsilon_theta model in Algorithm 1 in - Ho et al., 2020 (https://arxiv.org/abs/2006.11239) + """Init function. Parameters ---------- @@ -170,12 +183,15 @@ def __init__( Number of steps in forward/reverse processes dim_input : int Input dimension + residual_block: torch.nn.Module + residual blocks dim_embedding : int, optional Embedding dimension, by default 128 num_blocks : int, optional Number of residual blocks, by default 1 p_dropout : float, optional Dropout probability, by default 0.0 + """ super().__init__() @@ -193,10 +209,12 @@ def __init__( self.layer_out_2 = torch.nn.Linear(dim_embedding, dim_input) self.dropout_out = torch.nn.Dropout(p_dropout) - self.residual_layers = torch.nn.ModuleList([residual_block for _ in range(num_blocks)]) + self.residual_layers = torch.nn.ModuleList( + [residual_block for _ in range(num_blocks)] + ) def forward(self, x: torch.Tensor, t: torch.LongTensor) -> torch.Tensor: - """Predict a noise + """Predict a noise. 
Parameters ---------- @@ -209,6 +227,7 @@ def forward(self, x: torch.Tensor, t: torch.LongTensor) -> torch.Tensor: ------- torch.Tensor Data output, noise predicted + """ # Noise step embedding t_emb = torch.as_tensor(self.embedding_noise_step)[t].squeeze() @@ -224,15 +243,20 @@ def forward(self, x: torch.Tensor, t: torch.LongTensor) -> torch.Tensor: x_emb, skip_connection = layer(x_emb, t_emb) skip.append(skip_connection) - out = torch.sum(torch.stack(skip), dim=0) / math.sqrt(len(self.residual_layers)) + out = torch.sum(torch.stack(skip), dim=0) / math.sqrt( + len(self.residual_layers) + ) out = torch.nn.functional.relu(self.layer_out_1(out)) out = self.dropout_out(out) out = self.layer_out_2(out) return out - def _build_embedding(self, num_noise_steps: int, dim: int = 64) -> torch.Tensor: + def _build_embedding( + self, num_noise_steps: int, dim: int = 64 + ) -> torch.Tensor: """Build an embedding for noise step. + More details in section E.1 of Tashiro et al., 2021 (https://arxiv.org/abs/2107.03502) @@ -247,9 +271,14 @@ def _build_embedding(self, num_noise_steps: int, dim: int = 64) -> torch.Tensor: ------- torch.Tensor List of embeddings for noise steps + """ steps = torch.arange(num_noise_steps).unsqueeze(1) # (T,1) - frequencies = 10.0 ** (torch.arange(dim) / (dim - 1) * 4.0).unsqueeze(0) # (1,dim) + frequencies = 10.0 ** (torch.arange(dim) / (dim - 1) * 4.0).unsqueeze( + 0 + ) # (1,dim) table = steps * frequencies # (T,dim) - table = torch.cat([torch.sin(table), torch.cos(table)], dim=1) # (T,dim*2) + table = torch.cat( + [torch.sin(table), torch.cos(table)], dim=1 + ) # (T,dim*2) return table diff --git a/qolmat/imputations/diffusions/ddpms.py b/qolmat/imputations/diffusions/ddpms.py index 231f870e..4f8728e9 100644 --- a/qolmat/imputations/diffusions/ddpms.py +++ b/qolmat/imputations/diffusions/ddpms.py @@ -1,25 +1,31 @@ -from typing import Dict, List, Callable, Tuple, Union -from typing_extensions import Self -import sys -import numpy as np -import pandas as pd +"""Script for DDPM classes.""" + import time from datetime import timedelta -from tqdm import tqdm +from typing import Callable, Dict, List, Tuple, Union +import numpy as np +import pandas as pd import torch -from torch.utils.data import DataLoader, TensorDataset from sklearn import preprocessing from sklearn import utils as sku +from torch.utils.data import DataLoader, TensorDataset +from tqdm import tqdm - -from qolmat.imputations.diffusions.base import AutoEncoder, ResidualBlock, ResidualBlockTS +# from typing_extensions import Self +from qolmat.benchmark import metrics, missing_patterns +from qolmat.imputations.diffusions.base import ( + AutoEncoder, + ResidualBlock, + ResidualBlockTS, +) from qolmat.imputations.diffusions.utils import get_num_params -from qolmat.benchmark import missing_patterns, metrics class TabDDPM: - """Diffusion model for tabular data based on + """Tab DDPM. + + Diffusion model for tabular data based on Denoising Diffusion Probabilistic Models (DDPM) of Ho et al., 2020 (https://arxiv.org/abs/2006.11239), Tashiro et al., 2021 (https://arxiv.org/abs/2107.03502). @@ -42,13 +48,7 @@ def __init__( is_clip: bool = True, random_state: Union[None, int, np.random.RandomState] = None, ): - """Diffusion model for tabular data based on - Denoising Diffusion Probabilistic Models (DDPM) of - Ho et al., 2020 (https://arxiv.org/abs/2006.11239), - Tashiro et al., 2021 (https://arxiv.org/abs/2107.03502). 
- This implementation follows the implementations found in - https://github.com/quickgrid/pytorch-diffusion/tree/main, - https://github.com/ermongroup/CSDI/tree/main + """Init function. Parameters ---------- @@ -70,11 +70,18 @@ def __init__( Dropout probability, by default 0.0 num_sampling : int, optional Number of samples generated for each cell, by default 1 + is_clip : bool, optional + if values have to be clipped, by default True random_state : int, RandomState instance or None, default=None Controls the randomness. Pass an int for reproducible output across multiple function calls. + """ - self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + self.device = ( + torch.device("cuda") + if torch.cuda.is_available() + else torch.device("cpu") + ) # Hyper-parameters for DDPM # Section 2, equation 1, num_noise_steps is T. @@ -92,7 +99,8 @@ def __init__( self.alpha = 1 - self.beta self.alpha_hat = torch.cumprod(self.alpha, dim=0) - # Section 3.2, algorithm 1 formula implementation. Generate values early reuse later. + # Section 3.2, algorithm 1 formula implementation. + # Generate values early reuse later. self.sqrt_alpha_hat = torch.sqrt(self.alpha_hat) self.sqrt_one_minus_alpha_hat = torch.sqrt(1 - self.alpha_hat) @@ -117,10 +125,14 @@ def __init__( seed_torch = self.random_state.randint(2**31 - 1) torch.manual_seed(seed_torch) - def _q_sample(self, x: torch.Tensor, t: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - """Section 3.2, algorithm 1 formula implementation. Forward process, defined by `q`. - Found in section 2. `q` gradually adds gaussian noise according to variance schedule. Also, - can be seen on figure 2. + def _q_sample( + self, x: torch.Tensor, t: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Sample q. + + Section 3.2, algorithm 1 formula implementation. Forward process, + defined by `q`. Found in section 2. `q` gradually adds gaussian noise + according to variance schedule. Also, can be seen on figure 2. Ho et al., 2020 (https://arxiv.org/abs/2006.11239) Parameters @@ -134,8 +146,8 @@ def _q_sample(self, x: torch.Tensor, t: torch.Tensor) -> Tuple[torch.Tensor, tor ------- Tuple[torch.Tensor, torch.Tensor] Noised data at noise step t - """ + """ sqrt_alpha_hat = self.sqrt_alpha_hat[t].view(-1, 1) sqrt_one_minus_alpha_hat = self.sqrt_one_minus_alpha_hat[t].view(-1, 1) @@ -146,16 +158,20 @@ def _set_eps_model(self) -> None: self._eps_model = AutoEncoder( num_noise_steps=self.num_noise_steps, dim_input=self.dim_input, - residual_block=ResidualBlock(self.dim_embedding, self.dim_embedding, self.p_dropout), + residual_block=ResidualBlock( + self.dim_embedding, self.dim_embedding, self.p_dropout + ), dim_embedding=self.dim_embedding, num_blocks=self.num_blocks, p_dropout=self.p_dropout, ).to(self.device) - self.optimiser = torch.optim.Adam(self._eps_model.parameters(), lr=self.lr) + self.optimiser = torch.optim.Adam( + self._eps_model.parameters(), lr=self.lr + ) def _print_valid(self, epoch: int, time_duration: float) -> None: - """Print model performance on validation data + """Print model performance on validation data. 
Parameters ---------- @@ -163,22 +179,31 @@ def _print_valid(self, epoch: int, time_duration: float) -> None: Epoch of the printed performance time_duration : float Duration for training step + """ self.time_durations.append(time_duration) print_step = 1 if int(self.epochs / 10) == 0 else int(self.epochs / 10) if self.print_valid and epoch == 0: - print(f"Num params of {self.__class__.__name__}: {self.num_params}") + print( + f"Num params of {self.__class__.__name__}: {self.num_params}" + ) if self.print_valid and epoch % print_step == 0: string_valid = f"Epoch {epoch}: " for s in self.summary: - string_valid += f" {s}={round(self.summary[s][epoch], self.round)}" + string_valid += ( + f" {s}={round(self.summary[s][epoch], self.round)}" + ) # string_valid += f" | in {round(time_duration, 3)} secs" - remaining_duration = np.mean(self.time_durations) * (self.epochs - epoch) - string_valid += f" | remaining {timedelta(seconds=remaining_duration)}" + remaining_duration = np.mean(self.time_durations) * ( + self.epochs - epoch + ) + string_valid += ( + f" | remaining {timedelta(seconds=remaining_duration)}" + ) print(string_valid) def _impute(self, x: np.ndarray, x_mask_obs: np.ndarray) -> np.ndarray: - """Impute data array + """Impute data array. Parameters ---------- @@ -191,6 +216,7 @@ def _impute(self, x: np.ndarray, x_mask_obs: np.ndarray) -> np.ndarray: ------- np.ndarray Imputed data + """ x_tensor = torch.from_numpy(x).float().to(self.device) x_mask_tensor = torch.from_numpy(x_mask_obs).float().to(self.device) @@ -207,37 +233,55 @@ def _impute(self, x: np.ndarray, x_mask_obs: np.ndarray) -> np.ndarray: for i in reversed(range(1, self.num_noise_steps)): t = ( - torch.ones((x_batch.size(dim=0), 1), dtype=torch.long, device=self.device) + torch.ones( + (x_batch.size(dim=0), 1), + dtype=torch.long, + device=self.device, + ) * i ) if len(x_batch.size()) == 3: - # Data are splited into chunks (i.e., Time-series data), a window of rows + # Data are splited into chunks + # (i.e., Time-series data), + # a window of rows # is processed. sqrt_alpha_t = self.sqrt_alpha[t].view(-1, 1, 1) beta_t = self.beta[t].view(-1, 1, 1) - sqrt_one_minus_alpha_hat_t = self.sqrt_one_minus_alpha_hat[t].view( - -1, 1, 1 + sqrt_one_minus_alpha_hat_t = ( + self.sqrt_one_minus_alpha_hat[t].view(-1, 1, 1) ) epsilon_t = self.std_beta[t].view(-1, 1, 1) else: # Each row of data is separately processed. 
sqrt_alpha_t = self.sqrt_alpha[t].view(-1, 1) beta_t = self.beta[t].view(-1, 1) - sqrt_one_minus_alpha_hat_t = self.sqrt_one_minus_alpha_hat[t].view(-1, 1) + sqrt_one_minus_alpha_hat_t = ( + self.sqrt_one_minus_alpha_hat[t].view(-1, 1) + ) epsilon_t = self.std_beta[t].view(-1, 1) - random_noise = torch.randn_like(noise) if i > 1 else torch.zeros_like(noise) + random_noise = ( + torch.randn_like(noise) + if i > 1 + else torch.zeros_like(noise) + ) noise = ( (1 / sqrt_alpha_t) * ( noise - - ((beta_t / sqrt_one_minus_alpha_hat_t) * self._eps_model(noise, t)) + - ( + (beta_t / sqrt_one_minus_alpha_hat_t) + * self._eps_model(noise, t) + ) ) ) + (epsilon_t * random_noise) - noise = mask_x_batch * x_batch + (1.0 - mask_x_batch) * noise + noise = ( + mask_x_batch * x_batch + (1.0 - mask_x_batch) * noise + ) - # Generate data output, this activation function depends on normalizer_x + # Generate data output, this activation function depends on + # normalizer_x x_out = noise.detach().cpu().numpy() outputs.append(x_out) @@ -252,7 +296,7 @@ def _eval( x_mask_obs_df: pd.DataFrame, x_indices: List, ) -> Dict: - """Evaluate the model + """Evaluate the model. Parameters ---------- @@ -271,8 +315,8 @@ def _eval( ------- Dict Scores - """ + """ list_x_imputed = [] for i in tqdm(range(self.num_sampling), disable=True, leave=False): x_imputed = self._impute(x, x_mask_obs) @@ -289,7 +333,9 @@ def _eval( x_final.loc[x_out.index] = x_out.loc[x_out.index] x_mask_imputed_df = ~x_mask_obs_df - columns_with_True = x_mask_imputed_df.columns[(x_mask_imputed_df == True).any()] + columns_with_True = x_mask_imputed_df.columns[ + (x_mask_imputed_df).any() + ] scores = {} for metric in self.metrics_valid: scores[metric.__name__] = metric( @@ -300,9 +346,12 @@ def _eval( return scores def _process_data( - self, x: pd.DataFrame, mask: pd.DataFrame = None, is_training: bool = False + self, + x: pd.DataFrame, + mask: pd.DataFrame = None, + is_training: bool = False, ) -> Tuple[np.ndarray, np.ndarray, List]: - """Pre-process data + """Pre-process data. Parameters ---------- @@ -317,10 +366,13 @@ def _process_data( ------- Tuple[np.ndarray, np.ndarray] Data and mask pre-processed + """ if is_training: self.normalizer_x.fit(x.values) - x_windows_processed = self.normalizer_x.transform(x.fillna(x.mean()).values) + x_windows_processed = self.normalizer_x.transform( + x.fillna(x.mean()).values + ) x_windows_mask_processed = ~x.isna().to_numpy() if mask is not None: x_windows_mask_processed = mask.to_numpy() @@ -332,7 +384,9 @@ def _process_reversely_data( ): x_normalized = self.normalizer_x.inverse_transform(x_imputed) x_normalized = x_normalized[: x_input.shape[0]] - x_out = pd.DataFrame(x_normalized, columns=self.columns, index=x_input.index) + x_out = pd.DataFrame( + x_normalized, columns=self.columns, index=x_input.index + ) x_final = x_input.copy() x_final.loc[x_out.index] = x_out.loc[x_out.index] @@ -352,8 +406,8 @@ def fit( ), round: int = 10, cols_imputed: Tuple[str, ...] = (), - ) -> Self: - """Fit data + ) -> "TabDDPM": + """Fit data. 
Parameters ---------- @@ -368,8 +422,8 @@ def fit( x_valid : pd.DataFrame, optional Dataframe for validation, by default None metrics_valid : Tuple[Callable, ...], optional - Set of validation metrics, by default ( metrics.mean_absolute_error, - metrics.dist_wasserstein ) + Set of validation metrics, by default (metrics.mean_absolute_error, + metrics.dist_wasserstein) round : int, optional Number of decimal places to round to, for better displaying model performance, by default 10 @@ -380,10 +434,12 @@ def fit( ------ ValueError Batch size is larger than data size + Returns ------- Self Return Self + """ self.dim_input = len(x.columns) self.epochs = epochs @@ -398,23 +454,29 @@ def fit( if len(self.cols_imputed) != 0: self.cols_idx_not_imputed = [ - idx for idx, col in enumerate(self.columns) if col not in self.cols_imputed + idx + for idx, col in enumerate(self.columns) + if col not in self.cols_imputed ] - self.interval_x = {col: [x[col].min(), x[col].max()] for col in self.columns} + self.interval_x = { + col: [x[col].min(), x[col].max()] for col in self.columns + } # x_mask: 1 for observed values, 0 for nan x_processed, x_mask, _ = self._process_data(x, is_training=True) if self.batch_size > x_processed.shape[0]: raise ValueError( - f"Batch size {self.batch_size} larger than size of pre-processed x" - + f" size={x_processed.shape[0]}. Please reduce batch_size." - + " In the case of TabDDPMTS, you can also reduce freq_str." + f"Batch size {self.batch_size} larger than size of " + "pre-processed x " + f"size={x_processed.shape[0]}. Please reduce batch_size. " + "In the case of TabDDPMTS, you can also reduce freq_str." ) if x_valid is not None: - # We reuse the UniformHoleGenerator to generate artificial holes (with one mask) + # We reuse the UniformHoleGenerator to generate artificial holes + # (with one mask) # in validation dataset x_valid_mask = missing_patterns.UniformHoleGenerator( n_splits=1, ratio_masked=self.ratio_nan @@ -425,7 +487,9 @@ def fit( x_processed_valid, x_processed_valid_obs_mask, x_processed_valid_indices, - ) = self._process_data(x_valid, x_valid_obs_mask, is_training=False) + ) = self._process_data( + x_valid, x_valid_obs_mask, is_training=False + ) x_tensor = torch.from_numpy(x_processed).float().to(self.device) x_mask_tensor = torch.from_numpy(x_mask).float().to(self.device) @@ -447,7 +511,10 @@ def fit( time_start = time.time() self._eps_model.train() for id_batch, (x_batch, mask_x_batch) in enumerate(dataloader): - mask_obs_rand = torch.FloatTensor(mask_x_batch.size()).uniform_() > self.ratio_nan + mask_obs_rand = ( + torch.FloatTensor(mask_x_batch.size()).uniform_() + > self.ratio_nan + ) for col in self.cols_idx_not_imputed: mask_obs_rand[:, col] = 0.0 mask_x_batch = mask_x_batch * mask_obs_rand.to(self.device) @@ -461,7 +528,9 @@ def fit( ) x_batch_t, noise = self._q_sample(x=x_batch, t=t) predicted_noise = self._eps_model(x=x_batch_t, t=t) - loss = (self.loss_func(predicted_noise, noise) * mask_x_batch).mean() + loss = ( + self.loss_func(predicted_noise, noise) * mask_x_batch + ).mean() loss.backward() self.optimiser.step() loss_epoch += loss.item() @@ -487,7 +556,7 @@ def fit( return self def predict(self, x: pd.DataFrame) -> pd.DataFrame: - """Predict/impute data + """Predict/impute data. 
Parameters ---------- @@ -498,10 +567,13 @@ def predict(self, x: pd.DataFrame) -> pd.DataFrame: ------- pd.DataFrame Imputed data + """ self._eps_model.eval() - x_processed, x_mask, x_indices = self._process_data(x, is_training=False) + x_processed, x_mask, x_indices = self._process_data( + x, is_training=False + ) list_x_imputed = [] for i in tqdm(range(self.num_sampling), leave=False): @@ -519,7 +591,9 @@ def predict(self, x: pd.DataFrame) -> pd.DataFrame: class TsDDPM(TabDDPM): - """Diffusion model for time-series data based on + """Time series DDPM. + + Diffusion model for time-series data based on Denoising Diffusion Probabilistic Models (DDPMs) of Ho et al., 2020 (https://arxiv.org/abs/2006.11239), Tashiro et al., 2021 (https://arxiv.org/abs/2107.03502). @@ -546,12 +620,7 @@ def __init__( is_rolling: bool = False, random_state: Union[None, int, np.random.RandomState] = None, ): - """Diffusion model for time-series data based on the works of - Ho et al., 2020 (https://arxiv.org/abs/2006.11239), - Tashiro et al., 2021 (https://arxiv.org/abs/2107.03502). - This implementation follows the implementations found in - https://github.com/quickgrid/pytorch-diffusion/tree/main, - https://github.com/ermongroup/CSDI/tree/main + """Init function. Parameters ---------- @@ -582,10 +651,12 @@ def __init__( num_sampling : int, optional Number of samples generated for each cell, by default 1 is_rolling : bool, optional - Use pandas.DataFrame.rolling for preprocessing data, by default False + Use pandas.DataFrame.rolling for preprocessing data, + by default False random_state : int, RandomState instance or None, default=None Controls the randomness. Pass an int for reproducible output across multiple function calls. + """ super().__init__( num_noise_steps, @@ -606,10 +677,14 @@ def __init__( self.num_layers_transformer = num_layers_transformer self.is_rolling = is_rolling - def _q_sample(self, x: torch.Tensor, t: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - """Section 3.2, algorithm 1 formula implementation. Forward process, defined by `q`. - Found in section 2. `q` gradually adds gaussian noise according to variance schedule. Also, - can be seen on figure 2. + def _q_sample( + self, x: torch.Tensor, t: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Sample q. + + Section 3.2, algorithm 1 formula implementation. Forward process, + defined by `q`. Found in section 2. `q` gradually adds gaussian noise + according to variance schedule. Also, can be seen on figure 2. 
Parameters ---------- @@ -622,10 +697,12 @@ def _q_sample(self, x: torch.Tensor, t: torch.Tensor) -> Tuple[torch.Tensor, tor ------- Tuple[torch.Tensor, torch.Tensor] Noised data at noise step t - """ + """ sqrt_alpha_hat = self.sqrt_alpha_hat[t].view(-1, 1, 1) - sqrt_one_minus_alpha_hat = self.sqrt_one_minus_alpha_hat[t].view(-1, 1, 1) + sqrt_one_minus_alpha_hat = self.sqrt_one_minus_alpha_hat[t].view( + -1, 1, 1 + ) epsilon = torch.randn_like(x, device=self.device) return sqrt_alpha_hat * x + sqrt_one_minus_alpha_hat * epsilon, epsilon @@ -648,12 +725,17 @@ def _set_eps_model(self): p_dropout=self.p_dropout, ).to(self.device) - self.optimiser = torch.optim.Adam(self._eps_model.parameters(), lr=self.lr) + self.optimiser = torch.optim.Adam( + self._eps_model.parameters(), lr=self.lr + ) def _process_data( - self, x: pd.DataFrame, mask: pd.DataFrame = None, is_training: bool = False + self, + x: pd.DataFrame, + mask: pd.DataFrame = None, + is_training: bool = False, ) -> Tuple[np.ndarray, np.ndarray, List]: - """Pre-process data + """Pre-process data. Parameters ---------- @@ -668,30 +750,45 @@ def _process_data( ------- Tuple[np.ndarray, np.ndarray] Data and mask pre-processed + """ if is_training: self.normalizer_x.fit(x.values) x_windows: List = [] x_windows_indices: List = [] - columns_index = [col for col in x.index.names if col != self.index_datetime] + columns_index = [ + col for col in x.index.names if col != self.index_datetime + ] if is_training: if self.is_rolling: if self.print_valid: print( - "Preprocessing data with sliding window (pandas.DataFrame.rolling)" - + " can require more times than usual. Please be patient!" + "Preprocessing data with sliding window " + "(pandas.DataFrame.rolling) " + "can require more times than usual. " + "Please be patient!" 
) if len(columns_index) == 0: x_windows = x.rolling(window=self.freq_str) else: - columns_index_ = columns_index[0] if len(columns_index) == 1 else columns_index - for x_group in tqdm(x.groupby(by=columns_index_), disable=True, leave=False): + columns_index_ = ( + columns_index[0] + if len(columns_index) == 1 + else columns_index + ) + for x_group in tqdm( + x.groupby(by=columns_index_), disable=True, leave=False + ): x_windows += list( - x_group[1].droplevel(columns_index).rolling(window=self.freq_str) + x_group[1] + .droplevel(columns_index) + .rolling(window=self.freq_str) ) else: - for x_w in x.resample(rule=self.freq_str, level=self.index_datetime): + for x_w in x.resample( + rule=self.freq_str, level=self.index_datetime + ): x_windows.append(x_w[1]) else: if self.is_rolling: @@ -703,23 +800,43 @@ def _process_data( x_windows.append(x_rolling) x_windows_indices.append(x_rolling.index) else: - columns_index_ = columns_index[0] if len(columns_index) == 1 else columns_index - for x_group in tqdm(x.groupby(by=columns_index_), disable=True, leave=False): - x_group_index = [x_group[0]] if len(columns_index) == 1 else x_group[0] + columns_index_ = ( + columns_index[0] + if len(columns_index) == 1 + else columns_index + ) + for x_group in tqdm( + x.groupby(by=columns_index_), disable=True, leave=False + ): + x_group_index = ( + [x_group[0]] + if len(columns_index) == 1 + else x_group[0] + ) x_group_value = x_group[1].droplevel(columns_index) - indices_nan = x_group_value.loc[x_group_value.isna().any(axis=1), :].index - x_group_rolling = x_group_value.rolling(window=self.freq_str) + indices_nan = x_group_value.loc[ + x_group_value.isna().any(axis=1), : + ].index + x_group_rolling = x_group_value.rolling( + window=self.freq_str + ) for x_rolling in x_group_rolling: if x_rolling.index[-1] in indices_nan: x_windows.append(x_rolling) x_rolling_ = x_rolling.copy() for idx, col in enumerate(columns_index): x_rolling_[col] = x_group_index[idx] - x_rolling_ = x_rolling_.set_index(columns_index, append=True) - x_rolling_ = x_rolling_.reorder_levels(x.index.names) + x_rolling_ = x_rolling_.set_index( + columns_index, append=True + ) + x_rolling_ = x_rolling_.reorder_levels( + x.index.names + ) x_windows_indices.append(x_rolling_.index) else: - for x_w in x.resample(rule=self.freq_str, level=self.index_datetime): + for x_w in x.resample( + rule=self.freq_str, level=self.index_datetime + ): x_windows.append(x_w[1]) x_windows_indices.append(x_w[1].index) @@ -736,7 +853,12 @@ def _process_data( if x_w_shape[0] < self.size_window: npad = [(0, self.size_window - x_w_shape[0]), (0, 0)] x_w_norm = np.pad(x_w_norm, pad_width=npad, mode="wrap") - x_w_mask = np.pad(x_w_mask, pad_width=npad, mode="constant", constant_values=1) + x_w_mask = np.pad( + x_w_mask, + pad_width=npad, + mode="constant", + constant_values=1, + ) x_windows_processed.append(x_w_norm) x_windows_mask_processed.append(x_w_mask) @@ -750,10 +872,19 @@ def _process_data( x_m_shape = x_m.shape if x_m_shape[0] < self.size_window: npad = [(0, self.size_window - x_m_shape[0]), (0, 0)] - x_m_mask = np.pad(x_m_mask, pad_width=npad, mode="constant", constant_values=1) + x_m_mask = np.pad( + x_m_mask, + pad_width=npad, + mode="constant", + constant_values=1, + ) x_windows_mask_processed.append(x_m_mask) - return np.array(x_windows_processed), np.array(x_windows_mask_processed), x_windows_indices + return ( + np.array(x_windows_processed), + np.array(x_windows_mask_processed), + x_windows_indices, + ) def _process_reversely_data( self, x_imputed: 
np.ndarray, x_input: pd.DataFrame, x_indices: List @@ -766,9 +897,13 @@ def _process_reversely_data( x_indices_nan_only.append(x_indices_batch[imputed_index]) if len(np.shape(x_indices_nan_only)) == 1: - x_out_index = pd.Index(x_indices_nan_only, name=x_input.index.names[0]) + x_out_index = pd.Index( + x_indices_nan_only, name=x_input.index.names[0] + ) else: - x_out_index = pd.MultiIndex.from_tuples(x_indices_nan_only, names=x_input.index.names) + x_out_index = pd.MultiIndex.from_tuples( + x_indices_nan_only, names=x_input.index.names + ) x_normalized = self.normalizer_x.inverse_transform(x_imputed_nan_only) x_out = pd.DataFrame( x_normalized, @@ -796,8 +931,8 @@ def fit( cols_imputed: Tuple[str, ...] = (), index_datetime: str = "", freq_str: str = "1D", - ) -> Self: - """Fit data + ) -> "TsDDPM": + """Fit data. Parameters ---------- @@ -812,8 +947,8 @@ def fit( x_valid : pd.DataFrame, optional Dataframe for validation, by default None metrics_valid : Tuple[Callable, ...], optional - Set of validation metrics, by default ( metrics.mean_absolute_error, - metrics.dist_wasserstein ) + Set of validation metrics, by default (metrics.mean_absolute_error, + metrics.dist_wasserstein) round : int, optional Number of decimal places to round to, by default 10 cols_imputed : Tuple[str, ...], optional @@ -822,19 +957,23 @@ def fit( Name of datetime-like index freq_str : str Frequency string of DateOffset of Pandas + Raises ------ ValueError Batch size is larger than data size + Returns ------- Self Return Self + """ if index_datetime == "": raise ValueError( - "Please set the params index_datetime (the name of datatime-like index column)." - + f" Suggestions: {x.index.names}" + "Please set the params index_datetime " + "(the name of datatime-like index column). " + f" Suggestions: {x.index.names}" ) self.index_datetime = index_datetime self.freq_str = freq_str diff --git a/qolmat/imputations/diffusions/utils.py b/qolmat/imputations/diffusions/utils.py index c67a2f5f..eb24fb4f 100644 --- a/qolmat/imputations/diffusions/utils.py +++ b/qolmat/imputations/diffusions/utils.py @@ -1,9 +1,11 @@ +"""Utils for diffusion imputers.""" + import numpy as np import torch def get_num_params(model: torch.nn.Module) -> int: - """Get the total number of parameters of a model + """Get the total number of parameters of a model. Parameters ---------- @@ -14,6 +16,7 @@ def get_num_params(model: torch.nn.Module) -> int: ------- float the total number of parameters + """ model_parameters = filter(lambda p: p.requires_grad, model.parameters()) params = sum([np.prod(p.size()) for p in model_parameters]) diff --git a/qolmat/imputations/em_sampler.py b/qolmat/imputations/em_sampler.py index 463add50..eba85062 100644 --- a/qolmat/imputations/em_sampler.py +++ b/qolmat/imputations/em_sampler.py @@ -1,6 +1,8 @@ +"""Script for EM imputation.""" + +import warnings from abc import abstractmethod from typing import Dict, List, Literal, Tuple, Union -import warnings import numpy as np from numpy.typing import NDArray @@ -8,15 +10,17 @@ from scipy import optimize as spo from sklearn import utils as sku from sklearn.base import BaseEstimator, TransformerMixin -from typing_extensions import Self +# from typing_extensions import Self from qolmat.utils import utils def _conjugate_gradient(A: NDArray, X: NDArray, mask: NDArray) -> NDArray: - """ - Minimize Tr(X.T AX) wrt X where X is constrained to the initial value outside the given mask - To this aim, we compute in parallel a gradient algorithm for each row. + """Compute conjugate gradient. 
+ + Minimize Tr(X.T AX) wrt X where X is constrained to the initial value + outside the given mask To this aim, we compute in parallel a gradient + algorithm for each row. Parameters ---------- @@ -25,12 +29,14 @@ def _conjugate_gradient(A: NDArray, X: NDArray, mask: NDArray) -> NDArray: X : NDArray Array containing the values to optimize mask : NDArray - Boolean array indicating if a value of X is a variable of the optimization + Boolean array indicating if a value of X is a variable of + the optimization Returns ------- NDArray Minimized array. + """ rows_imputed = mask.any(axis=1) X_temp = X[rows_imputed, :].copy() @@ -44,7 +50,7 @@ def _conjugate_gradient(A: NDArray, X: NDArray, mask: NDArray) -> NDArray: alphan = np.zeros(n_rows) betan = np.zeros(n_rows) for n in range(n_iter + 2): - # if np.max(np.sum(rn**2)) < tolerance : # Condition de sortie " usuelle " + # if np.max(np.sum(rn**2)) < tolerance : # X_temp[mask_isna] = xn[mask_isna] # return X_temp.transpose() Apn = pn @ A @@ -53,14 +59,18 @@ def _conjugate_gradient(A: NDArray, X: NDArray, mask: NDArray) -> NDArray: denominator = np.sum(pn * Apn, axis=1) not_converged = denominator != 0 # we stop updating if convergence is reached for this row - alphan[not_converged] = numerator[not_converged] / denominator[not_converged] + alphan[not_converged] = ( + numerator[not_converged] / denominator[not_converged] + ) xn, rnp1 = xn + pn * alphan[:, None], rn - Apn * alphan[:, None] numerator = np.sum(rnp1**2, axis=1) denominator = np.sum(rn**2, axis=1) not_converged = denominator != 0 # we stop updating if convergence is reached for this row - betan[not_converged] = numerator[not_converged] / denominator[not_converged] + betan[not_converged] = ( + numerator[not_converged] / denominator[not_converged] + ) pn, rn = rnp1 + pn * betan[:, None], rnp1 @@ -71,8 +81,12 @@ def _conjugate_gradient(A: NDArray, X: NDArray, mask: NDArray) -> NDArray: return X_final -def max_diff_Linf(list_params: List[NDArray], n_steps: int, order: int = 1) -> float: - """Computes the maximal L infinity norm between the `n_steps` last elements spaced by order. +def max_diff_Linf( + list_params: List[NDArray], n_steps: int, order: int = 1 +) -> float: + """Compute the maximal L infinity norm. + + Computed between the `n_steps` last elements spaced by order. Used to compute the stop criterion. Parameters @@ -88,6 +102,7 @@ def max_diff_Linf(list_params: List[NDArray], n_steps: int, order: int = 1) -> f ------- float Minimal norm of differences + """ params = np.stack(list_params[-n_steps - order : -order]) params_shift = np.stack(list_params[-n_steps:]) @@ -96,8 +111,9 @@ def max_diff_Linf(list_params: List[NDArray], n_steps: int, order: int = 1) -> f class EM(BaseEstimator, TransformerMixin): - """ - Generic abstract class for missing values imputation through EM optimization and + """Abstract class for EM imputatoin. + + It uses imputation through EM optimization and a projected MCMC sampling process. Parameters @@ -110,30 +126,35 @@ class EM(BaseEstimator, TransformerMixin): Number of iterations for the Gibbs sampling method (+ noise addition), necessary for convergence, by default 50. n_samples : int, optional - Number of data samples used to estimate the parameters of the distribution. Default, 10 + Number of data samples used to estimate the parameters of the + distribution. Default, 10 ampli : float, optional Whether to sample the posterior (1) or to maximise likelihood (0), by default 1. 
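For reference, the quadratic problem `_conjugate_gradient` solves row by row (minimise Tr(X.T A X) with the entries outside `mask` held fixed) has a closed-form solution that can be used as a sanity check. A minimal sketch for a single row, assuming a symmetric positive-definite matrix `A`; the values are toy data, not qolmat code:

import numpy as np
from scipy import optimize as spo

rng = np.random.default_rng(0)
A = rng.normal(size=(4, 4))
A = A @ A.T + np.eye(4)                        # symmetric positive definite
x = rng.normal(size=4)
mask = np.array([True, False, True, False])    # entries to optimise (missing)

# Closed-form minimiser of x.T @ A @ x over the masked entries
A_mm = A[np.ix_(mask, mask)]
A_mo = A[np.ix_(mask, ~mask)]
x_closed = -np.linalg.solve(A_mm, A_mo @ x[~mask])

# Numerical check with a generic optimiser
def objective(x_m):
    z = x.copy()
    z[mask] = x_m
    return z @ A @ z

res = spo.minimize(objective, x[mask])
print(np.allclose(res.x, x_closed, atol=1e-4))  # True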
random_state : int, optional - The seed of the pseudo random number generator to use, for reproductibility. + The seed of the pseudo random number generator to use, + for reproductibility. dt : float, optional - Process integration time step, a large value increases the sample bias and can make - the algorithm unstable, but compensates for a smaller n_iter_ou. By default, 2e-2. + Process integration time step, a large value increases the sample bias + and can make the algorithm unstable, but compensates for a + smaller n_iter_ou. By default, 2e-2. tolerance : float, optional - Threshold below which a L infinity norm difference indicates the convergence of the - parameters - stagnation_threshold : float, optional - Threshold below which a stagnation of the L infinity norm difference indicates the + Threshold below which a L infinity norm difference indicates the convergence of the parameters + stagnation_threshold : float, optional + Threshold below which a stagnation of the L infinity norm difference + indicates the convergence of the parameters stagnation_loglik : float, optional - Threshold below which an absolute difference of the log likelihood indicates the - convergence of the parameters + Threshold below which an absolute difference of the log likelihood + indicates the convergence of the parameters min_std: float, optional - Threshold below which the initial data matrix is considered ill-conditioned + Threshold below which the initial data matrix is considered + ill-conditioned period : int, optional Integer used to fold the temporal data periodically verbose : bool, optional Verbosity level, if False the warnings are silenced + """ def __init__( @@ -153,7 +174,10 @@ def __init__( verbose: bool = False, ): if method not in ["mle", "sample"]: - raise ValueError(f"`method` must be 'mle' or 'sample', provided value is '{method}'") + raise ValueError( + "`method` must be 'mle' or 'sample', " + f"provided value is '{method}'." + ) self.method = method self.max_iter_em = max_iter_em @@ -180,38 +204,73 @@ def _check_convergence(self) -> bool: @abstractmethod def reset_learned_parameters(self): + """Reset learned parameters.""" pass @abstractmethod def update_parameters(self, X: NDArray): + """Update parameters.""" pass @abstractmethod def combine_parameters(self): + """Combine parameters.""" pass def fit_parameters(self, X: NDArray): + """Fit parameters. + + Parameters + ---------- + X: NDArray + Array to compute the parameters. + + """ self.reset_learned_parameters() self.update_parameters(X) self.combine_parameters() def fit_parameters_with_missingness(self, X: NDArray): - """ - First estimation of the model parameters based on data with missing values. + """Fit the first estimation of the model parameters. + + It is based on data with missing values. Parameters ---------- X : NDArray Data matrix with missingness + """ X_imp = self.init_imputation(X) self.fit_parameters(X_imp) def update_criteria_stop(self, X: NDArray): + """Update the stopping criteria based on X. + + Parameters + ---------- + X : NDArray + array used to compute log likelihood. + + """ self.loglik = self.get_loglikelihood(X) @abstractmethod def get_loglikelihood(self, X: NDArray) -> float: + """Compute the loglikelihood of an array. + + Parameters + ---------- + X : NDArray + Input array. + + Returns + ------- + float + log-likelihood. + + """ return 0 @abstractmethod @@ -219,10 +278,24 @@ def gradient_X_loglik( self, X: NDArray, ) -> NDArray: + """Compute the gradient of the log-likelihood with respect to X.
+ + Parameters + ---------- + X : NDArray + input array + + Returns + ------- + NDArray + gradient + + """ return np.empty # type: ignore #noqa def get_gamma(self, n_cols: int) -> NDArray: - """ + """Get gamma. + Normalization matrix in the sampling process. Parameters @@ -234,6 +307,7 @@ def get_gamma(self, n_cols: int) -> NDArray: ------- NDArray Gamma matrix + """ # return np.ones((1, n_cols)) return np.eye(n_cols) @@ -246,13 +320,14 @@ def _maximize_likelihood(self, X: NDArray, mask_na: NDArray) -> NDArray: X : NDArray Input numpy array without missingness mask_na : NDArray - Boolean dataframe indicating which coefficients should be resampled, and are therefore - the variables of the optimization + Boolean dataframe indicating which coefficients should be + resampled, and are therefore the variables of the optimization Returns ------- NDArray DataFrame with imputed values. + """ def fun_obj(x): @@ -267,7 +342,8 @@ def fun_jac(x): grad_x = grad_x[mask_na] return grad_x - # the method BFGS is much slower, probabily not adapted to the high-dimension setting + # the method BFGS is much slower, probabily not adapted + # to the high-dimension setting res = spo.minimize(fun_obj, X[mask_na], jac=fun_jac, method="CG") x = res.x @@ -281,27 +357,31 @@ def _sample_ou( mask_na: NDArray, estimate_params: bool = True, ) -> NDArray: - """ - Samples the Gaussian distribution under the constraint that not na values must remain + """Sample the Gaussian distribution. + + Under the constraint that not na values must remain unchanged, using a projected Ornstein-Uhlenbeck process. - The sampled distribution tends to the target distribution in the limit dt -> 0 and - n_iter_ou x dt -> infty. + The sampled distribution tends to the target distribution + in the limit dt -> 0 and n_iter_ou x dt -> infty. Parameters ---------- - df : NDArray - Inital dataframe to be imputed, which should have been already imputed using a simple - method. This first imputation will be used as an initial guess. + X : NDArray + Inital dataframe to be imputed, which should have been already + imputed using a simple method. This first imputation will be used + as an initial guess. mask_na : NDArray - Boolean dataframe indicating which coefficients should be resampled. + Boolean dataframe indicating which coefficients should be + resampled. estimate_params : bool - Indicates if the parameters of the distribution should be estimated while the data are - sampled. + Indicates if the parameters of the distribution should be estimated + while the data are sampled. Returns ------- NDArray Sampled data matrix + """ X_copy = X.copy() n_rows, n_cols = X_copy.shape @@ -314,7 +394,10 @@ def _sample_ou( for i in range(self.n_iter_ou): noise = self.ampli * self.rng.normal(0, 1, size=(n_rows, n_cols)) grad_X = -self.gradient_X_loglik(X_copy) - X_copy += -self.dt * grad_X @ gamma + np.sqrt(2 * self.dt) * noise @ sqrt_gamma + X_copy += ( + -self.dt * grad_X @ gamma + + np.sqrt(2 * self.dt) * noise @ sqrt_gamma + ) X_copy[~mask_na] = X_init[~mask_na] if estimate_params: self.update_parameters(X_copy) @@ -322,6 +405,14 @@ def _sample_ou( return X_copy def fit_X(self, X: NDArray) -> None: + """Ft X array. + + Parameters + ---------- + X : NDArray + Input array. + + """ mask_na = np.isnan(X) # first imputation @@ -351,14 +442,14 @@ def fit_X(self, X: NDArray) -> None: self.dict_criteria_stop = {key: [] for key in self.dict_criteria_stop} self.X = X - def fit(self, X: NDArray) -> Self: - """ - Fit the statistical distribution with the input X array. 
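The `_sample_ou` loop above is a Langevin (Ornstein-Uhlenbeck) step projected onto the observed entries: follow the gradient of the log-likelihood plus Gaussian noise, then reset the values that are not missing. A minimal sketch of that sampler for a multivariate Gaussian target, with `gamma` taken as the identity; values are toy data, not qolmat code:

import numpy as np

rng = np.random.default_rng(0)
mu = np.array([0.0, 1.0, -1.0])
cov = np.array([[1.0, 0.5, 0.0], [0.5, 1.0, 0.3], [0.0, 0.3, 1.0]])
cov_inv = np.linalg.inv(cov)

X = np.array([[0.0, np.nan, np.nan]])   # one row, two missing entries
mask_na = np.isnan(X)
X[mask_na] = 0.0                        # naive initial guess
X_init = X.copy()

dt, n_iter_ou, ampli = 2e-2, 500, 1.0
for _ in range(n_iter_ou):
    grad_loglik = -(X - mu) @ cov_inv      # gradient of the Gaussian log-density
    noise = ampli * rng.normal(size=X.shape)
    X = X + dt * grad_loglik + np.sqrt(2 * dt) * noise
    X[~mask_na] = X_init[~mask_na]         # projection: keep observed values

print(X)   # one sample of the missing entries given the observed one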
+ def fit(self, X: NDArray) -> "EM": + """Fit the statistical distribution with the input X array. Parameters ---------- X : NDArray Numpy array to be imputed + """ X = X.copy() self.shape_original = X.shape @@ -394,8 +485,7 @@ def fit(self, X: NDArray) -> Self: return self def transform(self, X: NDArray) -> NDArray: - """ - Transform the input X array by imputing the missing values. + """Transform the input X array by imputing the missing values. Parameters ---------- @@ -406,6 +496,7 @@ def transform(self, X: NDArray) -> NDArray: ------- NDArray Final array after EM sampling. + """ mask_na = np.isnan(X) X = X.copy() @@ -432,8 +523,7 @@ def transform(self, X: NDArray) -> NDArray: return X def pretreatment(self, X, mask_na) -> Tuple[NDArray, NDArray]: - """ - Pretreats the data before imputation by EM, making it more robust. + """Pretreat the data before imputation by EM, making it more robust. Parameters ---------- @@ -448,13 +538,15 @@ def pretreatment(self, X, mask_na) -> Tuple[NDArray, NDArray]: A tuple containing: - X the pretreatd data matrix - mask_na the updated mask + """ return X, mask_na def _check_conditionning(self, X: NDArray): - """ - Check that the data matrix X is not ill-conditioned. Running the EM algorithm on data with - colinear columns leads to numerical instability and unconsistent results. + """Check that the data matrix X is not ill-conditioned. + + Running the EM algorithm on data with colinear columns leads to + numerical instability and unconsistent results. Parameters ---------- @@ -465,6 +557,7 @@ def _check_conditionning(self, X: NDArray): ------ IllConditioned Data matrix is ill-conditioned due to colinear columns. + """ n_samples, n_cols = X.shape # if n_rows == 1 the function np.cov returns a float @@ -476,17 +569,20 @@ def _check_conditionning(self, X: NDArray): min_sv = min(np.sqrt(sv)) if min_sv < self.min_std: warnings.warn( - f"The covariance matrix is ill-conditioned, indicating high-colinearity: the " - f"smallest singular value of the data matrix is smaller than the threshold " - f"min_std ({min_sv} < {self.min_std}). Consider removing columns of decreasing " - f"the threshold." + "The covariance matrix is ill-conditioned, " + "indicating high-colinearity: the " + "smallest singular value of the data matrix is smaller " + "than the threshold " + f"min_std ({min_sv} < {self.min_std}). " + "Consider removing columns of decreasing the threshold." ) class MultiNormalEM(EM): - """ - Imputation of missing values using a multivariate Gaussian model through EM optimization and - using a projected Ornstein-Uhlenbeck process. + """Multinormal EM imputer. + + Imputation of missing values using a multivariate Gaussian model through + EM optimization and using a projected Ornstein-Uhlenbeck process. Parameters ---------- @@ -498,28 +594,32 @@ class MultiNormalEM(EM): Number of iterations for the Gibbs sampling method (+ noise addition), necessary for convergence, by default 50. n_samples : int, optional - Number of data samples used to estimate the parameters of the distribution. Default, 10 + Number of data samples used to estimate the parameters of the + distribution. Default, 10 ampli : float, optional Whether to sample the posterior (1) or to maximise likelihood (0), by default 1. random_state : int, optional - The seed of the pseudo random number generator to use, for reproductibility. + The seed of the pseudo random number generator to use, + for reproductibility. 
dt : float - Process integration time step, a large value increases the sample bias and can make - the algorithm unstable, but compensates for a smaller n_iter_ou. By default, 2e-2. + Process integration time step, a large value increases the sample bias + and can make the algorithm unstable, but compensates for a + smaller n_iter_ou. By default, 2e-2. tolerance : float, optional - Threshold below which a L infinity norm difference indicates the convergence of the - parameters + Threshold below which a L infinity norm difference indicates the + convergence of the parameters stagnation_threshold : float, optional - Threshold below which a L infinity norm difference indicates the convergence of the - parameters - stagnation_loglik : float, optional - Threshold below which an absolute difference of the log likelihood indicates the + Threshold below which a L infinity norm difference indicates the convergence of the parameters + stagnation_loglik : float, optional + Threshold below which an absolute difference of the log likelihood + indicates the convergence of the parameters period : int, optional Integer used to fold the temporal data periodically verbose : bool, optional Verbosity level, if False the warnings are silenced + """ def __init__( @@ -554,9 +654,11 @@ def __init__( self.dict_criteria_stop = {"logliks": [], "means": [], "covs": []} def get_loglikelihood(self, X: NDArray) -> float: - """ - Value of the log-likelihood up to a constant for the provided X, using the attributes - `means` and `cov_inv` for the multivariate normal distribution. + """Get the log-likelihood. + + Value of the log-likelihood up to a constant for the provided X, + using the attributes `means` and `cov_inv` for the multivariate + normal distribution. Parameters ---------- @@ -567,13 +669,15 @@ def get_loglikelihood(self, X: NDArray) -> float: ------- float Computed value + """ Xc = X - self.means return -((Xc @ self.cov_inv) * Xc).sum().sum() / 2 def gradient_X_loglik(self, X: NDArray) -> NDArray: - """ - Gradient of the log-likelihood for the provided X, using the attributes + """Compute the gradient of the log-likelihood for the provided X. + + It uses the attributes `means` and `cov_inv` for the multivariate normal distribution. Parameters @@ -584,15 +688,19 @@ def gradient_X_loglik(self, X: NDArray) -> NDArray: Returns ------- NDArray - The gradient of the log-likelihood with respect to the input variable `X`. + The gradient of the log-likelihood with respect to the input + variable `X`. + """ grad_X = -(X - self.means) @ self.cov_inv return grad_X def get_gamma(self, n_cols: int) -> NDArray: - """ - If the covariance matrix is not full-rank, defines the projection matrix keeping the - sampling process in the relevant subspace. + """Get gamma. + + If the covariance matrix is not full-rank, defines the + projection matrix keeping the sampling process in the relevant + subspace. Parameters ---------- @@ -603,6 +711,7 @@ def get_gamma(self, n_cols: int) -> NDArray: ------- NDArray Gamma matrix + """ U, diag, Vt = spl.svd(self.cov) diag_trunc = np.where(diag < self.min_std**2, 0, diag) @@ -614,13 +723,13 @@ def get_gamma(self, n_cols: int) -> NDArray: return gamma def update_criteria_stop(self, X: NDArray): - """ - Updates the variables which will be used to compute the stop critera + """Update the variables to compute the stopping critera. 
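The two methods above are the centred Gaussian log-likelihood up to an additive constant and its gradient. A short numerical check of those formulas against `scipy.stats.multivariate_normal`, with toy values (not qolmat code):

import numpy as np
from scipy.stats import multivariate_normal

rng = np.random.default_rng(1)
means = np.array([0.0, 2.0])
cov = np.array([[2.0, 0.3], [0.3, 1.0]])
cov_inv = np.linalg.inv(cov)
X = rng.normal(size=(5, 2))

Xc = X - means
loglik = -((Xc @ cov_inv) * Xc).sum() / 2   # quadratic term only
grad = -(X - means) @ cov_inv               # d loglik / d X

# The quadratic term differs from the full log-density only by a constant
full = multivariate_normal(means, cov).logpdf(X).sum()
expected_const = 5 * (-0.5 * np.log(np.linalg.det(2 * np.pi * cov)))
print(np.isclose(full - loglik, expected_const))  # True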
Parameters ---------- X : NDArray Input matrix with variables in column + """ self.loglik = self.get_loglikelihood(X) self.dict_criteria_stop["means"].append(self.means) @@ -628,20 +737,18 @@ def update_criteria_stop(self, X: NDArray): self.dict_criteria_stop["logliks"].append(self.loglik) def reset_learned_parameters(self): - """ - Resets all lists of estimated parameters before starting a new estimation. - """ + """Reset lists of parameters before starting a new estimation.""" self.list_means = [] self.list_cov = [] def update_parameters(self, X): - """ - Retains statistics relative to the current sample, in prevision of combining them. + """Retain statistics relative to the current sample. Parameters ---------- X : NDArray Input matrix with variables in column + """ n_rows, n_cols = X.shape means = np.mean(X, axis=0) @@ -654,9 +761,9 @@ def update_parameters(self, X): self.list_cov.append(cov) def combine_parameters(self): - """ - Combine all statistics computed for each sample in the update step, using the MANOVA - formula. + """Combine all statistics computed for each sample in the update step. + + If uses the MANOVA formula. """ list_means = self.list_means[-self.n_samples :] list_cov = self.list_cov[-self.n_samples :] @@ -674,20 +781,21 @@ def combine_parameters(self): self.cov_inv = np.linalg.pinv(self.cov) def fit_parameters_with_missingness(self, X: NDArray): - """ - First estimation of the model parameters based on data with missing values. + """Fit the first estimation of the model parameters. + + It is based on data with missing values. Parameters ---------- X : NDArray Data matrix with missingness + """ self.means, self.cov = utils.nan_mean_cov(X) self.cov_inv = np.linalg.pinv(self.cov) def set_parameters(self, means: NDArray, cov: NDArray): - """ - Sets the model parameters from a user value. + """Set the model parameters from a user value. Parameters ---------- @@ -695,27 +803,28 @@ def set_parameters(self, means: NDArray, cov: NDArray): Specified value for the mean vector cov : NDArray Specified value for the covariance matrix + """ self.means = means self.cov = cov self.cov_inv = np.linalg.pinv(self.cov) def _maximize_likelihood(self, X: NDArray, mask_na: NDArray) -> NDArray: - """ - Get the argmax of a posterior distribution. + """Get the argmax of a posterior distribution. Parameters ---------- X : NDArray Input DataFrame without missingness mask_na : NDArray - Boolean dataframe indicating which coefficients should be resampled, and are therefore - the variables of the optimization + Boolean dataframe indicating which coefficients should be + resampled, and are therefore the variables of the optimization Returns ------- NDArray DataFrame with imputed values. + """ X_center = X - self.means X_imputed = _conjugate_gradient(self.cov_inv, X_center, mask_na) @@ -723,8 +832,7 @@ def _maximize_likelihood(self, X: NDArray, mask_na: NDArray) -> NDArray: return X_imputed def init_imputation(self, X: NDArray) -> NDArray: - """ - First simple imputation before iterating. + """First simple imputation before iterating. Parameters ---------- @@ -735,24 +843,29 @@ def init_imputation(self, X: NDArray) -> NDArray: ------- NDArray Imputed matrix + """ return utils.impute_nans(X, method="median") def _check_convergence(self) -> bool: - """ - Check if the EM algorithm has converged. Three criteria: - 1) if the differences between the estimates of the parameters (mean and covariance) is - less than a threshold (min_diff_reached - tolerance). 
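`combine_parameters` merges the per-sample estimates with a MANOVA-style formula; the exact weighting is not visible in this hunk. A plausible sketch using the law of total covariance (average within-sample covariance plus covariance of the sample means), for illustration only and not necessarily identical to the library's formula:

import numpy as np

rng = np.random.default_rng(2)
samples = [rng.normal(size=(50, 3)) for _ in range(4)]   # e.g. 4 Gibbs samples

list_means = [s.mean(axis=0) for s in samples]
list_cov = [np.cov(s, rowvar=False) for s in samples]

stack_means = np.stack(list_means)
means = stack_means.mean(axis=0)                  # combined mean
within = np.mean(np.stack(list_cov), axis=0)      # average within-sample covariance
between = np.cov(stack_means, rowvar=False)       # covariance of the sample means
cov = within + between                            # total covariance
print(means.shape, cov.shape)                     # (3,) (3, 3)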
- 2) if the difference of the consecutive differences of the estimates is less than a - threshold, i.e. stagnates over the last 5 interactions (min_diff_stable - - stagnation_threshold). + """Check if the EM algorithm has converged. + + Three criteria: + 1) if the differences between the estimates of the parameters + (mean and covariance) is less than a threshold + (min_diff_reached - tolerance). + 2) if the difference of the consecutive differences of the estimates + is less than a threshold, i.e. stagnates over the last 5 interactions + (min_diff_stable - stagnation_threshold). 3) if the likelihood of the data no longer increases, - i.e. stagnates over the last 5 iterations (max_loglik - stagnation_loglik). + i.e. stagnates over the last 5 iterations + (max_loglik - stagnation_loglik). Returns ------- bool True/False if the algorithm has converged + """ list_means = self.dict_criteria_stop["means"] list_covs = self.dict_criteria_stop["covs"] @@ -764,7 +877,10 @@ def _check_convergence(self) -> bool: min_diff_means1 = max_diff_Linf(list_means, n_steps=1) min_diff_covs1 = max_diff_Linf(list_covs, n_steps=1) - min_diff_reached = min_diff_means1 < self.tolerance and min_diff_covs1 < self.tolerance + min_diff_reached = ( + min_diff_means1 < self.tolerance + and min_diff_covs1 < self.tolerance + ) if min_diff_reached: return True @@ -789,9 +905,11 @@ def _check_convergence(self) -> bool: class VARpEM(EM): - """ - Imputation of missing values using a vector autoregressive model through EM optimization and - using a projected Ornstein-Uhlenbeck process. Equations and notations and from the following + """VAR(p) EM imputer. + + Imputation of missing values using a vector autoregressive model through + EM optimization and using a projected Ornstein-Uhlenbeck process. + Equations and notations and from the following reference, matrices are transposed for consistency: Lütkepohl (2005) New Introduction to Multiple Time Series Analysis @@ -810,19 +928,21 @@ class VARpEM(EM): Whether to sample the posterior (1) or to maximise likelihood (0), by default 1. random_state : int, optional - The seed of the pseudo random number generator to use, for reproductibility. + The seed of the pseudo random number generator to use, + for reproductibility. dt : float - Process integration time step, a large value increases the sample bias and can make - the algorithm unstable, but compensates for a smaller n_iter_ou. By default, 2e-2. + Process integration time step, a large value increases the sample bias + and can make the algorithm unstable, but compensates for + a smaller n_iter_ou. By default, 2e-2. 
tolerance : float, optional - Threshold below which a L infinity norm difference indicates the convergence of the - parameters + Threshold below which a L infinity norm difference indicates + the convergence of the parameters stagnation_threshold : float, optional - Threshold below which a L infinity norm difference indicates the convergence of the - parameters - stagnation_loglik : float, optional - Threshold below which an absolute difference of the log likelihood indicates the + Threshold below which a L infinity norm difference indicates the convergence of the parameters + stagnation_loglik : float, optional + Threshold below which an absolute difference of the log likelihood + indicates the convergence of the parameters period : int, optional Integer used to fold the temporal data periodically verbose: bool @@ -831,18 +951,19 @@ class VARpEM(EM): Attributes ---------- X_intermediate : list - List of pd.DataFrame giving the results of the EM process as function of the - iteration number. + List of pd.DataFrame giving the results of the EM process as function + of the iteration number. Examples -------- >>> import numpy as np >>> from qolmat.imputations.em_sampler import VARpEM >>> imputer = VARpEM(method="sample", random_state=11) - >>> X = np.array([[1, 1, 1, 1], - ... [np.nan, np.nan, 3, 2], - ... [1, 2, 2, 1], [2, 2, 2, 2]]) + >>> X = np.array( + ... [[1, 1, 1, 1], [np.nan, np.nan, 3, 2], [1, 2, 2, 1], [2, 2, 2, 2]] + ... ) >>> imputer.fit_transform(X) # doctest: +SKIP + """ def __init__( @@ -882,9 +1003,10 @@ def __init__( self.p_to_fit = True def get_loglikelihood(self, X: NDArray) -> float: - """ - Value of the log-likelihood up to a constant for the provided X, using the attributes - `nu`, `B` and `S` for the VAR(p) distribution. + """Get the log-likelihood. + + Value of the log-likelihood up to a constant for the provided X, + using the attributes `nu`, `B` and `S` for the VAR(p) distribution. Parameters ---------- @@ -895,15 +1017,17 @@ def get_loglikelihood(self, X: NDArray) -> float: ------- float Computed value + """ Z, Y = utils.create_lag_matrices(X, self.p) U = Y - Z @ self.B return -(U @ self.S_inv * U).sum().sum() / 2 def gradient_X_loglik(self, X: NDArray) -> NDArray: - """ - Gradient of the log-likelihood for the provided X, using the attributes - `means` and `cov_inv` for the VAR(p) distribution. + """Compute the gradient of the log-likelihood for the provided X. + + It uses the attributes `means` and `cov_inv` + for the VAR(p) distribution. Parameters ---------- @@ -913,7 +1037,9 @@ def gradient_X_loglik(self, X: NDArray) -> NDArray: Returns ------- NDArray - The gradient of the log-likelihood with respect to the input variable `X`. + The gradient of the log-likelihood with respect + to the input variable `X`. + """ n_rows, n_cols = X.shape Z, Y = utils.create_lag_matrices(X, p=self.p) @@ -928,9 +1054,11 @@ def gradient_X_loglik(self, X: NDArray) -> NDArray: return grad_1 + grad_2 def get_gamma(self, n_cols: int) -> NDArray: - """ - If the noise matrix is not full-rank, defines the projection matrix keeping the - sampling process in the relevant subspace. Rescales the process to avoid instabilities. + """Compue gamma. + + If the noise matrix is not full-rank, defines the projection matrix + keeping the sampling process in the relevant subspace. + Rescales the process to avoid instabilities. 
Parameters ---------- @@ -941,6 +1069,7 @@ def get_gamma(self, n_cols: int) -> NDArray: ------- NDArray Gamma matrix + """ U, diag, Vt = spl.svd(self.S) diag_trunc = np.where(diag < self.min_std**2, 0, diag) @@ -952,13 +1081,13 @@ def get_gamma(self, n_cols: int) -> NDArray: return gamma def update_criteria_stop(self, X: NDArray): - """ - Updates the variables which will be used to compute the stop critera + """Update the variable to compute the stopping critera. Parameters ---------- X : NDArray Input matrix with variables in column + """ self.loglik = self.get_loglikelihood(X) self.dict_criteria_stop["S"].append(self.list_S[-1]) @@ -966,9 +1095,7 @@ def update_criteria_stop(self, X: NDArray): self.dict_criteria_stop["logliks"].append(self.loglik) def reset_learned_parameters(self): - """ - Resets all lists of estimated parameters before starting a new estimation. - """ + """Reset lists of parameters before starting a new estimation.""" self.list_ZZ = [] self.list_ZY = [] self.list_B = [] @@ -976,15 +1103,14 @@ def reset_learned_parameters(self): self.list_YY = [] def update_parameters(self, X: NDArray) -> None: - """ - Retains statistics relative to the current sample, in prevision of combining them. + """Retain statistics relative to the current sample. Parameters ---------- X : NDArray Input matrix with variables in column - """ + """ Z, Y = utils.create_lag_matrices(X, self.p) n_obs = len(Z) ZZ = Z.T @ Z / n_obs @@ -1002,9 +1128,10 @@ def update_parameters(self, X: NDArray) -> None: self.list_YY.append(YY) def combine_parameters(self) -> None: - """ - Combine all statistics computed for each sample in the update step. The estimation of `nu` - and `B` corresponds to the MLE, whereas `S` is approximated. + """Combine statistics computed for each sample in the update step. + + The estimation of `nu` and `B` corresponds to the MLE, + whereas `S` is approximated. """ list_ZZ = self.list_ZZ[-self.n_samples :] list_ZY = self.list_ZY[-self.n_samples :] @@ -1018,28 +1145,32 @@ def combine_parameters(self) -> None: self.B = self.ZZ_inv @ self.ZY stack_YY = np.stack(list_YY) self.YY = np.mean(stack_YY, axis=0) - self.S = self.YY - self.ZY.T @ self.B - self.B.T @ self.ZY + self.B.T @ self.ZZ @ self.B + self.S = ( + self.YY + - self.ZY.T @ self.B + - self.B.T @ self.ZY + + self.B.T @ self.ZZ @ self.B + ) self.S[np.abs(self.S) < 1e-12] = 0 self.S_inv = np.linalg.pinv(self.S, rcond=1e-10) def set_parameters(self, B: NDArray, S: NDArray): - """ - Sets the model parameters from a user value. + """Set the model parameters from a user value. Parameters ---------- - means : NDArray + B : NDArray Specified value for the autoregression matrix S : NDArray Specified value for the noise covariance matrix + """ self.B = B self.S = S self.S_inv = np.linalg.pinv(self.S) def init_imputation(self, X: NDArray) -> NDArray: - """ - First simple imputation before iterating. + """First simple imputation before iterating. Parameters ---------- @@ -1050,14 +1181,16 @@ def init_imputation(self, X: NDArray) -> NDArray: ------- NDArray Imputed matrix + """ return utils.linear_interpolation(X) def pretreatment(self, X, mask_na) -> Tuple[NDArray, NDArray]: - """ - Pretreats the data before imputation by EM, making it more robust. In the case of the - VAR(p) model we freeze the naive imputation on the first observations if all variables are - missing to avoid explosive imputations. + """Pretreat the data before imputation by EM, making it more robust. 
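The `update_parameters` / `combine_parameters` pair above estimates the VAR(p) coefficients from lag matrices through the normal equations, B = (Z'Z)^-1 Z'Y, and expands the residual covariance S algebraically. A minimal sketch with a hand-rolled lag-matrix construction; this is a simplified stand-in for `utils.create_lag_matrices` and may differ in details such as the intercept column:

import numpy as np

rng = np.random.default_rng(3)
n, d, p = 200, 2, 1
X = rng.normal(size=(n, d)).cumsum(axis=0)     # toy multivariate series

# Lag matrices: Y holds X_t, Z holds [1, X_{t-1}, ..., X_{t-p}]
Y = X[p:]
Z = np.hstack([np.ones((n - p, 1))] + [X[p - k : n - k] for k in range(1, p + 1)])

ZZ = Z.T @ Z / len(Z)
ZY = Z.T @ Y / len(Z)
YY = Y.T @ Y / len(Z)

B = np.linalg.pinv(ZZ) @ ZY                     # stacked [nu; A_1; ...; A_p]
S = YY - ZY.T @ B - B.T @ ZY + B.T @ ZZ @ B     # residual covariance, expanded
U = Y - Z @ B
print(np.allclose(S, U.T @ U / len(Z)))         # True: same quantity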
+ + In the case of the + VAR(p) model we freeze the naive imputation on the first observations + if all variables are missing to avoid explosive imputations. Parameters ---------- @@ -1072,6 +1205,7 @@ def pretreatment(self, X, mask_na) -> Tuple[NDArray, NDArray]: A tuple containing: - X the pretreatd data matrix - mask_na the updated mask + """ if self.p == 0: return X, mask_na @@ -1081,22 +1215,25 @@ def pretreatment(self, X, mask_na) -> Tuple[NDArray, NDArray]: return X, mask_na def _check_convergence(self) -> bool: - """ - Check if the EM algorithm has converged. Three criteria: - 1) if the differences between the estimates of the parameters (mean and covariance) is - less than a threshold (min_diff_reached - tolerance). - OR 2) if the difference of the consecutive differences of the estimates is less than a - threshold, i.e. stagnates over the last 5 interactions (min_diff_stable - - stagnation_threshold). + """Check if the EM algorithm has converged. + + Three criteria: + 1) if the differences between the estimates of the parameters + (mean and covariance) is less than a threshold + (min_diff_reached - tolerance). + OR 2) if the difference of the consecutive differences of the + estimates is less than a threshold, i.e. stagnates over the + last 5 interactions (min_diff_stable - stagnation_threshold). OR 3) if the likelihood of the data no longer increases, - i.e. stagnates over the last 5 iterations (max_loglik - stagnation_loglik). + i.e. stagnates over the last 5 iterations + (max_loglik - stagnation_loglik). Returns ------- bool True/False if the algorithm has converged - """ + """ list_B = self.dict_criteria_stop["B"] list_S = self.dict_criteria_stop["S"] list_logliks = self.dict_criteria_stop["logliks"] @@ -1107,7 +1244,9 @@ def _check_convergence(self) -> bool: min_diff_B1 = max_diff_Linf(list_B, n_steps=1) min_diff_S1 = max_diff_Linf(list_S, n_steps=1) - min_diff_reached = min_diff_B1 < self.tolerance and min_diff_S1 < self.tolerance + min_diff_reached = ( + min_diff_B1 < self.tolerance and min_diff_S1 < self.tolerance + ) if min_diff_reached: return True @@ -1118,7 +1257,8 @@ def _check_convergence(self) -> bool: min_diff_B5 = max_diff_Linf(list_B, n_steps=5) min_diff_S5 = max_diff_Linf(list_S, n_steps=5) min_diff_stable = ( - min_diff_B5 < self.stagnation_threshold and min_diff_S5 < self.stagnation_threshold + min_diff_B5 < self.stagnation_threshold + and min_diff_S5 < self.stagnation_threshold ) max_loglik5_ord1 = max_diff_Linf(list_logliks, n_steps=5, order=1) diff --git a/qolmat/imputations/imputers.py b/qolmat/imputations/imputers.py index a2550700..170a1659 100644 --- a/qolmat/imputations/imputers.py +++ b/qolmat/imputations/imputers.py @@ -1,54 +1,48 @@ +"""Script for the imputers.""" + import copy -from functools import partial import warnings -from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union -from typing_extensions import Self from abc import abstractmethod +from functools import partial +from typing import Any, Callable, Dict, Literal, Optional, Tuple, Union import numpy as np -from numpy.typing import NDArray -from scipy import sparse import pandas as pd import sklearn as skl +from numpy.typing import NDArray from sklearn import utils as sku -from sklearn.impute import SimpleImputer from sklearn.base import BaseEstimator -from sklearn.experimental import enable_iterative_imputer +from sklearn.experimental import enable_iterative_imputer # noqa from sklearn.impute import IterativeImputer, KNNImputer from sklearn.impute._base import 
_BaseImputer -from sklearn.utils.validation import ( - _check_feature_names_in, - _num_samples, - check_array, - check_is_fitted, -) from statsmodels.tsa import seasonal as tsa_seasonal -from qolmat.imputations import em_sampler -from qolmat.imputations.rpca import rpca, rpca_noisy, rpca_pcp -from qolmat.imputations import softimpute +# from typing_extensions import Self +from qolmat.imputations import em_sampler, softimpute +from qolmat.imputations.rpca import rpca_noisy, rpca_pcp from qolmat.utils import utils -from qolmat.utils.exceptions import NotDataFrame, TypeNotHandled -from qolmat.utils.utils import HyperValue +from qolmat.utils.exceptions import NotDataFrame class _Imputer(_BaseImputer): - """ - Base class for all imputers. + """Base class for all imputers. Parameters ---------- columnwise : bool, optional - If True, the imputer will be computed for each column, else it will be computed on the - whole dataframe, by default False + If True, the imputer will be computed for each column, else it will be + computed on the whole dataframe, by default False shrink : bool, optional - Indicates if the elementwise imputation method returns a single value, by default False + Indicates if the elementwise imputation method returns a single value, + by default False random_state : Union[None, int, np.random.RandomState], optional Controls the randomness of the fit_transform, by default None imputer_params: Tuple[str, ...] - List of parameters of the imputer, which can be specified globally or columnwise + List of parameters of the imputer, which can be specified globally or + columnwise groups: Tuple[str, ...] List of column names to group by, by default [] + """ def __init__( @@ -67,9 +61,11 @@ def __init__( self.missing_values = np.nan def get_hyperparams(self, col: Optional[str] = None): - """ - Filter hyperparameters based on the specified column, the dictionary keys in the form - name_params/column are only relevent for the specified column and are filtered accordingly. + """Filter hyperparameters based on the specified column. + + The dictionary keys in the form + name_params/column are only relevent for the specified column and + are filtered accordingly. Parameters ---------- @@ -96,8 +92,7 @@ def get_hyperparams(self, col: Optional[str] = None): return hyperparams def _check_dataframe(self, X: NDArray): - """ - Checks that the input X is a dataframe, otherwise raises an error. + """Check that the input X is a dataframe, otherwise raises an error. Parameters ---------- @@ -108,32 +103,37 @@ def _check_dataframe(self, X: NDArray): ------ ValueError Input has to be a pandas.DataFrame. + """ if not isinstance(X, (pd.DataFrame)): raise NotDataFrame(type(X)) def _more_tags(self): - """ - This method indicates that this class allows inputs with categorical data and nans. It - modifies the behaviour of the functions checking data. - """ - return {"X_types": ["2darray", "categorical", "string"], "allow_nan": True} + """Indicate this class allows inputs with categorical data and nans. - def fit(self, X: pd.DataFrame, y=None) -> Self: + It modifies the behaviour of the functions checking data. """ - Fit the imputer on X. + return { + "X_types": ["2darray", "categorical", "string"], + "allow_nan": True, + } + + def fit(self, X: pd.DataFrame, y: pd.DataFrame = None) -> "_Imputer": + """Fit the imputer on X. Parameters ---------- X : pd.DataFrame Data matrix on which the Imputer must be fitted. + y : pd.DataFrame + None. Returns ------- self : Self Returns self. 
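`get_hyperparams` filters keys written as `name_param/column` so that a column-specific value overrides the global one for that column only. A plausible reimplementation of that convention, for illustration; it is not the library's exact code and the helper name is made up:

def filter_hyperparams(params: dict, col: str) -> dict:
    # Global keys first, then column-specific "name/col" overrides win.
    out = {key: value for key, value in params.items() if "/" not in key}
    for key, value in params.items():
        if "/" in key:
            name, target = key.split("/", 1)
            if target == col:
                out[name] = value
    return out

params = {"method": "linear", "order": 1, "order/var1": 3}
print(filter_hyperparams(params, "var1"))  # {'method': 'linear', 'order': 3}
print(filter_hyperparams(params, "var2"))  # {'method': 'linear', 'order': 1}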
- """ + """ df = utils._validate_input(X) self.n_features_in_ = len(df.columns) @@ -143,11 +143,15 @@ def fit(self, X: pd.DataFrame, y=None) -> Self: self.columns_ = tuple(df.columns) self._rng = sku.check_random_state(self.random_state) - if hasattr(self, "estimator") and hasattr(self.estimator, "random_state"): + if hasattr(self, "estimator") and hasattr( + self.estimator, "random_state" + ): self.estimator.random_state = self._rng if self.groups: - self.ngroups_ = df.groupby(list(self.groups)).ngroup().rename("_ngroup") + self.ngroups_ = ( + df.groupby(list(self.groups)).ngroup().rename("_ngroup") + ) else: self.ngroups_ = pd.Series(0, index=df.index).rename("_ngroup") @@ -161,12 +165,14 @@ def fit(self, X: pd.DataFrame, y=None) -> Self: return self def transform(self, X: pd.DataFrame) -> pd.DataFrame: - """ - Returns a dataframe with same shape as `X`, unchanged values, where all nans are replaced - by non-nan values. Depending on the imputer parameters, the dataframe can be imputed with + """Transform/impute a dataframe. + + It retruns a dataframe with same shape as `X`, + unchanged values, where all nans are replaced by non-nan values. + Depending on the imputer parameters, the dataframe can be imputed with columnwise and/or groupwise methods. - Also works for numpy arrays, returning numpy arrays, but the use of pandas dataframe is - advised. + Also works for numpy arrays, returning numpy arrays, but the use of + pandas dataframe is advised. Parameters ---------- @@ -177,12 +183,13 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: ------- pd.DataFrame Imputed dataframe. - """ + """ df = utils._validate_input(X) if tuple(df.columns) != self.columns_: raise ValueError( - """The number of features is different from the counterpart in fit. + """The number of features is different + from the counterpart in fit. Reshape your data""" ) @@ -198,7 +205,9 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: if self.columnwise: df_imputed = df.copy() for col in cols_with_nans: - df_imputed[col] = self._transform_allgroups(df[[col]], col=col) + df_imputed[col] = self._transform_allgroups( + df[[col]], col=col + ) else: df_imputed = self._transform_allgroups(df) @@ -207,29 +216,35 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: return df_imputed - def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame: - """ - Returns a dataframe with same shape as `X`, unchanged values, where all nans are replaced - by non-nan values. - Depending on the imputer parameters, the dataframe can be imputed with columnwise and/or - groupwise methods. + def fit_transform( + self, X: pd.DataFrame, y: pd.DataFrame = None + ) -> pd.DataFrame: + """Return a imputed dataframe. + + The retruned df has same shape as `X`, with unchanged values, + but all nans are replaced by non-nan values. + Depending on the imputer parameters, the dataframe can be imputed + with columnwise and/or groupwise methods. Parameters ---------- X : pd.DataFrame Dataframe to impute. + y : pd.DataFrame + None Returns ------- pd.DataFrame Imputed dataframe. + """ self.fit(X) return self.transform(X) def _fit_transform_fallback(self, df: pd.DataFrame) -> pd.DataFrame: - """ - Impute `df` by the median of each column if it still contains missing values. + """Impute `df` with each column's median if missing values remain. + This can introduce data leakage for forward imputers if unchecked. 
Parameters @@ -241,6 +256,7 @@ def _fit_transform_fallback(self, df: pd.DataFrame) -> pd.DataFrame: ------- pd.DataFrame Dataframe df imputed by the median of each column. + """ self._check_dataframe(df) cols_with_nan = df.columns[df.isna().any()] @@ -250,9 +266,12 @@ def _fit_transform_fallback(self, df: pd.DataFrame) -> pd.DataFrame: df[col] = df[col].fillna(df[col].mode()[0]) return df - def _fit_allgroups(self, df: pd.DataFrame, col: str = "__all__") -> Self: - """ - Fits the Imputer either on a column, for a columnwise setting, on or all columns. + def _fit_allgroups( + self, df: pd.DataFrame, col: str = "__all__" + ) -> "_Imputer": + """Fit the imputer. + + Either on a column, for a columnwise setting, on or all columns. Parameters ---------- @@ -270,8 +289,8 @@ def _fit_allgroups(self, df: pd.DataFrame, col: str = "__all__") -> Self: ------ ValueError Input has to be a pandas.DataFrame. - """ + """ self._check_dataframe(df) fun_on_col = partial(self._fit_element, col=col) if self.groups: @@ -283,16 +302,14 @@ def _fit_allgroups(self, df: pd.DataFrame, col: str = "__all__") -> Self: return self def _setup_fit(self) -> None: - """ - Setup step of the fit function, before looping over the columns. - """ - self._dict_fitting: Dict[str, Any] = dict() + """Set up step of the fit function, before looping over the columns.""" + self._dict_fitting: Dict[str, Any] = {} return - def _apply_groupwise(self, fun: Callable, df: pd.DataFrame, **kwargs) -> Any: - """ - Applies the function `fun`in a groupwise manner to the dataframe `df`. - + def _apply_groupwise( + self, fun: Callable, df: pd.DataFrame, **kwargs + ) -> Any: + """Apply the function `fun`in a groupwise manner to the dataframe `df`. Parameters ---------- @@ -300,11 +317,14 @@ def _apply_groupwise(self, fun: Callable, df: pd.DataFrame, **kwargs) -> Any: Function applied groupwise to the dataframe with arguments kwargs df : pd.DataFrame Dataframe on which the function is applied + **kwargs: dict + Additional arguments Returns ------- Any Depends on the function signature + """ self._check_dataframe(df) fun_on_col = partial(fun, **kwargs) @@ -317,11 +337,15 @@ def _apply_groupwise(self, fun: Callable, df: pd.DataFrame, **kwargs) -> Any: else: return fun_on_col(df) - def _transform_allgroups(self, df: pd.DataFrame, col: str = "__all__") -> pd.DataFrame: - """ - Impute `df` by applying the specialized method `transform_element` on each group, if - groups have been given. If the method leaves nan, `fit_transform_fallback` is called in - order to return a dataframe without nan. + def _transform_allgroups( + self, df: pd.DataFrame, col: str = "__all__" + ) -> pd.DataFrame: + """Impute `df`. + + It doe sit by applying the specialized method `transform_element` + on each group, if groups have been given. If the method leaves nan, + `fit_transform_fallback` is called in order to return a dataframe + without nan. Parameters ---------- @@ -339,10 +363,13 @@ def _transform_allgroups(self, df: pd.DataFrame, col: str = "__all__") -> pd.Dat ------ NotDataFrame Input has to be a pandas.DataFrame. 
+ """ self._check_dataframe(df) df = df.copy() - imputation_values = self._apply_groupwise(self._transform_element, df, col=col) + imputation_values = self._apply_groupwise( + self._transform_element, df, col=col + ) df = df.fillna(imputation_values) # fill na by applying imputation method without groups @@ -353,10 +380,13 @@ def _transform_allgroups(self, df: pd.DataFrame, col: str = "__all__") -> pd.Dat return df @abstractmethod - def _fit_element(self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0) -> Any: - """ - Fits the imputer on `df`, at the group and/or column level depending onself.groups and - self.columnwise. + def _fit_element( + self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0 + ) -> Any: + """Fit the imputer on `df`. + + It does it at the group and/or column level depending onself.groups + and self.columnwise. Parameters ---------- @@ -376,6 +406,7 @@ def _fit_element(self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0) ------ NotDataFrame Input has to be a pandas.DataFrame. + """ self._check_dataframe(df) return self @@ -384,9 +415,10 @@ def _fit_element(self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0) def _transform_element( self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0 ) -> pd.DataFrame: - """ - Transforms the dataframe `df`, at the group and/or column level depending onself.groups and - self.columnwise. + """Transform the dataframe `df`. + + It does it at the group and/or column level depending onself.groups + and self.columnwise. Parameters ---------- @@ -406,14 +438,14 @@ def _transform_element( ------ NotDataFrame Input has to be a pandas.DataFrame. + """ self._check_dataframe(df) return df class ImputerOracle(_Imputer): - """ - Perfect imputer, requires to know real values. + """Perfect imputer, requires to know real values. Used as a reference to evaluate imputation metrics. @@ -423,6 +455,7 @@ class ImputerOracle(_Imputer): Dataframe containing real values. groups: Tuple[str, ...] List of column names to group by, by default [] + """ def __init__( @@ -431,38 +464,45 @@ def __init__( super().__init__() def set_solution(self, df: pd.DataFrame): - """Sets the true values to be returned by the oracle. + """Set the true values to be returned by the oracle. Parameters ---------- - X : pd.DataFrame + df : pd.DataFrame True dataset with mask + """ self.df_solution = df def transform(self, X: pd.DataFrame) -> pd.DataFrame: - """Impute df with corresponding known values + """Impute df with corresponding known values. Parameters ---------- - df : pd.DataFrame + X : pd.DataFrame dataframe to impute + Returns ------- pd.DataFrame dataframe imputed with premasked values + """ df = utils._validate_input(X) if tuple(df.columns) != self.columns_: raise ValueError( - """The number of features is different from the counterpart in fit. + """The number of features is different from + the counterpart in fit. Reshape your data""" ) if hasattr(self, "df_solution"): df_imputed = df.fillna(self.df_solution) else: - warnings.warn("OracleImputer not initialized! Returning imputation with zeros") + warnings.warn( + "OracleImputer not initialized! " + "Returning imputation with zeros" + ) df_imputed = df.fillna(0) if isinstance(X, (np.ndarray)): @@ -471,8 +511,10 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: class ImputerSimple(_Imputer): - """ - Impute each column by its mean, its median or its mode (if its categorical). + """Simple imputer. 
+ + Impute each column by its mean, its median or its mode + (if its categorical). Parameters ---------- @@ -485,27 +527,37 @@ class ImputerSimple(_Imputer): >>> import pandas as pd >>> from qolmat.imputations import imputers >>> imputer = imputers.ImputerSimple() - >>> df = pd.DataFrame(data=[[1, 1, 1, 1], - ... [np.nan, np.nan, np.nan, np.nan], - ... [1, 2, 2, 5], - ... [2, 2, 2, 2]], - ... columns=["var1", "var2", "var3", "var4"]) + >>> df = pd.DataFrame( + ... data=[ + ... [1, 1, 1, 1], + ... [np.nan, np.nan, np.nan, np.nan], + ... [1, 2, 2, 5], + ... [2, 2, 2, 2], + ... ], + ... columns=["var1", "var2", "var3", "var4"], + ... ) >>> imputer.fit_transform(df) var1 var2 var3 var4 0 1.0 1.0 1.0 1.0 1 1.0 2.0 2.0 2.0 2 1.0 2.0 2.0 5.0 3 2.0 2.0 2.0 2.0 + """ - def __init__(self, groups: Tuple[str, ...] = (), strategy="median") -> None: + def __init__( + self, groups: Tuple[str, ...] = (), strategy="median" + ) -> None: super().__init__(groups=groups, columnwise=True, shrink=False) self.strategy = strategy - def _fit_element(self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0) -> Any: - """ - Fits the imputer on `df`, at the group and/or column level depending onself.groups and - self.columnwise. + def _fit_element( + self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0 + ) -> Any: + """Fit the imputer on `df`. + + It does it at the group and/or column level depending onself.groups + and self.columnwise. Parameters ---------- @@ -525,6 +577,7 @@ def _fit_element(self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0) ------ NotDataFrame Input has to be a pandas.DataFrame. + """ if pd.api.types.is_numeric_dtype(df[col]): model = skl.impute.SimpleImputer(strategy=self.strategy) @@ -535,8 +588,9 @@ def _fit_element(self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0) def _transform_element( self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0 ) -> pd.DataFrame: - """ - Transforms the dataframe `df`, at the group and/or column level depending on self.groups + """Transform the dataframe `df`. + + It does it at the group and/or column level depending onself.groups and self.columnwise. Parameters @@ -557,6 +611,7 @@ def _transform_element( ------ NotDataFrame Input has to be a pandas.DataFrame. + """ model = self._dict_fitting[col][ngroup] X_imputed = model.fit_transform(df) @@ -564,8 +619,7 @@ def _transform_element( class ImputerShuffle(_Imputer): - """ - Impute using random samples from the considered column. + """Impute using random samples from the considered column. Parameters ---------- @@ -580,17 +634,22 @@ class ImputerShuffle(_Imputer): >>> import pandas as pd >>> from qolmat.imputations import imputers >>> imputer = imputers.ImputerShuffle(random_state=42) - >>> df = pd.DataFrame(data=[[1, 1, 1, 1], - ... [np.nan, np.nan, np.nan, np.nan], - ... [1, 2, 2, 5], - ... [2, 2, 2, 2]], - ... columns=["var1", "var2", "var3", "var4"]) + >>> df = pd.DataFrame( + ... data=[ + ... [1, 1, 1, 1], + ... [np.nan, np.nan, np.nan, np.nan], + ... [1, 2, 2, 5], + ... [2, 2, 2, 2], + ... ], + ... columns=["var1", "var2", "var3", "var4"], + ... ) >>> imputer.fit_transform(df) var1 var2 var3 var4 0 1.0 1.0 1.0 1.0 1 2.0 1.0 2.0 2.0 2 1.0 2.0 2.0 5.0 3 2.0 2.0 2.0 2.0 + """ def __init__( @@ -598,14 +657,17 @@ def __init__( groups: Tuple[str, ...] 
= (), random_state: Union[None, int, np.random.RandomState] = None, ) -> None: - super().__init__(groups=groups, columnwise=True, random_state=random_state) + super().__init__( + groups=groups, columnwise=True, random_state=random_state + ) def _transform_element( self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0 ) -> pd.DataFrame: - """ - Transforms the dataframe `df`, at the group and/or column level depending onself.groups and - self.columnwise. + """Transform the dataframe `df`. + + It does it at the group and/or column level depending onself.groups + and self.columnwise. Parameters ---------- @@ -625,6 +687,7 @@ def _transform_element( ------ NotDataFrame Input has to be a pandas.DataFrame. + """ self._check_dataframe(df) n_missing = df.isna().sum().sum() @@ -640,9 +703,10 @@ def _transform_element( class ImputerLOCF(_Imputer): - """ - Impute by the last available value of the column. Relevent for time series. + """LOCF imputer. + It imputes by the last available value of the column. + Relevant for time series. If the first observations are missing, it is imputed by a NOCB Parameters @@ -656,17 +720,22 @@ class ImputerLOCF(_Imputer): >>> import pandas as pd >>> from qolmat.imputations import imputers >>> imputer = imputers.ImputerLOCF() - >>> df = pd.DataFrame(data=[[1, 1, 1, 1], - ... [np.nan, np.nan, np.nan, np.nan], - ... [1, 2, 2, 5], - ... [2, 2, 2, 2]], - ... columns=["var1", "var2", "var3", "var4"]) + >>> df = pd.DataFrame( + ... data=[ + ... [1, 1, 1, 1], + ... [np.nan, np.nan, np.nan, np.nan], + ... [1, 2, 2, 5], + ... [2, 2, 2, 2], + ... ], + ... columns=["var1", "var2", "var3", "var4"], + ... ) >>> imputer.fit_transform(df) var1 var2 var3 var4 0 1.0 1.0 1.0 1.0 1 1.0 1.0 1.0 1.0 2 1.0 2.0 2.0 5.0 3 2.0 2.0 2.0 2.0 + """ def __init__( @@ -678,9 +747,10 @@ def __init__( def _transform_element( self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0 ) -> pd.DataFrame: - """ - Transforms the dataframe `df`, at the group and/or column level depending onself.groups and - self.columnwise. + """Transform the dataframe `df`. + + It does it at the group and/or column level depending on self.groups + and self.columnwise. Parameters ---------- @@ -700,6 +770,7 @@ def _transform_element( ------ NotDataFrame Input has to be a pandas.DataFrame. + """ self._check_dataframe(df) df_out = df.copy() @@ -709,7 +780,8 @@ def _transform_element( class ImputerNOCB(_Imputer): - """ + """NOCB imputer. + Impute by the next available value of the column. Relevent for time series. If the last observation is missing, it is imputed by a LOCF. @@ -724,17 +796,22 @@ class ImputerNOCB(_Imputer): >>> import pandas as pd >>> from qolmat.imputations import imputers >>> imputer = imputers.ImputerNOCB() - >>> df = pd.DataFrame(data=[[1, 1, 1, 1], - ... [np.nan, np.nan, np.nan, np.nan], - ... [1, 2, 2, 5], - ... [2, 2, 2, 2]], - ... columns=["var1", "var2", "var3", "var4"]) + >>> df = pd.DataFrame( + ... data=[ + ... [1, 1, 1, 1], + ... [np.nan, np.nan, np.nan, np.nan], + ... [1, 2, 2, 5], + ... [2, 2, 2, 2], + ... ], + ... columns=["var1", "var2", "var3", "var4"], + ... ) >>> imputer.fit_transform(df) var1 var2 var3 var4 0 1.0 1.0 1.0 1.0 1 1.0 2.0 2.0 5.0 2 1.0 2.0 2.0 5.0 3 2.0 2.0 2.0 2.0 + """ def __init__( @@ -746,9 +823,10 @@ def __init__( def _transform_element( self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0 ) -> pd.DataFrame: - """ - Transforms the dataframe `df`, at the group and/or column level depending onself.groups and - self.columnwise. 
+ """Transform the dataframe `df`. + + It does it at the group and/or column level depending on self.groups + and self.columnwise. Parameters ---------- @@ -768,6 +846,7 @@ def _transform_element( ------ NotDataFrame Input has to be a pandas.DataFrame. + """ self._check_dataframe(df) df_out = df.copy() @@ -777,10 +856,11 @@ def _transform_element( class ImputerInterpolation(_Imputer): - """ - This class implements a way to impute time series using some interpolation strategies - suppoted by pd.Series.interpolate, such as "linear", "slinear", "quadratic", ... - By default, linear interpolation. + """Interpolation imputer. + + This class implements a way to impute time series using some interpolation + strategies suppoted by pd.Series.interpolate, such as "linear", "slinear", + "quadratic", ... By default, linear interpolation. As for pd.Series.interpolate, if "method" is "spline" or "polynomial", an "order" has to be passed. @@ -789,14 +869,15 @@ class ImputerInterpolation(_Imputer): groups: Tuple[str, ...] List of column names to group by, by default [] method : Optional[str] = "linear" - name of the method for interpolation: "linear", "cubic", "spline", "slinear", ... - see pd.Series.interpolate for more example. + name of the method for interpolation: "linear", "cubic", "spline", + "slinear", ... see pd.Series.interpolate for more example. By default, the value is set to "linear". order : Optional[int] order for the spline interpolation col_time : Optional[str] - Name of the column representing the time index to use for the interpolation. If None, the - index is used assuming it is one-dimensional. + Name of the column representing the time index to use for the + interpolation. If None, the index is used assuming it + is one-dimensional. Examples -------- @@ -804,17 +885,22 @@ class ImputerInterpolation(_Imputer): >>> import pandas as pd >>> from qolmat.imputations import imputers >>> imputer = imputers.ImputerInterpolation(method="spline", order=2) - >>> df = pd.DataFrame(data=[[1, 1, 1, 1], - ... [np.nan, np.nan, np.nan, np.nan], - ... [1, 2, 2, 5], - ... [2, 2, 2, 2]], - ... columns=["var1", "var2", "var3", "var4"]) + >>> df = pd.DataFrame( + ... data=[ + ... [1, 1, 1, 1], + ... [np.nan, np.nan, np.nan, np.nan], + ... [1, 2, 2, 5], + ... [2, 2, 2, 2], + ... ], + ... columns=["var1", "var2", "var3", "var4"], + ... ) >>> imputer.fit_transform(df) var1 var2 var3 var4 0 1.000000 1.000000 1.000000 1.000000 1 0.666667 1.666667 1.666667 4.666667 2 1.000000 2.000000 2.000000 5.000000 3 2.000000 2.000000 2.000000 2.000000 + """ def __init__( @@ -824,7 +910,9 @@ def __init__( order: Optional[int] = None, col_time: Optional[str] = None, ) -> None: - super().__init__(imputer_params=("method", "order"), groups=groups, columnwise=True) + super().__init__( + imputer_params=("method", "order"), groups=groups, columnwise=True + ) self.method = method self.order = order self.col_time = col_time @@ -832,9 +920,10 @@ def __init__( def _transform_element( self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0 ) -> pd.DataFrame: - """ - Transforms the dataframe `df`, at the group and/or column level depending onself.groups and - self.columnwise. + """Transform the dataframe `df`. + + It does it at the group and/or column level depending on self.groups + and self.columnwise. Parameters ---------- @@ -854,6 +943,7 @@ def _transform_element( ------ NotDataFrame Input has to be a pandas.DataFrame. 
+ """ self._check_dataframe(df) hyperparams = self.get_hyperparams(col=col) @@ -869,10 +959,11 @@ def _transform_element( class ImputerResiduals(_Imputer): - """ + """Residual imputer. + This class implements an imputation method based on a STL decomposition. - The series are de-seasonalised, de-trended, residuals are imputed, then residuals are - re-seasonalised and re-trended. + The series are de-seasonalised, de-trended, residuals are imputed, + then residuals are re-seasonalised and re-trended. Parameters ---------- @@ -883,7 +974,8 @@ class ImputerResiduals(_Imputer): the index of x does not have a frequency. Overrides default periodicity of x if x is a pandas object with a timeseries index. model_tsa : Optional[str] - Type of seasonal component "additive" or "multiplicative". Abbreviations are accepted. + Type of seasonal component "additive" or "multiplicative". + Abbreviations are accepted. By default, the value is set to "additive" extrapolate_trend : int or 'freq', optional If set to > 0, the trend resulting from the convolution is @@ -900,15 +992,20 @@ class ImputerResiduals(_Imputer): >>> import pandas as pd >>> from qolmat.imputations.imputers import ImputerResiduals >>> np.random.seed(100) - >>> df = pd.DataFrame(index=pd.date_range('2015-01-01','2020-01-01')) + >>> df = pd.DataFrame(index=pd.date_range("2015-01-01", "2020-01-01")) >>> mean = 5 >>> offset = 10 - >>> df['y'] = np.cos(df.index.dayofyear/365*2*np.pi - np.pi)*mean + offset + >>> df["y"] = ( + ... np.cos(df.index.dayofyear / 365 * 2 * np.pi - np.pi) * mean + ... + offset + ... ) >>> trend = 5 - >>> df['y'] = df['y'] + trend*np.arange(0,df.shape[0])/df.shape[0] + >>> df["y"] = df["y"] + trend * np.arange(0, df.shape[0]) / df.shape[0] >>> noise_mean = 0 >>> noise_var = 2 - >>> df['y'] = df['y'] + np.random.normal(noise_mean, noise_var, df.shape[0]) + >>> df["y"] = df["y"] + np.random.normal( + ... noise_mean, noise_var, df.shape[0] + ... ) >>> mask = np.random.choice([True, False], size=df.shape) >>> df = df.mask(mask) >>> imputor = ImputerResiduals(period=365, model_tsa="additive") @@ -927,6 +1024,7 @@ class ImputerResiduals(_Imputer): 2020-01-01 12.780517 [1827 rows x 1 columns] + """ def __init__( @@ -955,9 +1053,10 @@ def __init__( def _transform_element( self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0 ) -> pd.DataFrame: - """ - Transforms the dataframe `df`, at the group and/or column level depending onself.groups and - self.columnwise. + """Transform the dataframe `df`. + + It does it at the group and/or column level depending on self.groups + and self.columnwise. Parameters ---------- @@ -977,13 +1076,16 @@ def _transform_element( ------ NotDataFrame Input has to be a pandas.DataFrame. 
+ """ self._check_dataframe(df) hyperparams = self.get_hyperparams(col=col) name = df.columns[0] values = df[df.columns[0]] values_interp = ( - values.interpolate(method=hyperparams["method_interpolation"]).ffill().bfill() + values.interpolate(method=hyperparams["method_interpolation"]) + .ffill() + .bfill() ) result = tsa_seasonal.seasonal_decompose( values_interp, @@ -996,15 +1098,18 @@ def _transform_element( residuals[values.isna()] = np.nan residuals = ( - residuals.interpolate(method=hyperparams["method_interpolation"]).ffill().bfill() + residuals.interpolate(method=hyperparams["method_interpolation"]) + .ffill() + .bfill() + ) + df_result = pd.DataFrame( + {name: result.seasonal + result.trend + residuals} ) - df_result = pd.DataFrame({name: result.seasonal + result.trend + residuals}) return df_result class ImputerKNN(_Imputer): - """ - This class implements an imputation by the k-nearest neighbors. + """K-nearest neighbors imputer. Parameters ---------- @@ -1029,17 +1134,22 @@ class ImputerKNN(_Imputer): >>> import pandas as pd >>> from qolmat.imputations import imputers >>> imputer = imputers.ImputerKNN(n_neighbors=2) - >>> df = pd.DataFrame(data=[[1, 1, 1, 1], - ... [np.nan, np.nan, np.nan, np.nan], - ... [1, 2, 2, 5], - ... [2, 2, 2, 2]], - ... columns=["var1", "var2", "var3", "var4"]) + >>> df = pd.DataFrame( + ... data=[ + ... [1, 1, 1, 1], + ... [np.nan, np.nan, np.nan, np.nan], + ... [1, 2, 2, 5], + ... [2, 2, 2, 2], + ... ], + ... columns=["var1", "var2", "var3", "var4"], + ... ) >>> imputer.fit_transform(df) var1 var2 var3 var4 0 1.000000 1.000000 1.000000 1.000000 1 1.333333 1.666667 1.666667 2.666667 2 1.000000 2.000000 2.000000 5.000000 3 2.000000 2.000000 2.000000 2.000000 + """ def __init__( @@ -1049,15 +1159,20 @@ def __init__( weights: str = "distance", ) -> None: super().__init__( - imputer_params=("n_neighbors", "weights"), groups=groups, columnwise=False + imputer_params=("n_neighbors", "weights"), + groups=groups, + columnwise=False, ) self.n_neighbors = n_neighbors self.weights = weights - def _fit_element(self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0) -> KNNImputer: - """ - Fits the imputer on `df`, at the group and/or column level depending onself.groups and - self.columnwise. + def _fit_element( + self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0 + ) -> KNNImputer: + """Fit the imputer on `df`. + + It does it at the group and/or column level depending on self.groups + and self.columnwise. Parameters ---------- @@ -1077,9 +1192,13 @@ def _fit_element(self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0) ------ NotDataFrame Input has to be a pandas.DataFrame. + """ self._check_dataframe(df) - assert col == "__all__" + if col != "__all__": + raise ValueError( + f"col must be '__all__', but '{col}' has been passed." + ) hyperparameters = self.get_hyperparams() model = KNNImputer(metric="nan_euclidean", **hyperparameters) model = model.fit(df) @@ -1088,9 +1207,10 @@ def _fit_element(self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0) def _transform_element( self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0 ) -> pd.DataFrame: - """ - Transforms the dataframe `df`, at the group and/or column level depending onself.groups and - self.columnwise. + """Transform the dataframe `df`. + + It does it at the group and/or column level depending on self.groups + and self.columnwise. Parameters ---------- @@ -1110,31 +1230,37 @@ def _transform_element( ------ NotDataFrame Input has to be a pandas.DataFrame.
+ """ self._check_dataframe(df) - assert col == "__all__" + if col != "__all__": + raise ValueError( + f"col must be '__all__', but '{col}' has been passed." + ) model = self._dict_fitting["__all__"][ngroup] X_imputed = model.fit_transform(df) return pd.DataFrame(data=X_imputed, columns=df.columns, index=df.index) class ImputerMICE(_Imputer): - """ - Wrapper of the class sklearn.impute.IterativeImputer in our framework. This imputer relies - on a estimator which is iteratively + """MICE imputer. + + Wrapper of the class sklearn.impute.IterativeImputer in our framework. + This imputer relies on an estimator which is applied iteratively. Parameters ---------- groups : Tuple[str, ...], optional - _description_, by default () + specific groups for groupby, by default () estimator : Optional[BaseEstimator], optional - _description_, by default None + estimator to use, by default None random_state : Union[None, int, np.random.RandomState], optional - _description_, by default None + random state, by default None sample_posterior : bool, optional - _description_, by default False + whether to sample from the predictive posterior, by default False max_iter : int, optional - _description_, by default 100 + maximum number of iterations, by default 100 + """ def __init__( @@ -1158,9 +1284,10 @@ def __init__( def _fit_element( self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0 ) -> IterativeImputer: - """ - Fits the imputer on `df`, at the group and/or column level depending onself.groups and - self.columnwise. + """Fit the imputer on `df`. + + It does it at the group and/or column level depending on self.groups + and self.columnwise. Parameters ---------- @@ -1180,9 +1307,13 @@ def _fit_element( ------ NotDataFrame Input has to be a pandas.DataFrame. + """ self._check_dataframe(df) - assert col == "__all__" + if col != "__all__": + raise ValueError( + f"col must be '__all__', but '{col}' has been passed." + ) hyperparameters = self.get_hyperparams() model = IterativeImputer(estimator=self.estimator, **hyperparameters) model = model.fit(df) @@ -1192,8 +1323,9 @@ def _fit_element( def _transform_element( self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0 ) -> pd.DataFrame: - """ - Transforms the dataframe `df`, at the group and/or column level depending on self.groups + """Transform the dataframe `df`. + + It does it at the group and/or column level depending on self.groups and self.columnwise. Parameters @@ -1214,20 +1346,24 @@ def _transform_element( ------ NotDataFrame Input has to be a pandas.DataFrame. - """ + """ self._check_dataframe(df) - assert col == "__all__" + if col != "__all__": + raise ValueError( + f"col must be '__all__', but '{col}' has been passed." + ) model = self._dict_fitting["__all__"][ngroup] X_imputed = model.fit_transform(df) return pd.DataFrame(data=X_imputed, columns=df.columns, index=df.index) class ImputerRegressor(_Imputer): - """ + """Regressor imputer. + This class implements a regression imputer in the multivariate case. - It imputes each column using a single fit-predict for a given estimator, based on the colunms - which have no missing values. + It imputes each column using a single fit-predict for a given estimator, + based on the columns which have no missing values.
Parameters ---------- @@ -1238,8 +1374,8 @@ class ImputerRegressor(_Imputer): handler_nan : str Can be `fit, `row` or `column`: - if `fit`, the estimator is assumed to be robust to missing values - - if `row` all non complete rows will be removed from the train dataset, and will not be - used for the inferance, + - if `row` all non complete rows will be removed from the + train dataset, and will not be used for the inference, - if `column` all non complete columns will be ignored. By default, `row` random_state : Union[None, int, np.random.RandomState], optional @@ -1252,17 +1388,22 @@ class ImputerRegressor(_Imputer): >>> from qolmat.imputations import imputers >>> from sklearn.ensemble import ExtraTreesRegressor >>> imputer = imputers.ImputerRegressor(estimator=ExtraTreesRegressor()) - >>> df = pd.DataFrame(data=[[1, 1, 1, 1], - ... [np.nan, np.nan, np.nan, np.nan], - ... [1, 2, 2, 5], - ... [2, 2, 2, 2]], - ... columns=["var1", "var2", "var3", "var4"]) + >>> df = pd.DataFrame( + ... data=[ + ... [1, 1, 1, 1], + ... [np.nan, np.nan, np.nan, np.nan], + ... [1, 2, 2, 5], + ... [2, 2, 2, 2], + ... ], + ... columns=["var1", "var2", "var3", "var4"], + ... ) >>> imputer.fit_transform(df) var1 var2 var3 var4 0 1.0 1.0 1.0 1.0 1 1.0 2.0 2.0 2.0 2 1.0 2.0 2.0 5.0 3 2.0 2.0 2.0 2.0 + """ def __init__( @@ -1288,7 +1429,29 @@ def _predict_estimator(self, estimator, X) -> pd.Series: pred = estimator.predict(X) return pd.Series(pred, index=X.index) - def get_Xy_valid(self, df: pd.DataFrame, col: str) -> Tuple[pd.DataFrame, pd.Series]: + def get_Xy_valid( + self, df: pd.DataFrame, col: str + ) -> Tuple[pd.DataFrame, pd.Series]: + """Get a valid couple (X,y). + + Parameters + ---------- + df : pd.DataFrame + Input dataframe + col : str + column name. + + Returns + ------- + Tuple[pd.DataFrame, pd.Series] + Valid X and y. + + Raises + ------ + ValueError + _description_ + + """ X = df.drop(columns=col, errors="ignore") if self.handler_nan == "none": pass @@ -1298,7 +1461,8 @@ def get_Xy_valid(self, df: pd.DataFrame, col: str) -> Tuple[pd.DataFrame, pd.Ser X = X.dropna(how="any", axis=1) else: raise ValueError( - f"Value '{self.handler_nan}' is not correct for argument `handler_nan'" + f"Value '{self.handler_nan}' is not correct " + "for argument `handler_nan'." ) # X = pd.get_dummies(X, prefix_sep="=") y = df.loc[X.index, col] @@ -1307,8 +1471,9 @@ def get_Xy_valid(self, df: pd.DataFrame, col: str) -> Tuple[pd.DataFrame, pd.Ser def _fit_element( self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0 ) -> Optional[BaseEstimator]: - """ - Fits the imputer on `df`, at the group and/or column level depending onself.groups and + """Fit the imputer on `df`. + + It does it at the group and/or column level depending onself.groups and self.columnwise. Parameters @@ -1329,13 +1494,18 @@ def _fit_element( ------ NotDataFrame Input has to be a pandas.DataFrame. + """ self._check_dataframe(df) - assert col == "__all__" + if col != "__all__": + raise ValueError( + f"col must be '__all__', but '{col}' has been passed." 
+ ) cols_with_nans = df.columns[df.isna().any()] - dict_estimators: Dict[str, BaseEstimator] = dict() + dict_estimators: Dict[str, BaseEstimator] = {} for col in cols_with_nans: - # Selects only the valid values in the Train Set according to the chosen method + # Selects only the valid values in the Train Set according + # to the chosen method X, y = self.get_Xy_valid(df, col) # Selects only non-NaN values for the Test Set @@ -1343,7 +1513,8 @@ def _fit_element( X = X[~is_na] y = y[~is_na] - # Train the model according to an ML or DL method and after predict the imputation + # Train the model according to an ML or DL method and + # after predict the imputation if not X.empty: estimator = copy.deepcopy(self.estimator) dict_estimators[col] = self._fit_estimator(estimator, X, y) @@ -1354,9 +1525,10 @@ def _fit_element( def _transform_element( self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0 ) -> pd.DataFrame: - """ - Transforms the dataframe `df`, at the group and/or column level depending onself.groups and - self.columnwise. + """Transform the dataframe `df`. + + It does it at the group and/or column level depending onself.groups + and self.columnwise. Parameters ---------- @@ -1376,9 +1548,13 @@ def _transform_element( ------ NotDataFrame Input has to be a pandas.DataFrame. + """ self._check_dataframe(df) - assert col == "__all__" + if col != "__all__": + raise ValueError( + f"col must be '__all__', but '{col}' has been passed." + ) df_imputed = df.copy() cols_with_nans = df.columns[df.isna().any()] @@ -1402,10 +1578,12 @@ def _transform_element( class ImputerRpcaPcp(_Imputer): - """ - This class implements the Robust Principal Component Analysis imputation with Principal - Component Pursuit. The imputation minimizes a loss function combining a low-rank criterium on - the dataframe and a L1 penalization on the residuals. + """PCP RPCA imputer. + + This class implements the Robust Principal Component Analysis imputation + with Principal Component Pursuit. The imputation minimizes a loss function + combining a low-rank criterium on the dataframe and a L1 penalization on + the residuals. Parameters ---------- @@ -1414,9 +1592,11 @@ class ImputerRpcaPcp(_Imputer): columnwise : bool For the RPCA method to be applied columnwise (with reshaping of each column into an array) - or to be applied directly on the dataframe. By default, the value is set to False. + or to be applied directly on the dataframe. + By default, the value is set to False. random_state : Union[None, int, np.random.RandomState], optional Controls the randomness of the fit_transform, by default None + """ def __init__( @@ -1452,13 +1632,13 @@ def __init__( self.verbose = verbose def get_model(self, **hyperparams) -> rpca_pcp.RpcaPcp: - """ - Get the underlying model of the imputer based on its attributes. + """Get the underlying model of the imputer based on its attributes. Returns ------- rpca.RPCA RPCA model to be used in the fit and transform methods. + """ hyperparams = { key: hyperparams[key] @@ -1469,16 +1649,19 @@ def get_model(self, **hyperparams) -> rpca_pcp.RpcaPcp: "tolerance", ] } - model = rpca_pcp.RpcaPcp(random_state=self._rng, verbose=self.verbose, **hyperparams) + model = rpca_pcp.RpcaPcp( + random_state=self._rng, verbose=self.verbose, **hyperparams + ) return model def _transform_element( self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0 ) -> pd.DataFrame: - """ - Transforms the dataframe `df`, at the group and/or column level depending onself.groups and - self.columnwise. 
+ """Transform the dataframe `df`. + + It does it at the group and/or column level depending on self.groups + and self.columnwise. Parameters ---------- @@ -1498,6 +1681,7 @@ def _transform_element( ------ NotDataFrame Input has to be a pandas.DataFrame. + """ self._check_dataframe(df) hyperparams = self.get_hyperparams() @@ -1515,22 +1699,21 @@ def _transform_element( D_scale = (D - means) / stds M, A = model.decompose(D_scale, Omega) M = M * stds + means - A = A * stds + means M_final = utils.get_shape_original(M, X.shape) - A_final = utils.get_shape_original(A, X.shape) - X_imputed = M_final + A_final - df_imputed = pd.DataFrame(X_imputed, index=df.index, columns=df.columns) + df_imputed = pd.DataFrame(M_final, index=df.index, columns=df.columns) df_imputed = df.where(~df.isna(), df_imputed) return df_imputed class ImputerRpcaNoisy(_Imputer): - """ - This class implements the Robust Principal Component Analysis imputation with added noise. - The imputation minimizes a loss function combining a low-rank criterium on the dataframe and + """Noisy RPCA imputer. + + This class implements the Robust Principal Component Analysis imputation + with added noise. The imputation minimizes a loss function combining + a low-rank criterion on the dataframe and a L1 penalization on the residuals. Parameters @@ -1540,9 +1723,11 @@ class ImputerRpcaNoisy(_Imputer): columnwise : bool For the RPCA method to be applied columnwise (with reshaping of each column into an array) - or to be applied directly on the dataframe. By default, the value is set to False. + or to be applied directly on the dataframe. + By default, the value is set to False. random_state : Union[None, int, np.random.RandomState], optional Controls the randomness of the fit_transform, by default None + """ def __init__( @@ -1593,15 +1778,14 @@ def __init__( self.verbose = verbose def get_model(self, **hyperparams) -> rpca_noisy.RpcaNoisy: - """ - Get the underlying model of the imputer based on its attributes. + """Get the underlying model of the imputer based on its attributes. Returns ------- rpca.RPCA RPCA model to be used in the fit and transform methods. - """ + """ hyperparams = { key: hyperparams[key] for key in [ @@ -1615,15 +1799,18 @@ def get_model(self, **hyperparams) -> rpca_noisy.RpcaNoisy: "norm", ] } - model = rpca_noisy.RpcaNoisy(random_state=self._rng, verbose=self.verbose, **hyperparams) + model = rpca_noisy.RpcaNoisy( + random_state=self._rng, verbose=self.verbose, **hyperparams + ) return model def _fit_element( self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0 ) -> Tuple[NDArray, NDArray, NDArray]: - """ - Fits the imputer on `df`, at the group and/or column level depending on self.groups and - self.columnwise. + """Fit the imputer on `df`. + + It does it at the group and/or column level depending on self.groups + and self.columnwise. Parameters ---------- @@ -1646,6 +1833,7 @@ def _fit_element( ------ NotDataFrame Input has to be a pandas.DataFrame. + """ self._check_dataframe(df) hyperparams = self.get_hyperparams() @@ -1667,9 +1855,10 @@ def _fit_element( def _transform_element( self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0 ) -> pd.DataFrame: - """ - Transforms the dataframe `df`, at the group and/or column level depending onself.groups and - self.columnwise. + """Transform the dataframe `df`. + + It does it at the group and/or column level depending on self.groups + and self.columnwise.
Parameters ---------- @@ -1689,6 +1878,7 @@ def _transform_element( ------ NotDataFrame Input has to be a pandas.DataFrame. + """ self._check_dataframe(df) hyperparams = self.get_hyperparams() @@ -1705,7 +1895,6 @@ def _transform_element( D_scale = (D - means) / stds M, A = model.decompose_on_basis(D_scale, Omega, Q) M = M * stds + means - A = A * stds + means M_final = utils.get_shape_original(M, X.shape) @@ -1716,13 +1905,15 @@ def _transform_element( class ImputerSoftImpute(_Imputer): - """ - This class implements the Soft Impute method: + """SoftImpute imputer. - Hastie, Trevor, et al. Matrix completion and low-rank SVD via fast alternating least squares. - The Journal of Machine Learning Research 16.1 (2015): 3367-3402. + This class implements the Soft Impute method: + Hastie, Trevor, et al. Matrix completion and low-rank SVD via fast + alternating least squares. The Journal of Machine Learning Research 16.1 + (2015): 3367-3402. - This imputation technique is less robust than the RPCA, although it can provide faster. + This imputation technique is less robust than the RPCA, + although it can be faster. Parameters ---------- @@ -1731,9 +1922,11 @@ class ImputerSoftImpute(_Imputer): columnwise : bool For the RPCA method to be applied columnwise (with reshaping of each column into an array) - or to be applied directly on the dataframe. By default, the value is set to False. + or to be applied directly on the dataframe. + By default, the value is set to False. random_state : Union[None, int, np.random.RandomState], optional Controls the randomness of the fit_transform, by default None + """ def __init__( @@ -1769,13 +1962,13 @@ def __init__( self.verbose = verbose def get_model(self, **hyperparams) -> softimpute.SoftImpute: - """ - Get the underlying model of the imputer based on its attributes. + """Get the underlying model of the imputer based on its attributes. Returns ------- softimpute.SoftImpute Soft Impute model to be used in the transform method. + """ hyperparams = { key: hyperparams[key] @@ -1785,7 +1978,9 @@ def get_model(self, **hyperparams) -> softimpute.SoftImpute: "tolerance", ] } - model = softimpute.SoftImpute(random_state=self._rng, verbose=self.verbose, **hyperparams) + model = softimpute.SoftImpute( + random_state=self._rng, verbose=self.verbose, **hyperparams + ) return model @@ -1793,8 +1988,8 @@ def get_model(self, **hyperparams) -> softimpute.SoftImpute: # self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0 # ) -> softimpute.SoftImpute: # """ - # Fits the imputer on `df`, at the group and/or column level depending on - # self.groups and self.columnwise. + # Fits the imputer on `df`, at the group and/or column level depending + # on self.groups and self.columnwise. # Parameters # ---------- @@ -1825,9 +2020,10 @@ def get_model(self, **hyperparams) -> softimpute.SoftImpute: def _transform_element( self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0 ) -> pd.DataFrame: - """ - Transforms the dataframe `df`, at the group and/or column level depending onself.groups and - self.columnwise. + """Transform the dataframe `df`. + + It does it at the group and/or column level depending on self.groups + and self.columnwise. Parameters ---------- @@ -1847,6 +2043,7 @@ def _transform_element( ------ NotDataFrame Input has to be a pandas.DataFrame.
+ """ self._check_dataframe(df) hyperparams = self.get_hyperparams() @@ -1863,7 +2060,9 @@ def _transform_element( A_final = utils.get_shape_original(A, X.shape) X_imputed = M_final + A_final - df_imputed = pd.DataFrame(X_imputed, index=df.index, columns=df.columns) + df_imputed = pd.DataFrame( + X_imputed, index=df.index, columns=df.columns + ) df_imputed = df.where(~df.isna(), df_imputed) return df_imputed @@ -1871,33 +2070,42 @@ def _transform_element( def _more_tags(self): return { "_xfail_checks": { - "check_fit2d_1sample": "This test shouldn't be running at all!", - "check_fit2d_1feature": "This test shouldn't be running at all!", + "check_fit2d_1sample": ( + "This test shouldn't be running at all!" + ), + "check_fit2d_1feature": ( + "This test shouldn't be running at all!" + ), }, } class ImputerEM(_Imputer): - """ - This class implements an imputation method based on joint modelling and an inference using a - Expectation-Minimization algorithm. + """EM imputer. + + This class implements an imputation method based on joint modelling and + an inference using an Expectation-Maximization algorithm. Parameters ---------- groups: Tuple[str, ...] List of column names to group by, by default [] method : {'multinormal', 'VAR'}, default='multinormal' - Method defining the hypothesis made on the data distribution. Possible values: - - 'multinormal' : the data points a independent and uniformly distributed following a - multinormal distribution + Method defining the hypothesis made on the data distribution. + Possible values: + - 'multinormal' : the data points are independent and identically + distributed following a multinormal distribution - 'VAR' : the data is a time series modeled by a VAR(p) process columnwise : bool - If False, correlations between variables will be used, which is advised. - If True, each column is imputed independently. For the multinormal case each - value will be imputed by the mean up to a noise with fixed noise, for the VAR1 case the - imputation will be a noisy temporal interpolation. + If False, correlations between variables will be used, + which is advised. + If True, each column is imputed independently. For the multinormal case + each value will be imputed by the mean up to a fixed noise, + for the VAR1 case the imputation will be a noisy temporal + interpolation. random_state : Union[None, int, np.random.RandomState], optional Controls the randomness of the fit_transform, by default None + """ def __init__( @@ -1954,6 +2162,7 @@ def get_model(self, **hyperparams) -> em_sampler.EM: ------- em_sampler.EM EM model to be used in the fit and transform methods. + """ if self.model == "multinormal": hyperparams.pop("p") @@ -1980,9 +2189,10 @@ def get_model(self, **hyperparams) -> em_sampler.EM: def _fit_element( self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0 ) -> em_sampler.EM: - """ - Fits the imputer on `df`, at the group and/or column level depending onself.groups and - self.columnwise. + """Fit the imputer on `df`. + + It does it at the group and/or column level depending on self.groups + and self.columnwise. Parameters ---------- @@ -2002,6 +2212,7 @@ def _fit_element( ------ NotDataFrame Input has to be a pandas.DataFrame.
+ """ self._check_dataframe(df) hyperparams = self.get_hyperparams() @@ -2012,9 +2223,10 @@ def _fit_element( def _transform_element( self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0 ) -> pd.DataFrame: - """ - Transforms the dataframe `df`, at the group and/or column level depending onself.groups and - self.columnwise. + """Transform the dataframe `df`. + + It does it at the group and/or column level depending onself.groups + and self.columnwise. Parameters ---------- @@ -2034,6 +2246,7 @@ def _transform_element( ------ NotDataFrame Input has to be a pandas.DataFrame. + """ self._check_dataframe(df) @@ -2044,6 +2257,8 @@ def _transform_element( X = df.values.astype(float) X_imputed = model.transform(X) - df_transformed = pd.DataFrame(X_imputed, columns=df.columns, index=df.index) + df_transformed = pd.DataFrame( + X_imputed, columns=df.columns, index=df.index + ) return df_transformed diff --git a/qolmat/imputations/imputers_pytorch.py b/qolmat/imputations/imputers_pytorch.py index 1cf7d5d3..aff2b32f 100644 --- a/qolmat/imputations/imputers_pytorch.py +++ b/qolmat/imputations/imputers_pytorch.py @@ -1,15 +1,20 @@ -import pandas as pd -import numpy as np +"""Script for pytroch imputers.""" + +from typing import Any, Callable, Dict, List, Optional, Tuple, Union -from typing import Any, Callable, List, Optional, Tuple, Union, Dict -from typing_extensions import Self +import numpy as np +import pandas as pd from numpy.typing import NDArray -from sklearn.preprocessing import StandardScaler from sklearn.base import BaseEstimator +from sklearn.preprocessing import StandardScaler -from qolmat.imputations.imputers import _Imputer, ImputerRegressor -from qolmat.utils.exceptions import EstimatorNotDefined, PyTorchExtraNotInstalled +# from typing_extensions import Self from qolmat.benchmark import metrics +from qolmat.imputations.imputers import ImputerRegressor, _Imputer +from qolmat.utils.exceptions import ( + EstimatorNotDefined, + PyTorchExtraNotInstalled, +) try: import torch @@ -20,8 +25,10 @@ class ImputerRegressorPyTorch(ImputerRegressor): - """ - This class inherits from the class ImputerRegressor and allows for PyTorch regressors. + """Imputer regressor based on PyTorch. + + This class inherits from the class ImputerRegressor + and allows for PyTorch regressors. Parameters ---------- @@ -32,8 +39,8 @@ class ImputerRegressorPyTorch(ImputerRegressor): handler_nan : str Can be `fit, `row` or `column`: - if `fit`, the estimator is assumed to be fitted on parcelar data, - - if `row` all non complete rows will be removed from the train dataset, and will not be - used for the inferance, + - if `row` all non complete rows will be removed from the train + dataset, and will not be used for the inference, - if `column`all non complete columns will be ignored. By default, `row` epochs: int @@ -42,6 +49,7 @@ class ImputerRegressorPyTorch(ImputerRegressor): Learning rate hen fitting the autoencoder, by default 0.001 loss_fn: Callable Loss used when fitting the autoencoder, by default nn.L1Loss() + """ def __init__( @@ -63,12 +71,15 @@ def __init__( self.loss_fn = loss_fn self.estimator = estimator - def _fit_estimator(self, estimator: nn.Sequential, X: pd.DataFrame, y: pd.DataFrame) -> Any: - """ - Fit the PyTorch estimator using the provided input and target data. + def _fit_estimator( + self, estimator: nn.Sequential, X: pd.DataFrame, y: pd.DataFrame + ) -> Any: + """Fit the PyTorch estimator using the provided input and target data. 
Parameters ---------- + estimator: torch.nn.Sequential + PyTorch estimator for imputing a column based on the others. X : pd.DataFrame The input data for training. y : pd.DataFrame @@ -78,36 +89,41 @@ def _fit_estimator(self, estimator: nn.Sequential, X: pd.DataFrame, y: pd.DataFr ------- Any Return fitted PyTorch estimator. + """ if not estimator: raise EstimatorNotDefined() optimizer = optim.Adam(estimator.parameters(), lr=self.learning_rate) loss_fn = self.loss_fn - if estimator is None: - assert EstimatorNotDefined() - else: - for epoch in range(self.epochs): - estimator.train() - optimizer.zero_grad() - - input_data = torch.Tensor(X.values) - target_data = torch.Tensor(y.values) - target_data = target_data.unsqueeze(1) - outputs = estimator(input_data) - loss = loss_fn(outputs, target_data) - - loss.backward() - optimizer.step() - if (epoch + 1) % 10 == 0: - print(f"Epoch [{epoch + 1}/{self.epochs}], Loss: {loss.item():.4f}") + + for epoch in range(self.epochs): + estimator.train() + optimizer.zero_grad() + + input_data = torch.Tensor(X.values) + target_data = torch.Tensor(y.values) + target_data = target_data.unsqueeze(1) + outputs = estimator(input_data) + loss = loss_fn(outputs, target_data) + + loss.backward() + optimizer.step() + if (epoch + 1) % 10 == 0: + print( + f"Epoch [{epoch + 1}/{self.epochs}], " + f"Loss: {loss.item():.4f}" + ) return estimator - def _predict_estimator(self, estimator: nn.Sequential, X: pd.DataFrame) -> pd.Series: - """ - Perform predictions using the trained PyTorch estimator. + def _predict_estimator( + self, estimator: nn.Sequential, X: pd.DataFrame + ) -> pd.Series: + """Perform predictions using the trained PyTorch estimator. Parameters ---------- + estimator: torch.nn.Sequential + PyTorch estimator for imputing a column based on the others. X : pd.DataFrame The input data for prediction. @@ -120,6 +136,7 @@ def _predict_estimator(self, estimator: nn.Sequential, X: pd.DataFrame) -> pd.Se ------ EstimatorNotDefined Raises an error if the attribute estimator is not defined. + """ if not estimator: raise EstimatorNotDefined() @@ -130,8 +147,7 @@ def _predict_estimator(self, estimator: nn.Sequential, X: pd.DataFrame) -> pd.Se class Autoencoder(nn.Module): - """ - Wrapper of a PyTorch autoencoder allowing to encode + """Wrapper of a PyTorch autoencoder allowing to encode. Parameters ---------- @@ -145,6 +161,7 @@ class Autoencoder(nn.Module): Learning rate for optimization, by default 0.001. loss_fn : Callable, optional Loss function for training, by default nn.L1Loss(). + """ def __init__( @@ -166,8 +183,7 @@ def __init__( self.scaler = StandardScaler() def forward(self, x: NDArray) -> nn.Sequential: - """ - Forward pass through the autoencoder. + """Forward pass through the autoencoder. Parameters ---------- @@ -178,14 +194,14 @@ def forward(self, x: NDArray) -> nn.Sequential: ------- pd.DataFrame Decoded data. + """ encode = self.encoder(x) decode = self.decoder(encode) return decode - def fit(self, X: NDArray, y: NDArray) -> Self: - """ - Fit the autoencoder to the data. + def fit(self, X: NDArray, y: NDArray) -> "Autoencoder": + """Fit the autoencoder to the data. 
Parameters ---------- @@ -198,6 +214,7 @@ def fit(self, X: NDArray, y: NDArray) -> Self: ------- Self Return Self + """ optimizer = optim.Adam(self.parameters(), lr=self.learning_rate) loss_fn = self.loss_fn @@ -214,14 +231,16 @@ def fit(self, X: NDArray, y: NDArray) -> Self: loss.backward() optimizer.step() if (epoch + 1) % 10 == 0: - print(f"Epoch [{epoch + 1}/{self.epochs}], Loss: {loss.item():.4f}") + print( + f"Epoch [{epoch + 1}/{self.epochs}], " + f"Loss: {loss.item():.4f}" + ) list_loss.append(loss.item()) self.loss.extend([list_loss]) return self def decode(self, Z: NDArray) -> NDArray: - """ - Decode encoded data. + """Decode encoded data. Parameters ---------- @@ -232,6 +251,7 @@ def decode(self, Z: NDArray) -> NDArray: ------- ndarray Decoded data. + """ Z_decoded = self.scaler.inverse_transform(Z) Z_decoded = self.decoder(torch.Tensor(Z_decoded)) @@ -239,8 +259,7 @@ def decode(self, Z: NDArray) -> NDArray: return Z_decoded def encode(self, X: NDArray) -> NDArray: - """ - Encode input data. + """Encode input data. Parameters ---------- @@ -251,6 +270,7 @@ def encode(self, X: NDArray) -> NDArray: ------- ndarray Encoded data. + """ X_encoded = self.encoder(torch.Tensor(X)) X_encoded = X_encoded.detach().numpy() @@ -275,6 +295,7 @@ class ImputerAutoencoder(_Imputer): Learning rate hen fitting the autoencoder, by default 0.001 loss_fn: Callable Loss used when fitting the autoencoder, by default nn.L1Loss() + """ def __init__( @@ -289,7 +310,12 @@ def __init__( learning_rate: float = 0.001, loss_fn: Callable = nn.L1Loss(), ) -> None: - super().__init__(groups=groups, columnwise=False, shrink=False, random_state=random_state) + super().__init__( + groups=groups, + columnwise=False, + shrink=False, + random_state=random_state, + ) self.loss_fn = loss_fn self.lamb = lamb self.max_iterations = max_iterations @@ -298,10 +324,13 @@ def __init__( self.encoder = encoder self.decoder = decoder - def _fit_element(self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0) -> Autoencoder: - """ - Fits the imputer on `df`, at the group and/or column level depending onself.groups and - self.columnwise. + def _fit_element( + self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0 + ) -> Autoencoder: + """Fit the imputer on `df`. + + It does that at the group and/or column level depending onself.groups + and self.columnwise. Parameters ---------- @@ -321,6 +350,7 @@ def _fit_element(self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0) ------ NotDataFrame Input has to be a pandas.DataFrame. + """ self._check_dataframe(df) autoencoder = Autoencoder( @@ -336,9 +366,10 @@ def _fit_element(self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0) def _transform_element( self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0 ) -> pd.DataFrame: - """ - Transforms the dataframe `df`, at the group and/or column level depending onself.groups and - self.columnwise. + """Transform the dataframe `df`. + + It does that at the group and/or column level depending onself.groups + and self.columnwise. Parameters ---------- @@ -358,6 +389,7 @@ def _transform_element( ------ NotDataFrame Input has to be a pandas.DataFrame. 
+ """ autoencoder = self._dict_fitting[col][ngroup] df_train = df.copy() @@ -378,7 +410,9 @@ def _transform_element( X_next = autoencoder.decode(Z_next) X[mask] = X_next[mask] df_imputed = pd.DataFrame( - scaler.inverse_transform(X), index=df_train.index, columns=df_train.columns + scaler.inverse_transform(X), + index=df_train.index, + columns=df_train.columns, ) return df_imputed @@ -389,8 +423,7 @@ def build_mlp( output_dim: int = 1, activation: Callable = nn.ReLU, ) -> nn.Sequential: - """ - Constructs a multi-layer perceptron (MLP) with a custom architecture. + """Construct a multi-layer perceptron (MLP) with a custom architecture. Parameters ---------- @@ -401,7 +434,8 @@ def build_mlp( output_dim : int, optional Dimension of the output layer, defaults to 1. activation : nn.Module, optional - Activation function to use between hidden layers, defaults to nn.ReLU(). + Activation function to use between hidden layers, + defaults to nn.ReLU(). Returns ------- @@ -415,7 +449,9 @@ def build_mlp( Examples -------- - >>> model = build_mlp(input_dim=10, list_num_neurons=[32, 64, 128], output_dim=1) + >>> model = build_mlp( + ... input_dim=10, list_num_neurons=[32, 64, 128], output_dim=1 + ... ) >>> print(model) Sequential( (0): Linear(in_features=10, out_features=32, bias=True) @@ -426,6 +462,7 @@ def build_mlp( (5): ReLU() (6): Linear(in_features=128, out_features=1, bias=True) ) + """ layers = [] for num_neurons in list_num_neurons: @@ -445,8 +482,7 @@ def build_autoencoder( output_dim: int = 1, activation: Callable = nn.ReLU, ) -> Tuple[nn.Sequential, nn.Sequential]: - """ - Constructs an autoencoder with a custom architecture. + """Construct an autoencoder with a custom architecture. Parameters ---------- @@ -459,7 +495,8 @@ def build_autoencoder( output_dim : int, optional Dimension of the output layer, defaults to 1. activation : nn.Module, optional - Activation function to use between hidden layers, defaults to nn.ReLU(). + Activation function to use between hidden layers, + defaults to nn.ReLU(). Returns ------- @@ -473,10 +510,12 @@ def build_autoencoder( Examples -------- - >>> encoder, decoder = build_autoencoder(input_dim=10, - ... latent_dim=4, - ... list_num_neurons=[32, 64, 128], - ... output_dim=252) + >>> encoder, decoder = build_autoencoder( + ... input_dim=10, + ... latent_dim=4, + ... list_num_neurons=[32, 64, 128], + ... output_dim=252, + ... ) >>> print(encoder) Sequential( (0): Linear(in_features=10, out_features=128, bias=True) @@ -497,8 +536,8 @@ def build_autoencoder( (5): ReLU() (6): Linear(in_features=128, out_features=252, bias=True) ) - """ + """ encoder = build_mlp( input_dim=input_dim, output_dim=latent_dim, @@ -515,7 +554,9 @@ def build_autoencoder( class ImputerDiffusion(_Imputer): - """This class inherits from the class _Imputer. + """Imputer based on diffusion models. + + This class inherits from the class _Imputer. It is a wrapper for imputers based on diffusion models. """ @@ -536,8 +577,7 @@ def __init__( index_datetime: str = "", freq_str: str = "1D", ): - """This class inherits from the class _Imputer. - It is a wrapper for imputers based on diffusion models. + """Init ImputerDiffusion. 
Parameters ---------- @@ -555,8 +595,8 @@ def __init__( print_valid : bool, optional Print model performance for after several epochs, by default False metrics_valid : Tuple[Callable, ...], optional - Set of validation metrics, by default ( metrics.mean_absolute_error, - metrics.dist_wasserstein ) + Set of validation metrics, by default (metrics.mean_absolute_error, + metrics.dist_wasserstein) round : int, optional Number of decimal places to round to, for better displaying model performance, by default 10 @@ -564,10 +604,12 @@ def __init__( Name of columns that need to be imputed, by default () index_datetime : str Name of datetime-like index. - It is for processing time-series data, used in diffusion models e.g., TsDDPM. + It is for processing time-series data, used in diffusion models + e.g., TsDDPM. freq_str : str Frequency string of DateOffset of Pandas. - It is for processing time-series data, used in diffusion models e.g., TsDDPM. + It is for processing time-series data, used in diffusion models + e.g., TsDDPM. Examples -------- @@ -575,10 +617,20 @@ def __init__( >>> from qolmat.imputations.imputers_pytorch import ImputerDiffusion >>> from qolmat.imputations.diffusions.ddpms import TabDDPM >>> - >>> X = np.array([[1, 1, 1, 1], [np.nan, np.nan, 3, 2], [1, 2, 2, 1], [2, 2, 2, 2]]) - >>> imputer = ImputerDiffusion(model=TabDDPM(random_state=11), epochs=50, batch_size=1) + >>> X = np.array( + ... [ + ... [1, 1, 1, 1], + ... [np.nan, np.nan, 3, 2], + ... [1, 2, 2, 1], + ... [2, 2, 2, 2], + ... ] + ... ) + >>> imputer = ImputerDiffusion( + ... model=TabDDPM(random_state=11), epochs=50, batch_size=1 + ... ) >>> >>> df_imputed = imputer.fit_transform(X) + """ super().__init__(groups=groups, columnwise=False) self.model = model @@ -603,10 +655,13 @@ def _more_tags(self): }, } - def _fit_element(self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0): - """ - Fits the imputer on `df`, at the group and/or column level depending onself.groups and - self.columnwise. + def _fit_element( + self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0 + ): + """Fit the imputer on `df`. + + It does it at the group and/or column level depending onself.groups + and self.columnwise. Parameters ---------- @@ -626,6 +681,7 @@ def _fit_element(self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0): ------ NotDataFrame Input has to be a pandas.DataFrame. + """ self._check_dataframe(df) hp = self._get_params_fit() @@ -634,8 +690,9 @@ def _fit_element(self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0): def _transform_element( self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0 ) -> pd.DataFrame: - """ - Transforms the dataframe `df`, at the group and/or column level depending on self.groups + """Transform the dataframe `df`. + + It does it at the group and/or column level depending on self.groups and self.columnwise. Parameters @@ -656,6 +713,7 @@ def _transform_element( ------ NotDataFrame Input has to be a pandas.DataFrame. + """ df_imputed = self.model.predict(df) return df_imputed @@ -682,9 +740,25 @@ def _get_params_fit(self) -> Dict: return hyperparams def get_summary_training(self) -> Dict: + """Get the summary of the training. + + Returns + ------- + Dict + Summary of the training + + """ return self.model.summary def get_summary_architecture(self) -> Dict: + """Get the summary of the architecture. 
+ + Returns + ------- + Dict + Summary of the architecture + + """ return { "number_parameters": self.model.num_params, "epsilon_model": self.model._eps_model, diff --git a/qolmat/imputations/mimca/estim_ncpMCA.py b/qolmat/imputations/mimca/estim_ncpMCA.py new file mode 100644 index 00000000..18cf78e7 --- /dev/null +++ b/qolmat/imputations/mimca/estim_ncpMCA.py @@ -0,0 +1,412 @@ +"""Estimate the optimal number of dimensions for MCA using CV or LOO.""" + + +import numpy as np +import pandas as pd +from tqdm import tqdm + + +def moy_p(V, weights): + """Compute the weighted mean of a vector, ignoring NaNs. + + Parameters + ---------- + V : array-like + Input vector with possible NaN values. + weights : array-like + Weights corresponding to each element in V. + + Returns + ------- + float + Weighted mean of non-NaN elements. + + """ + mask = ~np.isnan(V) + total_weight = np.sum(weights[mask]) + if total_weight == 0: + return 0.0 + return np.sum(V[mask] * weights[mask]) / total_weight + +def tab_disjonctif_NA(df): + """Create a disjunctive table for categorical variables, preserving NaNs. + + Parameters + ---------- + df : DataFrame + Input DataFrame with categorical and numeric variables. + + Returns + ------- + DataFrame + Disjunctive table with one-hot encoding, preserving NaNs. + + """ + df_encoded_list = [] + for col in df.columns: + if df[col].dtype.name == "category" or df[col].dtype == object: + df[col] = df[col].astype("category") + encoded = pd.get_dummies( + df[col], + prefix=col, + prefix_sep="_", + dummy_na=False, + dtype=float, + ) + categories = df[col].cat.categories.tolist() + col_names = [f"{col}_{cat}" for cat in categories] + encoded = encoded.reindex(columns=col_names, fill_value=0.0) + encoded[df[col].isna()] = np.nan + df_encoded_list.append(encoded) + else: + df_encoded_list.append(df[[col]]) + df_encoded = pd.concat(df_encoded_list, axis=1) + return df_encoded + +def prodna(data, noNA, rng): + """Introduce random missing values into a DataFrame. + + Parameters + ---------- + data : DataFrame + Input data. + noNA : float + Proportion of missing values to introduce. + rng : numpy.random.Generator + Random number generator. + + Returns + ------- + DataFrame + DataFrame with introduced missing values. + + """ + data = data.copy() + n_rows, n_cols = data.shape + total_values = n_rows * n_cols + n_missing = int(np.floor(total_values * noNA)) + missing_indices = rng.choice(total_values, n_missing, replace=False) + row_indices = missing_indices // n_cols + col_indices = missing_indices % n_cols + for i in range(n_missing): + row = row_indices[i] + col = col_indices[i] + data.iloc[row, col] = np.nan + return data + +def find_category(df_original, tab_disj): + """Reconstruct original categorical variables from disjunctive table. + + Parameters + ---------- + df_original : DataFrame + Original DataFrame with categorical variables. + tab_disj : DataFrame + Disjunctive table after imputation. + + Returns + ------- + DataFrame + Reconstructed DataFrame with imputed categorical variables. 
+ + """ + df_reconstructed = df_original.copy() + start_idx = 0 + for col in df_original.columns: + if df_original[col].dtype.name == "category" or df_original[col].dtype == object: # noqa: E501 + categories = df_original[col].cat.categories.tolist() + num_categories = len(categories) + sub_tab = tab_disj.iloc[:, start_idx : start_idx + num_categories] + max_indices = sub_tab.values.argmax(axis=1) + df_reconstructed[col] = [categories[idx] for idx in max_indices] + df_reconstructed[col].replace("__MISSING__", np.nan, inplace=True) + start_idx += num_categories + else: + start_idx += 1 + return df_reconstructed + +def imputeMCA( + don, + ncp=2, + method="Regularized", + row_w=None, + coeff_ridge=1, + threshold=1e-6, + seed=None, + maxiter=1000, +): + """Impute missing values in a dataset using MCA. + + Parameters + ---------- + don : DataFrame + Input dataset with missing values. + ncp : int, optional + Number of principal components for MCA. Default is 2. + method : str, optional + Imputation method ('Regularized' or 'EM'). Default is 'Regularized'. + row_w : array-like, optional + Row weights. If None, uniform weights are applied. Default is None. + coeff_ridge : float, optional + Regularization coefficient for 'Regularized' MCA. Default is 1. + threshold : float, optional + Convergence threshold. Default is 1e-6. + seed : int, optional + Random seed for reproducibility. Default is None. + maxiter : int, optional + Maximum number of iterations for the imputation process. + + Returns + ------- + dict + Dictionary containing: + - "tab_disj": Disjunctive coded table after imputation. + - "completeObs": Complete dataset with missing values imputed. + + """ + don = pd.DataFrame(don) + don = don.copy() + for col in don.columns: + if not pd.api.types.is_numeric_dtype(don[col]) or don[col].dtype == "bool": # noqa: E501 + don[col] = don[col].astype("category") + new_categories = don[col].cat.categories.astype(str) + don[col] = don[col].cat.rename_categories(new_categories) # noqa: E501 + else: + unique_values = don[col].dropna().unique() + if set(unique_values).issubset({0, 1}): + don[col] = don[col].astype("category") + new_categories = don[col].cat.categories.astype(str) + don[col] = don[col].cat.rename_categories(new_categories) # noqa: E501 + if row_w is None: + row_w = np.ones(len(don)) / len(don) + else: + row_w = np.array(row_w, dtype=float) + row_w /= row_w.sum() + tab_disj_NA = tab_disjonctif_NA(don) + if ncp == 0: + tab_disj_comp_mean = tab_disj_NA.apply(lambda col: moy_p(col.values, row_w)) # noqa: E501 + tab_disj_comp = tab_disj_NA.fillna(tab_disj_comp_mean) + completeObs = find_category(don, tab_disj_comp) + return {"tab_disj": tab_disj_comp, "completeObs": completeObs} + tab_disj_comp = tab_disj_NA.copy() + hidden = tab_disj_NA.isna() + tab_disj_comp.fillna(tab_disj_comp.mean(), inplace=True) + tab_disj_rec_old = tab_disj_comp.copy() + nbiter = 0 + continue_flag = True + while continue_flag: + nbiter += 1 + M = tab_disj_comp.apply(lambda col: moy_p(col.values, row_w)) / don.shape[1] # noqa: E501 + M = M.replace({0: np.finfo(float).eps}) + M = M.fillna(np.finfo(float).eps) + tab_disj_comp_mean = tab_disj_comp.apply(lambda col: moy_p(col.values, row_w)) # noqa: E501 + tab_disj_comp_mean = tab_disj_comp_mean.replace({0: np.finfo(float).eps}) # noqa: E501 + Z = tab_disj_comp.div(tab_disj_comp_mean, axis=1) + Z_mean = Z.apply(lambda col: moy_p(col.values, row_w)) + Z = Z.subtract(Z_mean, axis=1) + Zscale = Z.multiply(np.sqrt(M), axis=1) + U, s, Vt = np.linalg.svd(Zscale.values, 
full_matrices=False) + V = Vt.T + U = U[:, :ncp] + V = V[:, :ncp] + s = s[:ncp] + if method.lower() == "em": + moyeig = 0 + else: + if len(s) > ncp: + moyeig = np.mean(s[ncp:] ** 2) + moyeig = min(moyeig * coeff_ridge, s[ncp - 1] ** 2) + else: + moyeig = 0 + eig_shrunk = (s ** 2 - moyeig) / s + eig_shrunk = np.maximum(eig_shrunk, 0) + rec = U @ np.diag(eig_shrunk) @ V.T + tab_disj_rec = pd.DataFrame( + rec, columns=tab_disj_comp.columns, index=tab_disj_comp.index + ) + tab_disj_rec = tab_disj_rec.div(np.sqrt(M), axis=1) + 1 + tab_disj_rec = tab_disj_rec.multiply(tab_disj_comp_mean, axis=1) + diff = tab_disj_rec - tab_disj_rec_old + diff_values = diff.values + hidden_values = hidden.values + diff_values[~hidden_values] = 0 + relch = np.sum((diff_values**2) * row_w[:, None]) + tab_disj_rec_old = tab_disj_rec.copy() + tab_disj_comp.values[hidden_values] = tab_disj_rec.values[hidden_values] # noqa: E501 + continue_flag = (relch > threshold) and (nbiter < maxiter) + completeObs = find_category(don, tab_disj_comp) + return {"tab_disj": tab_disj_comp, "completeObs": completeObs} + +def estim_ncpMCA( + don, + ncp_min=0, + ncp_max=5, + method="Regularized", + method_cv="Kfold", + nbsim=100, + pNA=0.05, + ind_sup=None, + quanti_sup=None, + quali_sup=None, + threshold=1e-4, + verbose=True, + seed=None +): + """Estimate the optimal number of dimensions for MCA using CV. + + Parameters + ---------- + don : DataFrame + Input data. + ncp_min : int, optional + Minimum number of components to test. Default is 0. + ncp_max : int, optional + Maximum number of components to test. Default is 5. + method : str, optional + Imputation method ('Regularized' or 'EM'). Default is 'Regularized'. + method_cv : str, optional + Cross-validation method ('Kfold' or 'loo'). Default is 'Kfold'. + nbsim : int, optional + Number of simulations for cross-validation. Default is 100. + pNA : float, optional + Proportion of missing values to simulate. Default is 0.05. + ind_sup : array-like, optional + Indices of supplementary individuals to exclude from the analysis. + quanti_sup : array-like, optional + Indices of supplementary quantitative variables to exclude. + quali_sup : array-like, optional + Indices of supplementary qualitative variables to exclude. + threshold : float, optional + Convergence threshold. Default is 1e-4. + verbose : bool, optional + Whether to print progress. Default is True. + seed : int, optional + Random seed for reproducibility. Default is None. + + Returns + ------- + dict + Dictionary containing: + - 'ncp': Optimal number of dimensions. + - 'criterion': List of criterion values for each dimension.
+ + """ + don = don.copy() + if ind_sup is not None: + don = don.drop(index=ind_sup) + if quanti_sup is not None or quali_sup is not None: + cols_to_drop = [] + if quanti_sup is not None: + cols_to_drop.extend(don.columns[quanti_sup]) + if quali_sup is not None: + cols_to_drop.extend(don.columns[quali_sup]) + don = don.drop(columns=cols_to_drop) + method = method.lower() + method_cv = method_cv.lower() + for col in don.columns: + if not pd.api.types.is_categorical_dtype(don[col]): + don[col] = don[col].astype("category") + vrai_tab = tab_disjonctif_NA(don) + criterion = [] + if seed is not None: + rng = np.random.default_rng(seed) + else: + rng = np.random.default_rng() + if method_cv == "kfold": + res = np.full((ncp_max - ncp_min + 1, nbsim), np.nan) + if verbose: + sim_range = tqdm(range(nbsim), desc="Simulations") + else: + sim_range = range(nbsim) + for sim in sim_range: + compteur = 0 + max_attempts = 50 + while compteur < max_attempts: + donNA = prodna(don, pNA, rng) + categories_complete = all( + donNA[col].nunique(dropna=True) == don[col].nunique(dropna=True) # noqa: E501 + for col in don.columns + ) + if categories_complete: + break + compteur += 1 + else: + raise ValueError( + "It is too difficult to suppress some cells.\n" + "Maybe several categories are taken by only one individual" + ) + for nbaxes in range(ncp_min, ncp_max + 1): + imputed = imputeMCA( + donNA, + ncp=nbaxes, + method=method, + threshold=threshold, + seed=seed + ) + tab_disj_comp = imputed["tab_disj"] + numerator = ((tab_disj_comp - vrai_tab) ** 2).sum().sum() + denominator = tab_disjonctif_NA(donNA).isna().sum().sum() - vrai_tab.isna().sum().sum() # noqa: E501 + if denominator == 0: + res[nbaxes - ncp_min, sim] = np.nan + else: + res[nbaxes - ncp_min, sim] = numerator / denominator + crit = np.nanmean(res, axis=1) + if np.all(np.isnan(crit)): + raise ValueError("All simulations resulted in NaN error") + ncp = int(np.nanargmin(crit) + ncp_min) + criterion = crit.tolist() + return {"ncp": ncp, "criterion": criterion} + + + elif method_cv == "loo": + criterion = [] + if verbose: + loop = tqdm(total=(ncp_max - ncp_min + 1) * don.shape[0], desc="LOO CV") # noqa: E501 + for nbaxes in range(ncp_min, ncp_max + 1): + errors = [] + for i in range(don.shape[0]): + donNA = don.copy() + for col in don.columns: + if not pd.isna(donNA.at[donNA.index[i], col]): + # Temporarily set the value to NaN + donNA.at[donNA.index[i], col] = np.nan + # Check if all categories are still represented + categories_complete = all( + donNA[col].nunique(dropna=True) == don[col].nunique(dropna=True) # noqa: E501 + for col in don.columns + ) + if not categories_complete: + # Skip this iteration if removing the value causes an issue + donNA.at[donNA.index[i], col] = don.at[don.index[i], col] # noqa: E501 + continue + # Impute missing values using MCA + imputed = imputeMCA( + donNA, + ncp=nbaxes, + method=method, + threshold=threshold, + seed=seed + ) + tab_disj_comp = imputed["tab_disj"] + vrai_tab = tab_disjonctif_NA(don) + numerator = ((tab_disj_comp - vrai_tab) ** 2).sum().sum() + denominator = 1 # Since we imputed one value + error = numerator / denominator + errors.append(error) + # Restore the original value + donNA.at[donNA.index[i], col] = don.at[don.index[i], col] + if verbose: + loop.update(1) + mean_error = np.mean(errors) + criterion.append(mean_error) + if verbose: + loop.close() + if np.all(np.isnan(criterion)): + raise ValueError("All computations resulted in NaN errors") + ncp = int(np.nanargmin(criterion) + ncp_min) + return 
{"ncp": ncp, "criterion": criterion} + else: + raise ValueError("method_cv must be 'kfold' or 'loo'") + + diff --git a/qolmat/imputations/mimca/imputer_mca.py b/qolmat/imputations/mimca/imputer_mca.py new file mode 100644 index 00000000..fb371462 --- /dev/null +++ b/qolmat/imputations/mimca/imputer_mca.py @@ -0,0 +1,183 @@ +import numpy as np # noqa: D100 +import pandas as pd + +from qolmat.utils.algebra import svdtriplet +from qolmat.utils.utils import ( + find_category, + moy_p, + tab_disjonctif_NA, + tab_disjonctif_prop, +) + + +def imputeMCA( + don, + ncp=2, + method="Regularized", + row_w=None, + coeff_ridge=1, + threshold=1e-6, + seed=None, + maxiter=1000, +): + """Impute missing values in a dataset using (MCA). + + Parameters + ---------- + don : DataFrame + Input dataset with missing values. + ncp : int, optional + Number of principal components for MCA. Default is 2. + method : str, optional + Imputation method ('Regularized' or 'EM'). Default is 'Regularized'. + row_w : array-like, optional + Row weights. If None, uniform weights are applied. Default is None. + coeff_ridge : float, optional + Regularization coefficient for 'Regularized' MCA. Default is 1. + threshold : float, optional + Convergence threshold. Default is 1e-6. + seed : int, optional + Random seed for reproducibility. Default is None. + maxiter : int, optional + Maximum number of iterations for the imputation process. + + Returns + ------- + dict + Dictionary containing: + - "tab_disj": Disjunctive coded table after imputation. + - "completeObs": Complete dataset with missing values imputed. + + """ + # Ensure the data is a DataFrame + don = pd.DataFrame(don) + don = don.copy() + + for col in don.columns: + if ( + not pd.api.types.is_numeric_dtype(don[col]) + or don[col].dtype == "bool" + ): # noqa: E501 + don[col] = don[col].astype("category") + # Convert categories to strings and rename them + new_categories = don[col].cat.categories.astype(str) + don[col] = don[col].cat.rename_categories(new_categories) + else: + unique_values = don[col].dropna().unique() + if set(unique_values).issubset({0, 1}): + don[col] = don[col].astype("category") + new_categories = don[col].cat.categories.astype(str) + don[col] = don[col].cat.rename_categories(new_categories) # noqa: E501 + + print("Data types after conversion:") + print(don.dtypes) + + # Handle row weights + if row_w is None: + row_w = np.ones(len(don)) / len(don) + else: + row_w = np.array(row_w, dtype=float) + row_w /= row_w.sum() + + # Initial imputation and creation of disjunctive tables + tab_disj_NA = tab_disjonctif_NA(don) + tab_disj_comp = tab_disjonctif_prop(don, seed=seed) + hidden = tab_disj_NA.isna() + tab_disj_rec_old = tab_disj_comp.copy() + + # Initialize iteration parameters + nbiter = 0 + continue_flag = True + + while continue_flag: + nbiter += 1 + + # Step 1: Compute weighted means M + M = ( + tab_disj_comp.apply(lambda col: moy_p(col.values, row_w)) + / don.shape[1] + ) # noqa: E501 + M = M.replace({0: np.finfo(float).eps}) + M = M.fillna(np.finfo(float).eps) + + if (M < 0).any(): + raise ValueError( + "Negative values encountered in M. Check data preprocessing." 
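+ # M estimates the weighted column proportions of the disjunctive table,
+ # divided by the number of variables; the scaling step below multiplies
+ # by np.sqrt(M), so a negative entry would produce NaNs, hence this check.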
+ ) # noqa: E501 + + print(f"Iteration {nbiter}:") + print("Weighted means (M):") + print(M.head()) + + # Step 2: Center and scale the data + tab_disj_comp_mean = tab_disj_comp.apply( + lambda col: moy_p(col.values, row_w) + ) # noqa: E501 + tab_disj_comp_mean = tab_disj_comp_mean.replace( + {0: np.finfo(float).eps} + ) # noqa: E501 + Z = tab_disj_comp.div(tab_disj_comp_mean, axis=1) + Z_mean = Z.apply(lambda col: moy_p(col.values, row_w)) + Z = Z.subtract(Z_mean, axis=1) + Zscale = Z.multiply(np.sqrt(M), axis=1) + + print("Centered and scaled data (Zscale):") + print(Zscale.head()) + + # Step 3: Perform weighted SVD + s, U, V = svdtriplet(Zscale.values, row_w=row_w, ncp=ncp) + print("Singular values (s):") + print(s) + print("Left singular vectors (U):") + print(U) + print("Right singular vectors (V):") + print(V) + + # Step 4: Regularization (Shrinking Eigenvalues) + if method.lower() == "em": + moyeig = 0 + else: + # Calculate moyeig based on R's imputeMCA logic + if len(s) > ncp: + moyeig = np.mean(s[ncp:] ** 2) + moyeig = min(moyeig * coeff_ridge, s[ncp] ** 2) + else: + moyeig = 0 + # Set to 0 when there are no additional singular values + eig_shrunk = (s[:ncp] ** 2 - moyeig) / s[:ncp] + eig_shrunk = np.maximum(eig_shrunk, 0) # Ensure non-negative + print("Shrunk eigenvalues (eig_shrunk):") + print(eig_shrunk) + + # Step 5: Reconstruct the data + rec = U @ np.diag(eig_shrunk) @ V.T + tab_disj_rec = pd.DataFrame( + rec, columns=tab_disj_comp.columns, index=tab_disj_comp.index + ) # noqa: E501 + tab_disj_rec = tab_disj_rec.div(np.sqrt(M), axis=1) + 1 + tab_disj_rec = tab_disj_rec.multiply(tab_disj_comp_mean, axis=1) + print("Reconstructed disjunctive table (tab_disj_rec):") + print(tab_disj_rec.head()) + + # Step 6: Compute difference and relative change + diff = tab_disj_rec - tab_disj_rec_old + diff_values = diff.values + hidden_values = hidden.values + # Zero out observed positions + diff_values[~hidden_values] = 0 + relch = np.sum((diff_values**2) * row_w[:, None]) + print(f"Relative Change: {relch}\n") + + # Step 7: Update for next iteration + tab_disj_rec_old = tab_disj_rec.copy() + tab_disj_comp.values[hidden_values] = tab_disj_rec.values[ + hidden_values + ] # noqa: E501 + + # Step 8: Check convergence + continue_flag = (relch > threshold) and (nbiter < maxiter) + + # Step 9: Reconstruct categorical data + completeObs = find_category(don, tab_disj_comp) + + return {"tab_disj": tab_disj_comp, "completeObs": completeObs} diff --git a/qolmat/imputations/mimca/mimca.py b/qolmat/imputations/mimca/mimca.py new file mode 100644 index 00000000..c36c567b --- /dev/null +++ b/qolmat/imputations/mimca/mimca.py @@ -0,0 +1,665 @@ +import numpy as np +import pandas as pd +from tqdm import tqdm + + +def moy_p(V, weights): + """Compute the weighted mean of a vector, ignoring NaNs. + + Parameters + ---------- + V : array-like + Input vector with possible NaN values. + weights : array-like + Weights corresponding to each element in V. + + Returns + ------- + float + Weighted mean of non-NaN elements. + + """ + mask = ~np.isnan(V) + total_weight = np.sum(weights[mask]) + if total_weight == 0: + return 0.0 + return np.sum(V[mask] * weights[mask]) / total_weight + + +def tab_disjonctif_NA(df) -> pd.DataFrame: + """Create a disjunctive table for categorical variables, preserving NaNs. + + Parameters + ---------- + df : DataFrame + Input DataFrame with categorical and numeric variables. + + Returns + ------- + DataFrame + Disjunctive table with one-hot encoding, preserving NaNs. 
+ + """ + df_encoded_list = [] + for col in df.columns: + if df[col].dtype.name == "category" or df[col].dtype == object: + df[col] = df[col].astype("category") + encoded = pd.get_dummies( + df[col], + prefix=col, + prefix_sep="_", + dummy_na=False, + dtype=float, + ) + categories = df[col].cat.categories.tolist() + col_names = [f"{col}_{cat}" for cat in categories] + encoded = encoded.reindex(columns=col_names, fill_value=0.0) + encoded[df[col].isna()] = np.nan + df_encoded_list.append(encoded) + else: + df_encoded_list.append(df[[col]]) + df_encoded = pd.concat(df_encoded_list, axis=1) + return df_encoded + + +def prodna(data, noNA, rng): + """Introduce random missing values into a DataFrame. + + Parameters + ---------- + data : DataFrame + Input data. + noNA : float + Proportion of missing values to introduce. + rng : numpy.random.Generator + Random number generator. + + Returns + ------- + DataFrame + DataFrame with introduced missing values. + + """ + data = data.copy() + n_rows, n_cols = data.shape + total_values = n_rows * n_cols + n_missing = int(np.floor(total_values * noNA)) + missing_indices = rng.choice(total_values, n_missing, replace=False) + row_indices = missing_indices // n_cols + col_indices = missing_indices % n_cols + for i in range(n_missing): + row = row_indices[i] + col = col_indices[i] + data.iloc[row, col] = np.nan + return data + + +def find_category(df_original, tab_disj): + """Reconstruct original categorical variables from disjunctive table. + + Parameters + ---------- + df_original : DataFrame + Original DataFrame with categorical variables. + tab_disj : DataFrame + Disjunctive table after imputation. + + Returns + ------- + DataFrame + Reconstructed DataFrame with imputed categorical variables. + + """ + df_reconstructed = df_original.copy() + start_idx = 0 + for col in df_original.columns: + if ( + df_original[col].dtype.name == "category" + or df_original[col].dtype == object + ): # noqa: E501 + categories = df_original[col].cat.categories.tolist() + num_categories = len(categories) + sub_tab = tab_disj.iloc[:, start_idx : start_idx + num_categories] + max_indices = sub_tab.values.argmax(axis=1) + df_reconstructed[col] = [categories[idx] for idx in max_indices] + df_reconstructed[col] = df_reconstructed[col].astype("category") + start_idx += num_categories + else: + start_idx += 1 + return df_reconstructed + + +def imputeMCA( + don, + ncp=2, + method="Regularized", + row_w=None, + coeff_ridge=1, + threshold=1e-6, + seed=None, + maxiter=1000, +): + """Impute missing values in a dataset using (MCA). + + Parameters + ---------- + don : DataFrame + Input dataset with missing values. + ncp : int, optional + Number of principal components for MCA. Default is 2. + method : str, optional + Imputation method ('Regularized' or 'EM'). Default is 'Regularized'. + row_w : array-like, optional + Row weights. If None, uniform weights are applied. Default is None. + coeff_ridge : float, optional + Regularization coefficient for 'Regularized' MCA. Default is 1. + threshold : float, optional + Convergence threshold. Default is 1e-6. + seed : int, optional + Random seed for reproducibility. Default is None. + maxiter : int, optional + Maximum number of iterations for the imputation process. + + Returns + ------- + dict + Dictionary containing: + - "tab_disj": Disjunctive coded table after imputation. + - "completeObs": Complete dataset with missing values imputed. 
+ + """ + don = pd.DataFrame(don) + don = don.copy() + for col in don.columns: + if ( + not pd.api.types.is_numeric_dtype(don[col]) + or don[col].dtype == "bool" + ): # noqa: E501 + don[col] = don[col].astype("category") + new_categories = don[col].cat.categories.astype(str) + don[col] = don[col].cat.rename_categories(new_categories) # noqa: E501 + else: + unique_values = don[col].dropna().unique() + if set(unique_values).issubset({0, 1}): + don[col] = don[col].astype("category") + new_categories = don[col].cat.categories.astype(str) + don[col] = don[col].cat.rename_categories(new_categories) # noqa: E501 + if row_w is None: + row_w = np.ones(len(don)) / len(don) + else: + row_w = np.array(row_w, dtype=float) + row_w /= row_w.sum() + tab_disj_NA = tab_disjonctif_NA(don) + if ncp == 0: + tab_disj_comp_mean = tab_disj_NA.apply( + lambda col: moy_p(col.values, row_w) + ) # noqa: E501 + tab_disj_comp = tab_disj_NA.fillna(tab_disj_comp_mean) + completeObs = find_category(don, tab_disj_comp) + return {"tab_disj": tab_disj_comp, "completeObs": completeObs} + tab_disj_comp = tab_disj_NA.copy() + hidden = tab_disj_NA.isna() + tab_disj_comp.fillna(tab_disj_comp.mean(), inplace=True) + tab_disj_rec_old = tab_disj_comp.copy() + nbiter = 0 + continue_flag = True + while continue_flag: + nbiter += 1 + M = ( + tab_disj_comp.apply(lambda col: moy_p(col.values, row_w)) + / don.shape[1] + ) # noqa: E501 + M = M.replace({0: np.finfo(float).eps}) + M = M.fillna(np.finfo(float).eps) + tab_disj_comp_mean = tab_disj_comp.apply( + lambda col: moy_p(col.values, row_w) + ) # noqa: E501 + tab_disj_comp_mean = tab_disj_comp_mean.replace( + {0: np.finfo(float).eps} + ) # noqa: E501 + Z = tab_disj_comp.div(tab_disj_comp_mean, axis=1) + Z_mean = Z.apply(lambda col: moy_p(col.values, row_w)) + Z = Z.subtract(Z_mean, axis=1) + Zscale = Z.multiply(np.sqrt(M), axis=1) + U, s, Vt = np.linalg.svd(Zscale.values, full_matrices=False) + V = Vt.T + U = U[:, :ncp] + V = V[:, :ncp] + s = s[:ncp] + if method.lower() == "em": + moyeig = 0 + else: + if len(s) > ncp: + moyeig = np.mean(s[ncp:] ** 2) + moyeig = min(moyeig * coeff_ridge, s[ncp - 1] ** 2) + else: + moyeig = 0 + eig_shrunk = (s**2 - moyeig) / s + eig_shrunk = np.maximum(eig_shrunk, 0) + rec = U @ np.diag(eig_shrunk) @ V.T + tab_disj_rec = pd.DataFrame( + rec, columns=tab_disj_comp.columns, index=tab_disj_comp.index + ) + tab_disj_rec = tab_disj_rec.div(np.sqrt(M), axis=1) + 1 + tab_disj_rec = tab_disj_rec.multiply(tab_disj_comp_mean, axis=1) + diff = tab_disj_rec - tab_disj_rec_old + diff_values = diff.values + hidden_values = hidden.values + diff_values[~hidden_values] = 0 + relch = np.sum((diff_values**2) * row_w[:, None]) + tab_disj_rec_old = tab_disj_rec.copy() + tab_disj_comp.values[hidden_values] = tab_disj_rec.values[ + hidden_values + ] # noqa: E501 + continue_flag = (relch > threshold) and (nbiter < maxiter) + completeObs = find_category(don, tab_disj_comp) + return {"tab_disj": tab_disj_comp, "completeObs": completeObs} + + +def estim_ncpMCA( + don, + ncp_min=0, + ncp_max=5, + method="Regularized", + method_cv="Kfold", + nbsim=100, + pNA=0.05, + ind_sup=None, + quanti_sup=None, + quali_sup=None, + threshold=1e-4, + verbose=True, + seed=None, +): + """Estimate the optimal number of dimensions for MCA using CV. + + Parameters + ---------- + don : DataFrame + Input data. + ncp_min : int, optional + Minimum number of components to test. Default is 0. + ncp_max : int, optional + Maximum number of components to test. Default is 5. 
+ method : str, optional + Imputation method ('Regularized' or 'EM'). Default is 'Regularized'. + method_cv : str, optional + Cross-validation method ('Kfold' or 'loo'). Default is 'Kfold'. + nbsim : int, optional + Number of simulations for cross-validation. Default is 100. + pNA : float, optional + Proportion of missing values to simulate. Default is 0.05. + ind_sup : array-like, optional + Indices of supplementary individuals to exclude from the analysis. + quanti_sup : array-like, optional + Indices of supplementary quantitative variables to exclude. + quali_sup : array-like, optional + Indices of supplementary qualitative variables to exclude. + threshold : float, optional + Convergence threshold. Default is 1e-4. + verbose : bool, optional + Whether to print progress. Default is True. + seed : int, optional + Random seed for reproducibility. Default is None. + + Returns + ------- + dict + Dictionary containing: + - 'ncp': Optimal number of dimensions. + - 'criterion': List of criterion values for each dimension. + + """ + don = don.copy() + if ind_sup is not None: + don = don.drop(index=ind_sup) + if quanti_sup is not None or quali_sup is not None: + cols_to_drop = [] + if quanti_sup is not None: + cols_to_drop.extend(don.columns[quanti_sup]) + if quali_sup is not None: + cols_to_drop.extend(don.columns[quali_sup]) + don = don.drop(columns=cols_to_drop) + method = method.lower() + method_cv = method_cv.lower() + for col in don.columns: + if not pd.api.types.is_categorical_dtype(don[col]): + don[col] = don[col].astype("category") + vrai_tab = tab_disjonctif_NA(don) + criterion = [] + if seed is not None: + rng = np.random.default_rng(seed) + else: + rng = np.random.default_rng() + if method_cv == "kfold": + res = np.full((ncp_max - ncp_min + 1, nbsim), np.nan) + if verbose: + sim_range = tqdm(range(nbsim), desc="Simulations") + else: + sim_range = range(nbsim) + for sim in sim_range: + compteur = 0 + max_attempts = 50 + while compteur < max_attempts: + donNA = prodna(don, pNA, rng) + categories_complete = all( + donNA[col].nunique(dropna=True) + == don[col].nunique(dropna=True) # noqa: E501 + for col in don.columns + ) + if categories_complete: + break + compteur += 1 + else: + raise ValueError( + "It is too difficult to suppress some cells.\n" + "Maybe several categories are taken by only one individual. " + 'You should remove these variables or try with "loo".' + ) + for nbaxes in range(ncp_min, ncp_max + 1): + imputed = imputeMCA( + donNA, + ncp=nbaxes, + method=method, + threshold=threshold, + seed=seed, + ) + tab_disj_comp = imputed["tab_disj"] + numerator = ((tab_disj_comp - vrai_tab) ** 2).sum().sum() + denominator = ( + tab_disjonctif_NA(donNA).isna().sum().sum() + - vrai_tab.isna().sum().sum() + ) # noqa: E501 + if denominator == 0: + res[nbaxes - ncp_min, sim] = np.nan + else: + res[nbaxes - ncp_min, sim] = numerator / denominator + crit = np.nanmean(res, axis=1) + if np.all(np.isnan(crit)): + raise ValueError( + "All simulations resulted in NaN errors. Please check your data and parameters." + ) # noqa: E501 + ncp = int(np.nanargmin(crit) + ncp_min) + criterion = crit.tolist() + return {"ncp": ncp, "criterion": criterion} + elif method_cv == "loo": + # LOO cross-validation is not implemented in this module. + raise NotImplementedError( + "method_cv='loo' is not implemented here; use method_cv='kfold'." + ) + else: + raise ValueError("method_cv must be 'kfold' or 'loo'") + + +def imputeMCA_print( + don, + ncp, + method="Regularized", + row_w=None, + coeff_ridge=1, + threshold=1e-6, + seed=None, + maxiter=1000, + verbose=False, + print_msg="", +): + """Run MCA imputation, printing progress if verbose.
+ + Parameters + ---------- + don : DataFrame + Input dataset with missing values. + ncp : int + Number of principal components for MCA. + method : str, optional + Imputation method ('Regularized' or 'EM'). Default is 'Regularized'. + row_w : array-like, optional + Row weights. If None, uniform weights are applied. Default is None. + coeff_ridge : float, optional + Regularization coefficient for 'Regularized' MCA. Default is 1. + threshold : float, optional + Convergence threshold. Default is 1e-6. + seed : int, optional + Random seed for reproducibility. Default is None. + maxiter : int, optional + Maximum number of iterations for the imputation process. + verbose : bool, optional + Whether to print progress. Default is False. + print_msg : str, optional + Message to print during imputation. Default is ''. + + Returns + ------- + dict + Result of the MCA imputation. + + """ + if verbose: + print(f"{print_msg}...", end="", flush=True) + res = imputeMCA( + don=don, + ncp=ncp, + method=method, + row_w=row_w, + coeff_ridge=coeff_ridge, + threshold=threshold, + seed=seed, + maxiter=maxiter, + ) # noqa: E501 + if verbose: + print("done") + return res + + +def normtdc(tab_disj, data_na): + """Normalize the disjunctive table to ensure values are between 0 and 1. + + Parameters + ---------- + tab_disj : DataFrame + Disjunctive table to normalize. + data_na : DataFrame + DataFrame with original categorical data. + + Returns + ------- + DataFrame + Normalized disjunctive table. + + """ + tdc = tab_disj.copy() + tdc[tdc < 0] = 0 + tdc[tdc > 1] = 1 + col_suppr = np.cumsum( + [len(col.cat.categories) for _, col in data_na.items()] + ) # noqa: E501 + + def normalize_row(row, col_suppr): + start = 0 + for end in col_suppr: + segment = row[start:end] + total = np.sum(segment) + if total != 0: + row[start:end] = segment / total + start = end + return row + + tdc = tdc.apply( + lambda row: normalize_row(row.values, col_suppr), + axis=1, + result_type="expand", + ) # noqa: E501 + tdc.columns = tab_disj.columns + return tdc + + +def draw(tab_disj, Don, Don_na): + """Draw random samples from the normalized disjtable to reconstruct data. + + Parameters + ---------- + tab_disj : DataFrame + Normalized disjunctive table. + Don : DataFrame + Original complete dataset. + Don_na : DataFrame + Dataset with missing values. + + Returns + ------- + DataFrame + Reconstructed dataset with imputed categorical values. + + """ + Don_res = Don.copy() + nbdummy = np.ones(Don.shape[1], dtype=int) + is_quali = [ + i + for i, col in enumerate(Don.columns) + if not pd.api.types.is_numeric_dtype(Don[col]) + ] # noqa: E501 + nbdummy[is_quali] = [Don.iloc[:, i].nunique() for i in is_quali] + vec = np.concatenate(([0], np.cumsum(nbdummy))) + for idx, i in enumerate(is_quali): + start = vec[idx] + end = vec[idx + 1] + cols = tab_disj.columns[start:end] + probs = tab_disj[cols].values + categories = Don.iloc[:, i].cat.categories + sampled_indices = [] + for p in probs: + if np.sum(p) > 0: + p_normalized = p / np.sum(p) + sampled_idx = np.random.choice(len(categories), p=p_normalized) # noqa: E501 + else: + sampled_idx = np.nan + sampled_indices.append(sampled_idx) + Don_res.iloc[:, i] = pd.Categorical.from_codes( + sampled_indices, categories=categories + ) # noqa: E501 + return Don_res + + +def MIMCA( + X, + nboot=100, + ncp=2, + coeff_ridge=1, + threshold=1e-6, + maxiter=1000, + verbose=False, +): # noqa: E501 + """Perform Multiple Imputation with (MIMCA). + + Parameters + ---------- + X : DataFrame + Input data with missing values. 
+ nboot : int, optional + Number of bootstrap samples. Default is 100. + ncp : int, optional + Number of principal components for MCA. Default is 2. + coeff_ridge : float, optional + Regularization coefficient for 'Regularized' MCA. Default is 1. + threshold : float, optional + Convergence threshold. Default is 1e-6. + maxiter : int, optional + Maximum number of iterations for the imputation process. + verbose : bool, optional + Whether to print progress. Default is False. + + Returns + ------- + dict + Dictionary containing the results of the multiple imputations. + + """ + import warnings + + X = X.copy() + # Convert non-numeric columns to categorical + is_quali = [ + col for col in X.columns if not pd.api.types.is_numeric_dtype(X[col]) + ] # noqa: E501 + X[is_quali] = X[is_quali].apply(lambda col: col.astype("category")) + X = X.apply( + lambda col: col.cat.remove_unused_categories() + if col.dtype.name == "category" + else col + ) # noqa: E501 + # Remove variables with only one category + OneCat = ( + X.apply( + lambda col: len(col.cat.categories) + if col.dtype.name == "category" + else np.nan + ) + == 1 + ) # noqa: E501 + if OneCat.any(): + warning_vars = X.columns[OneCat].tolist() + warnings.warn( + f"The following variables are constant and have been suppressed from the analysis: {', '.join(warning_vars)}" + ) # noqa: E501 + X = X.drop(columns=warning_vars) + if X.shape[1] <= 1: + raise ValueError( + "No sufficient variables have 2 categories or more" + ) # noqa: E501 + n = X.shape[0] + # Generate bootstrap weights + rng = np.random.default_rng() + Boot = rng.integers(low=0, high=n, size=(n, nboot)) + Weight = np.zeros((n, nboot)) + for i in range(nboot): + counts = np.bincount(Boot[:, i], minlength=n) + Weight[:, i] = counts + Weight = Weight / Weight.sum(axis=0) + # Perform multiple imputations + res_imp = [] + for i in range(nboot): + if verbose: + print(f"Imputation {i + 1}/{nboot}") + weight_i = Weight[:, i] + res = imputeMCA_print( + don=X, + ncp=ncp, + coeff_ridge=coeff_ridge, + threshold=threshold, # noqa: E501 + maxiter=maxiter, + row_w=weight_i, + verbose=verbose, + print_msg=f"Imputation {i + 1}", + ) # noqa: E501 + res_imp.append(res) + # Normalize the imputed disjunctive tables + tdc_imp = [res["tab_disj"] for res in res_imp] + res_comp = [res["completeObs"] for res in res_imp] + tdc_norm = [ + normtdc(tab_disj=tdc, data_na=comp) + for tdc, comp in zip(tdc_imp, res_comp) + ] # noqa: E501 + # Draw the final imputed datasets + X_imp = [ + draw(tab_disj=tdc, Don=comp, Don_na=X) + for tdc, comp in zip(tdc_norm, res_comp) + ] # noqa: E501 + # Compute the final imputed disjunctive table using all data + res_imputeMCA = imputeMCA( + X, + ncp=ncp, + coeff_ridge=coeff_ridge, + threshold=threshold, + maxiter=maxiter, + )["tab_disj"] + res = { + "res_MIs": X_imp, + "res_imputeMCA": res_imputeMCA, + "call": { + "X": X, + "nboot": nboot, + "ncp": ncp, + "coeff_ridge": coeff_ridge, + "threshold": threshold, + "maxiter": maxiter, + "tab_disj_array": np.array([tdc.values for tdc in tdc_imp]), + }, + } + return res diff --git a/qolmat/imputations/preprocessing.py b/qolmat/imputations/preprocessing.py index 50c54270..12308ffb 100644 --- a/qolmat/imputations/preprocessing.py +++ b/qolmat/imputations/preprocessing.py @@ -1,38 +1,39 @@ +"""Script for preprocessing functions.""" + import copy -from typing import Any, Dict, Hashable, List, Optional, Tuple +from typing import Dict, Hashable, List, Optional, Tuple + import numpy as np import pandas as pd -from sklearn.compose import 
make_column_selector as selector -from sklearn.preprocessing import StandardScaler -from sklearn.pipeline import Pipeline -from sklearn.ensemble import ( - HistGradientBoostingRegressor, - HistGradientBoostingClassifier, -) -from sklearn.compose import ColumnTransformer +from category_encoders.one_hot import OneHotEncoder +from numpy.typing import NDArray from sklearn.base import ( BaseEstimator, RegressorMixin, TransformerMixin, ) +from sklearn.compose import ColumnTransformer +from sklearn.compose import make_column_selector as selector +from sklearn.ensemble import ( + HistGradientBoostingClassifier, + HistGradientBoostingRegressor, +) +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler from sklearn.utils.validation import ( - check_X_y, check_array, check_is_fitted, + check_X_y, ) -from category_encoders.one_hot import OneHotEncoder - - -from typing_extensions import Self -from numpy.typing import NDArray - +# from typing_extensions import Self from qolmat.utils import utils class MixteHGBM(RegressorMixin, BaseEstimator): - """ - A custom scikit-learn estimator implementing a mixed model using + """MixteHGBM class. + + This is a custom scikit-learn estimator implementing a mixed model using HistGradientBoostingClassifier for string target data and HistGradientBoostingRegressor for numeric target data. """ @@ -41,19 +42,18 @@ def __init__(self): super().__init__() def set_model_parameters(self, **args_model): - """ - Sets the arguments of the underlying model. + """Set the arguments of the underlying model. Parameters ---------- - **kwargs : dict + **args_model : dict Additional keyword arguments to be passed to the underlying models. + """ self.args_model = args_model - def fit(self, X: NDArray, y: NDArray) -> Self: - """ - Fit the model according to the given training data. + def fit(self, X: NDArray, y: NDArray) -> "MixteHGBM": + """Fit the model according to the given training data. Parameters ---------- @@ -66,8 +66,11 @@ def fit(self, X: NDArray, y: NDArray) -> Self: ------- self : object Returns self. + """ - X, y = check_X_y(X, y, accept_sparse=True, force_all_finite="allow-nan") + X, y = check_X_y( + X, y, accept_sparse=True, force_all_finite="allow-nan" + ) self.is_fitted_ = True self.n_features_in_ = X.shape[1] if hasattr(self, "args_model"): @@ -85,8 +88,7 @@ def fit(self, X: NDArray, y: NDArray) -> Self: return self def predict(self, X: NDArray) -> NDArray: - """ - Predict using the fitted model. + """Predict using the fitted model. Parameters ---------- @@ -97,6 +99,7 @@ def predict(self, X: NDArray) -> NDArray: ------- y_pred : array-like, shape (n_samples,) Predicted target values. + """ X = check_array(X, accept_sparse=True, force_all_finite="allow-nan") check_is_fitted(self, "is_fitted_") @@ -104,26 +107,29 @@ def predict(self, X: NDArray) -> NDArray: return y_pred def _more_tags(self): + """Indicate if the class allows inputs with categorical data and nans. + + It modifies the behaviour of the functions checking data. """ - This method indicates that this class allows inputs with categorical data and nans. It - modifies the behaviour of the functions checking data. - """ - return {"X_types": ["2darray", "categorical", "string"], "allow_nan": True} + return { + "X_types": ["2darray", "categorical", "string"], + "allow_nan": True, + } class BinTransformer(TransformerMixin, BaseEstimator): - """ - Learns the possible values of the provided numerical feature, allowing to transform new values - to the closest existing one. 
+ """BinTransformer class. + + Learn the possible values of the provided numerical feature, + allowing to transform new values to the closest existing one. """ def __init__(self, cols: Optional[List] = None): super().__init__() self.cols = cols - def fit(self, X: NDArray, y: Optional[NDArray] = None) -> Self: - """ - Fit the BinTransformer to X. + def fit(self, X: NDArray, y: Optional[NDArray] = None) -> "BinTransformer": + """Fit the BinTransformer to X. Parameters ---------- @@ -138,11 +144,12 @@ def fit(self, X: NDArray, y: Optional[NDArray] = None) -> Self: ------- self : object Fitted transformer. + """ df = utils._validate_input(X) self.feature_names_in_ = df.columns self.n_features_in_ = len(df.columns) - self.dict_df_bins_: Dict[Hashable, pd.DataFrame] = dict() + self.dict_df_bins_: Dict[Hashable, pd.DataFrame] = {} if self.cols is None: cols = df.select_dtypes(include="number").columns else: @@ -156,8 +163,7 @@ def fit(self, X: NDArray, y: Optional[NDArray] = None) -> Self: return self def transform(self, X: NDArray) -> NDArray: - """ - Transform X to existing values learned during fit. + """Transform X to existing values learned during fit. Parameters ---------- @@ -168,6 +174,7 @@ def transform(self, X: NDArray) -> NDArray: ------- X_out : ndarray of shape (n_samples,) Transformed input. + """ df = utils._validate_input(X) check_is_fitted(self) @@ -176,7 +183,8 @@ def transform(self, X: NDArray) -> NDArray: or df.columns.to_list() != self.feature_names_in_.to_list() ): raise ValueError( - "Feature names in X {df.columns} don't match with expected {feature_names_in_}" + f"Feature names in X {df.columns} don't match with " + f"expected {self.feature_names_in_}" ) df_out = df.copy() for col in df: @@ -192,8 +200,7 @@ def transform(self, X: NDArray) -> NDArray: return df_out def inverse_transform(self, X: NDArray) -> NDArray: - """ - Transform X to existing values learned during fit. + """Transform X to existing values learned during fit. Parameters ---------- @@ -204,37 +211,43 @@ def inverse_transform(self, X: NDArray) -> NDArray: ------- X_out : ndarray of shape (n_samples,) Transformed input. + """ return self.transform(X) def _more_tags(self): + """Indicate if the class allows inputs with categorical data and nans. + + It modifies the behaviour of the functions checking data. """ - This method indicates that this class allows inputs with categorical data and nans. It - modifies the behaviour of the functions checking data. - """ - return {"X_types": ["2darray", "categorical", "string"], "allow_nan": True} + return { + "X_types": ["2darray", "categorical", "string"], + "allow_nan": True, + } class OneHotEncoderProjector(OneHotEncoder): - """ - Inherits from the class OneHotEncoder imported from category_encoders. The decoding - function accepts non boolean values (as it is the case for the sklearn OneHotEncoder). In - this case the decoded value corresponds to the largest dummy value. + """Class for one-hot encoding of categorical features. + + It inherits from the class OneHotEncoder imported from category_encoders. + The decoding function accepts non boolean values (as it is the case for + the sklearn OneHotEncoder). In this case the decoded value corresponds to + the largest dummy value. """ def __init__(self, **kwargs): super().__init__(**kwargs) def reverse_dummies(self, X: pd.DataFrame, mapping: Dict) -> pd.DataFrame: - """ - Convert dummy variable into numerical variables + """Convert dummy variable into numerical variables. 
Parameters ---------- X : DataFrame + Input dataframe. mapping: list-like - Contains mappings of column to be transformed to it's new columns and value - represented + Mapping of column to be transformed to its + new columns and value represented Returns ------- @@ -260,22 +273,55 @@ def reverse_dummies(self, X: pd.DataFrame, mapping: Dict) -> pd.DataFrame: class WrapperTransformer(TransformerMixin, BaseEstimator): - """ - Wraps a transformer with reversible transformers designed to embed the data. + """Wrap a transformer. + + Wrapper with reversible transformers designed to embed the data. """ - def __init__(self, transformer: TransformerMixin, wrapper: TransformerMixin): + def __init__( + self, transformer: TransformerMixin, wrapper: TransformerMixin + ): super().__init__() self.transformer = transformer self.wrapper = wrapper - def fit(self, X: NDArray, y: Optional[NDArray] = None) -> Self: + def fit( + self, X: NDArray, y: Optional[NDArray] = None + ) -> "WrapperTransformer": + """Fit the model according to the given training data. + + Parameters + ---------- + X : NDArray + Input array. + y : Optional[NDArray], optional + _description_, by default None + + Returns + ------- + Self + The object itself. + + """ X_transformed = copy.deepcopy(X) X_transformed = self.wrapper.fit_transform(X_transformed) X_transformed = self.transformer.fit(X_transformed) return self def fit_transform(self, X: NDArray) -> NDArray: + """Fit the model according to the given training data and transform it. + + Parameters + ---------- + X : NDArray + Input array. + + Returns + ------- + NDArray + Transformed array. + + """ X_transformed = copy.deepcopy(X) X_transformed = self.wrapper.fit_transform(X_transformed) X_transformed = self.transformer.fit_transform(X_transformed) @@ -283,6 +329,19 @@ def fit_transform(self, X: NDArray) -> NDArray: return X_transformed def transform(self, X: NDArray) -> NDArray: + """Transform X. + + Parameters + ---------- + X : NDArray + Input array. + + Returns + ------- + NDArray + Transformed array. + + """ X_transformed = copy.deepcopy(X) X_transformed = self.wrapper.transform(X_transformed) X_transformed = self.transformer.transform(X_transformed) @@ -293,8 +352,9 @@ def transform(self, X: NDArray) -> NDArray: def make_pipeline_mixte_preprocessing( scale_numerical: bool = False, avoid_new: bool = False ) -> Pipeline: - """ - Create a preprocessing pipeline managing mixed type data by one hot encoding categorical data. + """Create a preprocessing pipeline managing mixed type data. + + It does this by one hot encoding categorical data. 
Parameters ---------- @@ -307,14 +367,19 @@ def make_pipeline_mixte_preprocessing( ------- preprocessor : Pipeline Preprocessing pipeline + """ transformers: List[Tuple] = [] if scale_numerical: - transformers += [("num", StandardScaler(), selector(dtype_include=np.number))] + transformers += [ + ("num", StandardScaler(), selector(dtype_include=np.number)) + ] ohe = OneHotEncoder(handle_unknown="ignore", use_cat_names=True) transformers += [("cat", ohe, selector(dtype_exclude=np.number))] - col_transformer = ColumnTransformer(transformers=transformers, remainder="passthrough") + col_transformer = ColumnTransformer( + transformers=transformers, remainder="passthrough" + ) col_transformer = col_transformer.set_output(transform="pandas") preprocessor = Pipeline(steps=[("col_transformer", col_transformer)]) @@ -323,13 +388,19 @@ def make_pipeline_mixte_preprocessing( return preprocessor -def make_robust_MixteHGB(scale_numerical: bool = False, avoid_new: bool = False) -> Pipeline: - """ - Create a robust pipeline for MixteHGBM by one hot encoding categorical features. - This estimator is intended for use in ImputerRegressor to deal with mixed type data. +def make_robust_MixteHGB( + scale_numerical: bool = False, avoid_new: bool = False +) -> Pipeline: + """Create a robust pipeline for MixteHGBM. - Note that from sklearn 1.4 HistGradientBoosting Natively Supports Categorical DTypes in - DataFrames, so that this pipeline is not required anymore. + Create a preprocessing pipeline managing mixed type data + by one hot encoding categorical features. + This estimator is intended for use in ImputerRegressor + to deal with mixed type data. + + Note that from sklearn 1.4 HistGradientBoosting Natively Supports + Categorical DTypes in DataFrames, so that this pipeline is not + required anymore. Parameters @@ -343,6 +414,7 @@ def make_robust_MixteHGB(scale_numerical: bool = False, avoid_new: bool = False) ------- robust_MixteHGB : object A robust pipeline for MixteHGBM. + """ preprocessor = make_pipeline_mixte_preprocessing( scale_numerical=scale_numerical, avoid_new=avoid_new diff --git a/qolmat/imputations/rpca/rpca.py b/qolmat/imputations/rpca/rpca.py index 29eeaaf9..a081eae3 100644 --- a/qolmat/imputations/rpca/rpca.py +++ b/qolmat/imputations/rpca/rpca.py @@ -1,18 +1,15 @@ +"""Script for the root class of RPCA.""" + from __future__ import annotations -from typing import Union, Tuple -from typing_extensions import Self +from typing import Union import numpy as np -from numpy.typing import NDArray from sklearn.base import BaseEstimator, TransformerMixin -from qolmat.utils import utils - class RPCA(BaseEstimator, TransformerMixin): - """ - This class is the root class of the RPCA methods. + """Root class of the RPCA methods. 
Parameters ---------- @@ -24,6 +21,7 @@ class RPCA(BaseEstimator, TransformerMixin): Tolerance for stopping criteria, by default 1e-6 verbose: bool default `False` + """ def __init__( diff --git a/qolmat/imputations/rpca/rpca_noisy.py b/qolmat/imputations/rpca/rpca_noisy.py index 74e68856..ae59ae0a 100644 --- a/qolmat/imputations/rpca/rpca_noisy.py +++ b/qolmat/imputations/rpca/rpca_noisy.py @@ -1,13 +1,15 @@ +"""Script for the noisy RPCA.""" + from __future__ import annotations import warnings -from typing import Dict, List, Optional, Tuple, TypeVar, Union +from typing import Dict, List, Optional, Tuple, Union import numpy as np import scipy as scp +from numpy.typing import NDArray from scipy.sparse import dok_matrix, identity from scipy.sparse.linalg import spsolve -from numpy.typing import NDArray from sklearn import utils as sku from qolmat.imputations.rpca import rpca_utils @@ -16,23 +18,23 @@ class RpcaNoisy(RPCA): - """ - This class implements a noisy version of the so-called 'improved RPCA' + """Class for a noisy version of the so-called 'improved RPCA'. References ---------- - Wang, Xuehui, et al. "An improved robust principal component analysis model for anomalies - detection of subway passenger flow." + Wang, Xuehui, et al. "An improved robust principal component analysis model + for anomalies detection of subway passenger flow." Journal of advanced transportation (2018). - Chen, Yuxin, et al. "Bridging convex and nonconvex optimization in robust PCA: Noise, outliers - and missing data." + Chen, Yuxin, et al. "Bridging convex and nonconvex optimization + in robust PCA: Noise, outliers and missing data." The Annals of Statistics 49.5 (2021): 2948-2971. Parameters ---------- random_state : int, optional - The seed of the pseudo random number generator to use, for reproductibility. + The seed of the pseudo random number generator to use, + for reproducibility. rank: Optional[int] Upper bound of the rank to be estimated mu: Optional[float] @@ -44,16 +46,19 @@ class RpcaNoisy(RPCA): list_periods: Optional[List[int]] list of periods, linked to the Toeplitz matrices list_etas: Optional[List[float]] - list of penalizing parameters for the corresponding period in list_periods + list of penalizing parameters for the corresponding period + in list_periods max_iterations: Optional[int] - stopping criteria, maximum number of iterations. By default, the value is set to 10_000 + stopping criteria, maximum number of iterations. + By default, the value is set to 10_000 tolerance: Optional[float] - stoppign critera, minimum difference between 2 consecutive iterations. By default, - the value is set to 1e-6 + stopping criteria, minimum difference between 2 consecutive iterations. + By default, the value is set to 1e-6 norm: Optional[str] error norm, can be "L1" or "L2". By default, the value is set to "L2" verbose: Optional[bool] verbosity level, if False the warnings are silenced + """ def __init__( @@ -70,7 +75,9 @@ def __init__( norm: str = "L2", verbose: bool = True, ) -> None: - super().__init__(max_iterations=max_iterations, tolerance=tolerance, verbose=verbose) + super().__init__( + max_iterations=max_iterations, tolerance=tolerance, verbose=verbose + ) self.rng = sku.check_random_state(random_state) self.rank = rank self.mu = mu @@ -81,8 +88,7 @@ def __init__( self.norm = norm def get_params_scale(self, D: NDArray) -> Dict[str, float]: - """ - Get parameters for scaling in RPCA based on the input data. + """Get parameters for scaling in RPCA based on the input data.
Parameters ---------- @@ -111,8 +117,7 @@ def get_params_scale(self, D: NDArray) -> Dict[str, float]: } def decompose(self, D: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]: - """ - Compute the noisy RPCA with L1 or L2 time penalisation + """Compute the noisy RPCA with L1 or L2 time penalisation. Parameters ---------- @@ -127,6 +132,7 @@ def decompose(self, D: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]: Low-rank signal A: NDArray Anomalies + """ M, A, _, _ = self.decompose_with_basis(D, Omega) return M, A @@ -134,9 +140,9 @@ def decompose(self, D: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]: def decompose_with_basis( self, D: NDArray, Omega: NDArray ) -> Tuple[NDArray, NDArray, NDArray, NDArray]: - """ - Compute the noisy RPCA with L1 or L2 time penalisation, and returns the decomposition of - the low-rank matrix. + """Compute the noisy RPCA with L1 or L2 time penalisation. + + It returns the decomposition of the low-rank matrix. Parameters ---------- @@ -155,6 +161,7 @@ def decompose_with_basis( Coefficients of the low-rank matrix in the reduced basis Q: NDArray Reduced basis of the low-rank matrix + """ D = utils.linear_interpolation(D) self.params_scale = self.get_params_scale(D) @@ -175,8 +182,9 @@ def decompose_with_basis( for period in self.list_periods: if not period < n_rows: raise ValueError( - "The periods provided in argument in `list_periods` must smaller " - f"than the number of rows in the matrix but {period} >= {n_rows}!" + "The periods provided in argument in `list_periods` " + "must smaller than the number of rows " + f"in the matrix but {period} >= {n_rows}!" ) M, A, L, Q = self.minimise_loss( @@ -211,12 +219,12 @@ def minimise_loss( tolerance: float = 1e-6, norm: str = "L2", ) -> Tuple: - """ - Compute the noisy RPCA with a L2 time penalisation. + """Compute the noisy RPCA with a L2 time penalisation. - This function computes the noisy Robust Principal Component Analysis (RPCA) using a L2 time - penalisation. It iteratively minimizes a loss function to separate the low-rank and sparse - components from the input data matrix. + This function computes the noisy Robust Principal Component Analysis + (RPCA) using a L2 time penalisation. It iteratively minimizes a loss + function to separate the low-rank and sparse components from the + input data matrix. Parameters ---------- @@ -231,18 +239,19 @@ def minimise_loss( lam : float Penalizing parameter for the sparse matrix. mu : float, optional - Initial stiffness parameter for the constraint on M, L, and Q. Defaults - to 1e-2. + Initial stiffness parameter for the constraint on M, L, and Q. + Defaults to 1e-2. list_periods : List[int], optional List of periods linked to the Toeplitz matrices. Defaults to []. list_etas : List[float], optional - List of penalizing parameters for the corresponding periods in list_periods. Defaults + List of penalizing parameters for the corresponding periods + in list_periods. Defaults to []. max_iterations : int, optional Stopping criteria, maximum number of iterations. Defaults to 10000. tolerance : float, optional - Stopping criteria, minimum difference between 2 consecutive iterations. - Defaults to 1e-6. + Stopping criteria, minimum difference between 2 + consecutive iterations. Defaults to 1e-6. norm : str, optional Error norm, can be "L1" or "L2". Defaults to "L2". @@ -264,8 +273,8 @@ def minimise_loss( ValueError If the periods provided in the argument in `list_periods` are not smaller than the number of rows in the matrix. 
- """ + """ rho = 1.1 n_rows, n_cols = D.shape @@ -288,10 +297,15 @@ def minimise_loss( mu_bar = mu * 1e3 # matrices for temporal correlation - list_H = [rpca_utils.toeplitz_matrix(period, n_rows) for period in list_periods] + list_H = [ + rpca_utils.toeplitz_matrix(period, n_rows) + for period in list_periods + ] HtH = dok_matrix((n_rows, n_rows)) for i_period, _ in enumerate(list_periods): - HtH += list_etas[i_period] * (list_H[i_period].T @ list_H[i_period]) + HtH += list_etas[i_period] * ( + list_H[i_period].T @ list_H[i_period] + ) Ir = np.eye(rank) In = identity(n_rows) @@ -335,7 +349,9 @@ def minimise_loss( if norm == "L1": for i_period, _ in enumerate(list_periods): eta = list_etas[i_period] - R[i_period] = rpca_utils.soft_thresholding(R[i_period] / mu, eta / mu) + R[i_period] = rpca_utils.soft_thresholding( + R[i_period] / mu, eta / mu + ) mu = min(mu * rho, mu_bar) @@ -364,9 +380,11 @@ def decompose_on_basis( Omega: NDArray, Q: NDArray, ) -> Tuple[NDArray, NDArray]: - """ - Decompose the matrix D with an observation matrix Omega using the noisy RPCA algorithm, - with a fixed reduced basis given by the matrix Q. This allows to impute new data without + """Decompose the matrix D with an observation matrix Omega. + + It uses the noisy RPCA algorithm, + with a fixed reduced basis given by the matrix Q. + This allows to impute new data without resolving the optimization problem on the whole dataset. Parameters @@ -384,6 +402,7 @@ def decompose_on_basis( A tuple representing the decomposition of D with: - M: low-rank matrix - A: sparse matrix + """ D = utils.linear_interpolation(D) params_scale = self.get_params_scale(D) @@ -434,8 +453,9 @@ def _check_cost_function_minimized( tau: float, lam: float, ): - """ - Check that the functional minimized by the RPCA is smaller at the end than at the + """Check cost function. + + The functional minimized by the RPCA is smaller at the end than at the beginning. Parameters @@ -452,6 +472,7 @@ def _check_cost_function_minimized( parameter penalizing the nuclear norm of the low rank part lam : float parameter penalizing the L1-norm of the anomaly/sparse part + """ cost_start = self.cost_function( D, @@ -482,8 +503,11 @@ def _check_cost_function_minimized( if self.verbose and (cost_end > cost_start * (1 + 1e-6)): warnings.warn( - f"RPCA algorithm may provide bad results. Function {function_str} increased from" - f" {cost_start} to {cost_end} instead of decreasing!".format("%.2f") + "RPCA algorithm may provide bad results. " + f"Function {function_str} increased from" + f" {cost_start} to {cost_end} instead of decreasing!".format( + "%.2f" + ) ) @staticmethod @@ -498,8 +522,7 @@ def cost_function( list_etas: List[float] = [], norm: str = "L2", ): - """ - Estimated cost function for the noisy RPCA algorithm + """Estimate cost function for the noisy RPCA algorithm. Parameters ---------- @@ -518,27 +541,34 @@ def cost_function( list_periods: Optional[List[int]] list of periods, linked to the Toeplitz matrices list_etas: Optional[List[float]] - list of penalizing parameters for the corresponding period in list_periods + list of penalizing parameters for the corresponding period in + list_periods norm: Optional[str] - error norm, can be "L1" or "L2". By default, the value is set to "L2" + error norm, can be "L1" or "L2". 
+ By default, the value is set to "L2" Returns ------- float Value of the cost function minimized by the RPCA - """ + """ temporal_norm: float = 0 if len(list_etas) > 0: # matrices for temporal correlation - list_H = [rpca_utils.toeplitz_matrix(period, D.shape[0]) for period in list_periods] + list_H = [ + rpca_utils.toeplitz_matrix(period, D.shape[0]) + for period in list_periods + ] if norm == "L1": for eta, H_matrix in zip(list_etas, list_H): temporal_norm += eta * np.sum(np.abs(H_matrix @ M)) elif norm == "L2": for eta, H_matrix in zip(list_etas, list_H): - temporal_norm += eta * float(np.linalg.norm(H_matrix @ M, "fro")) + temporal_norm += eta * float( + np.linalg.norm(H_matrix @ M, "fro") + ) anomalies_norm = np.sum(np.abs(A * Omega)) cost = ( 1 / 2 * ((Omega * (D - M - A)) ** 2).sum() diff --git a/qolmat/imputations/rpca/rpca_pcp.py b/qolmat/imputations/rpca/rpca_pcp.py index f3b8e751..500605fb 100644 --- a/qolmat/imputations/rpca/rpca_pcp.py +++ b/qolmat/imputations/rpca/rpca_pcp.py @@ -1,3 +1,5 @@ +"""Script for the PCP RPCA.""" + from __future__ import annotations import warnings @@ -13,8 +15,9 @@ class RpcaPcp(RPCA): - """ - This class implements the basic RPCA decomposition using Alternating Lagrangian Multipliers. + """Class for the basic RPCA decomposition. + + It uses Alternating Lagrangian Multipliers. References ---------- @@ -24,7 +27,8 @@ class RpcaPcp(RPCA): Parameters ---------- random_state : int, optional - The seed of the pseudo random number generator to use, for reproductibility. + The seed of the pseudo random number generator to use, + for reproductibility. period: Optional[int] number of rows of the reshaped matrix if the signal is a 1D-array rank: Optional[int] @@ -34,12 +38,14 @@ class RpcaPcp(RPCA): lam: Optional[float] penalizing parameter for the sparse matrix max_iterations: Optional[int] - stopping criteria, maximum number of iterations. By default, the value is set to 10_000 + stopping criteria, maximum number of iterations. + By default, the value is set to 10_000 tolerance: Optional[float] - stoppign critera, minimum difference between 2 consecutive iterations. By default, - the value is set to 1e-6 + stoppign critera, minimum difference between 2 consecutive iterations. + By default, the value is set to 1e-6 verbose: Optional[bool] verbosity level, if False the warnings are silenced + """ def __init__( @@ -51,14 +57,15 @@ def __init__( tolerance: float = 1e-6, verbose: bool = True, ) -> None: - super().__init__(max_iterations=max_iterations, tolerance=tolerance, verbose=verbose) + super().__init__( + max_iterations=max_iterations, tolerance=tolerance, verbose=verbose + ) self.rng = sku.check_random_state(random_state) self.mu = mu self.lam = lam def get_params_scale(self, D: NDArray): - """ - Get parameters for scaling in RPCA based on the input data. + """Get parameters for scaling in RPCA based on the input data. Parameters ---------- @@ -81,8 +88,9 @@ def get_params_scale(self, D: NDArray): return dict_params def decompose(self, D: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]: - """ - Estimate the relevant parameters then compute the PCP RPCA decomposition, using the + """Estimate the relevant parameters. 
+ + It computes the PCP RPCA decomposition, using the Augumented Largrangian Multiplier (ALM) Parameters @@ -98,6 +106,7 @@ def decompose(self, D: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]: Low-rank signal A: NDArray Anomalies + """ D = utils.linear_interpolation(D) if np.all(D == 0): @@ -116,7 +125,6 @@ def decompose(self, D: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]: M: NDArray = D - A for iteration in range(self.max_iterations): - M = rpca_utils.svd_thresholding(D - A + Y / mu, 1 / mu) A = rpca_utils.soft_thresholding(D - M + Y / mu, lam / mu) A[~Omega] = (D - M)[~Omega] @@ -141,7 +149,9 @@ def _check_cost_function_minimized( Omega: NDArray, lam: float, ): - """Check that the functional minimized by the RPCA + """Check that the functional minimized by the RPCA. + + Check that the functional minimized by the RPCA is smaller at the end than at the beginning Parameters @@ -156,12 +166,16 @@ def _check_cost_function_minimized( boolean matrix indicating the observed values lam : float parameter penalizing the L1-norm of the anomaly/sparse part + """ cost_start = np.linalg.norm(observations, "nuc") - cost_end = np.linalg.norm(low_rank, "nuc") + lam * np.sum(Omega * np.abs(anomalies)) + cost_end = np.linalg.norm(low_rank, "nuc") + lam * np.sum( + Omega * np.abs(anomalies) + ) if self.verbose and round(cost_start, 4) - round(cost_end, 4) <= -1e-2: function_str = "||D||_* + lam ||A||_1" warnings.warn( - f"RPCA algorithm may provide bad results. Function {function_str} increased from" - f" {cost_start} to {cost_end} instead of decreasing!" + "RPCA algorithm may provide bad results. " + f"Function {function_str} increased from {cost_start} " + f"to {cost_end} instead of decreasing!" ) diff --git a/qolmat/imputations/rpca/rpca_utils.py b/qolmat/imputations/rpca/rpca_utils.py index 9e6c8945..0d3b6d5f 100644 --- a/qolmat/imputations/rpca/rpca_utils.py +++ b/qolmat/imputations/rpca/rpca_utils.py @@ -1,12 +1,7 @@ -""" -Modular utility functions for RPCA -""" +"""Modular utility functions for RPCA.""" -from typing import Tuple import numpy as np from numpy.typing import NDArray -import scipy -from scipy.linalg import toeplitz from scipy import sparse as sps @@ -14,8 +9,7 @@ def approx_rank( M: NDArray, threshold: float = 0.95, ) -> int: - """ - Estimate a bound on the rank of an array by SVD. + """Estimate a bound on the rank of an array by SVD. Parameters ---------- @@ -45,8 +39,7 @@ def soft_thresholding( X: NDArray, threshold: float, ) -> NDArray: - """ - Shrinkage operator (i.e. soft thresholding) on the elements of X. + """Shrinkage operator (i.e. soft thresholding) on the elements of X. Parameters ---------- @@ -59,13 +52,13 @@ def soft_thresholding( ------- NDArray Array V such that V = sign(X) * max(abs(X - threshold,0) + """ return np.sign(X) * np.maximum(np.abs(X) - threshold, 0) def svd_thresholding(X: NDArray, threshold: float) -> NDArray: - """ - Apply the shrinkage operator to the singular values obtained from the SVD of X. + """Apply shrinkage to the singular values from X's SVD. 
Parameters ---------- @@ -81,6 +74,7 @@ def svd_thresholding(X: NDArray, threshold: float) -> NDArray: U is the array of left singular vectors of X V is the array of the right singular vectors of X s is the array of the singular values as a diagonal array + """ U, s, Vh = np.linalg.svd(X, full_matrices=False) s = soft_thresholding(s, threshold) @@ -88,8 +82,7 @@ def svd_thresholding(X: NDArray, threshold: float) -> NDArray: def l1_norm(M: NDArray) -> float: - """ - L1 norm of an array + """Compute the L1 norm of an array. Parameters ---------- @@ -100,13 +93,15 @@ def l1_norm(M: NDArray) -> float: ------- float L1 norm of M + """ return np.sum(np.abs(M)) -def toeplitz_matrix(T: int, dimension: int) -> NDArray: - """ - Create a sparse Toeplitz square matrix H to take into account temporal correlations in the RPCA +def toeplitz_matrix(T: int, dimension: int) -> sps.spmatrix: + """Create a sparse Toeplitz square matrix H. + + It is useful to take into account temporal correlations in the RPCA H=Toeplitz(0,1,-1), in which the central diagonal is defined as ones and the T upper diagonal is defined as minus ones. @@ -121,11 +116,13 @@ def toeplitz_matrix(T: int, dimension: int) -> NDArray: ------- NDArray Sparse Toeplitz matrix using scipy format - """ + """ n_lags = dimension - T diagonals = [np.ones(n_lags), -np.ones(n_lags)] - H_top = sps.diags(diagonals, offsets=[0, T], shape=(n_lags, dimension), format="csr") + H_top = sps.diags( + diagonals, offsets=[0, T], shape=(n_lags, dimension), format="csr" + ) H = sps.dok_matrix((dimension, dimension)) H[:n_lags] = H_top return H diff --git a/qolmat/imputations/softimpute.py b/qolmat/imputations/softimpute.py index 5d04b39b..72d3a8c4 100644 --- a/qolmat/imputations/softimpute.py +++ b/qolmat/imputations/softimpute.py @@ -1,19 +1,22 @@ +"""Script for SoftImpute class.""" + from __future__ import annotations -from typing import Optional, Tuple, Union import warnings +from typing import Optional, Tuple, Union import numpy as np from numpy.typing import NDArray from sklearn import utils as sku from sklearn.base import BaseEstimator, TransformerMixin -from qolmat.utils import utils from qolmat.imputations.rpca import rpca_utils +from qolmat.utils import utils class SoftImpute(BaseEstimator, TransformerMixin): - """ + """Class for the Rank Restricted Soft SVD algorithm. + This class implements the Rank Restricted Soft SVD algorithm presented in Hastie, Trevor, et al. "Matrix completion and low-rank SVD via fast alternating least squares." The Journal of Machine Learning @@ -36,7 +39,8 @@ class SoftImpute(BaseEstimator, TransformerMixin): max_iterations : int Maximum number of iterations random_state : int, optional - The seed of the pseudo random number generator to use, for reproductibility + The seed of the pseudo random number generator to use, + for reproductibility verbose : bool flag for verbosity @@ -44,7 +48,9 @@ class SoftImpute(BaseEstimator, TransformerMixin): -------- >>> import numpy as np >>> from qolmat.imputations.softimpute import SoftImpute - >>> D = np.array([[1, 2, np.nan, 4], [1, 5, 3, np.nan], [4, 2, 3, 2], [1, 1, 5, 4]]) + >>> D = np.array( + ... [[1, 2, np.nan, 4], [1, 5, 3, np.nan], [4, 2, 3, 2], [1, 1, 5, 4]] + ... ) >>> Omega = ~np.isnan(D) >>> M, A = SoftImpute(random_state=11).decompose(D, Omega) >>> print(M + A) @@ -52,6 +58,7 @@ class SoftImpute(BaseEstimator, TransformerMixin): [1. 5. 3. 0.87217939] [4. 2. 3. 2. ] [1. 1. 5. 4. 
]] + """ def __init__( @@ -73,8 +80,7 @@ def __init__( self.verbose = verbose def get_params_scale(self, X: NDArray): - """ - Get parameters for scaling in Soft Impute based on the input data. + """Get parameters for scaling in Soft Impute based on the input data. Parameters ---------- @@ -98,8 +104,7 @@ def get_params_scale(self, X: NDArray): return dict_params def decompose(self, X: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]: - """ - Compute the Soft Impute decomposition + """Compute the Soft Impute decomposition. Parameters ---------- @@ -114,11 +119,13 @@ def decompose(self, X: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]: Low-rank signal A: NDArray Anomalies + """ params_scale = self.get_params_scale(X) rank = params_scale["rank"] if self.rank is None else self.rank tau = params_scale["tau"] if self.tau is None else self.tau - assert tau > 0 + if tau <= 0: + raise ValueError(f"Parameter tau has negative value: {tau}") # Step 1 : Initializing n, m = X.shape @@ -138,7 +145,9 @@ def decompose(self, X: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]: # Step 2 : Upate on B D2_invreg = (D**2 + tau) ** (-1) - Btilde = ((U * D).T @ np.where(Omega, X - A @ B.T, 0) + (B * D**2).T).T + Btilde = ( + (U * D).T @ np.where(Omega, X - A @ B.T, 0) + (B * D**2).T + ).T Btilde = Btilde * D2_invreg Utilde, D2tilde, _ = np.linalg.svd(Btilde * D, full_matrices=False) @@ -148,7 +157,9 @@ def decompose(self, X: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]: # Step 3 : Upate on A D2_invreg = (D**2 + tau) ** (-1) - Atilde = ((V * D).T @ np.where(Omega, X - A @ B.T, 0).T + (A * D**2).T).T + Atilde = ( + (V * D).T @ np.where(Omega, X - A @ B.T, 0).T + (A * D**2).T + ).T Atilde = Atilde * D2_invreg Utilde, D2tilde, _ = np.linalg.svd(Atilde * D, full_matrices=False) @@ -162,7 +173,8 @@ def decompose(self, X: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]: print(f"Iteration {iter_}: ratio = {round(ratio, 4)}") if ratio < self.tolerance: print( - f"Convergence reached at iteration {iter_} with ratio = {round(ratio, 4)}" + f"Convergence reached at iteration {iter_} " + f"with ratio = {round(ratio, 4)}" ) break @@ -178,7 +190,9 @@ def decompose(self, X: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]: if self.verbose and (cost_end > cost_start + 1e-9): warnings.warn( f"Convergence failed: cost function increased from" - f" {cost_start} to {cost_end} instead of decreasing!".format("%.2f") + f" {cost_start} to {cost_end} instead of decreasing!".format( + "%.2f" + ) ) return M, A @@ -192,7 +206,9 @@ def _check_convergence( D: NDArray, V: NDArray, ) -> float: - """Given a pair of iterates (U_old, D_old, V_old) and (U, D, V), + """Check if the convergence has been reached. + + Given a pair of iterates (U_old, D_old, V_old) and (U, D, V), it computes the relative change in Frobenius norm given by || U_old @ D_old^2 @ V_old.T - U @ D^2 @ V.T ||_F^2 / || U_old @ D_old^2 @ V_old.T ||_F^2 @@ -216,6 +232,7 @@ def _check_convergence( ------- float relative change + """ if any(arg is None for arg in (U_old, D_old, V_old, U, D, V)): raise ValueError("One or more arguments are None.") @@ -261,8 +278,7 @@ def cost_function( Omega: NDArray, tau: float, ): - """ - Compute cost function for different RPCA algorithm + """Compute cost function for different RPCA algorithm. 
Parameters ---------- @@ -281,6 +297,7 @@ def cost_function( ------- float Value of the cost function minimized by the Soft Impute algorithm + """ norm_frobenius = np.sum(np.where(Omega, X - M, 0) ** 2) norm_nuclear = np.linalg.norm(M, "nuc") diff --git a/qolmat/utils/algebra.py b/qolmat/utils/algebra.py index 9e2af1a6..a38fba62 100644 --- a/qolmat/utils/algebra.py +++ b/qolmat/utils/algebra.py @@ -1,6 +1,10 @@ +"""Utils algebra functions for qolmat package.""" + +from typing import Optional, Tuple + import numpy as np import scipy -from numpy.typing import NDArray, ArrayLike +from numpy.typing import NDArray def frechet_distance_exact( @@ -9,13 +13,18 @@ def frechet_distance_exact( means2: NDArray, cov2: NDArray, ) -> float: - """Compute the Fréchet distance between two dataframes df1 and df2 - Frechet_distance = || mu_1 - mu_2 ||_2^2 + Tr(Sigma_1 + Sigma_2 - 2(Sigma_1 . Sigma_2)^(1/2)) - It is normalized, df1 and df2 are first scaled by a factor (std(df1) + std(df2)) / 2 + """Compute the Fréchet distance between two dataframes df1 and df2. + + Frechet_distance = || mu_1 - mu_2 ||_2^2 + + Tr(Sigma_1 + Sigma_2 - 2(Sigma_1 . Sigma_2)^(1/2)) + It is normalized, df1 and df2 are first scaled + by a factor (std(df1) + std(df2)) / 2 and then centered around (mean(df1) + mean(df2)) / 2 - The result is divided by the number of samples to get an homogeneous result. - Based on: Dowson, D. C., and BV666017 Landau. "The Fréchet distance between multivariate normal - distributions." Journal of multivariate analysis 12.3 (1982): 450-455. + The result is divided by the number of samples to get + an homogeneous result. + Based on: Dowson, D. C., and BV666017 Landau. + "The Fréchet distance between multivariate normal distributions." + Journal of multivariate analysis 12.3 (1982): 450-455. Parameters ---------- @@ -32,9 +41,14 @@ def frechet_distance_exact( ------- float Frechet distance + """ n = len(means1) - if (means2.shape != (n,)) or (cov1.shape != (n, n)) or (cov2.shape != (n, n)): + if ( + (means2.shape != (n,)) + or (cov1.shape != (n, n)) + or (cov2.shape != (n, n)) + ): raise ValueError("Inputs have to be of same dimensions.") ssdiff = np.sum((means1 - means2) ** 2.0) @@ -52,8 +66,9 @@ def frechet_distance_exact( def kl_divergence_gaussian_exact( means1: NDArray, cov1: NDArray, means2: NDArray, cov2: NDArray ) -> float: - """ - Exact Kullback-Leibler divergence computed between two multivariate normal distributions + """Compute the exact Kullback-Leibler divergence. + + This is computed between two multivariate normal distributions Based on https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence Parameters @@ -66,10 +81,12 @@ def kl_divergence_gaussian_exact( Mean of the second distribution cov2: NDArray Covariance matrx of the second distribution + Returns ------- float Kulback-Leibler divergence + """ n_variables = len(means1) L1, _ = scipy.linalg.cho_factor(cov1) @@ -81,3 +98,53 @@ def kl_divergence_gaussian_exact( term_diag_L = 2 * np.sum(np.log(np.diagonal(L2) / np.diagonal(L1))) div_kl = 0.5 * (norm_M - n_variables + norm_y + term_diag_L) return div_kl + + +def svdtriplet(X, row_w=None, ncp=np.inf): + """Perform weighted SVD on matrix X with row weights. + + Parameters + ---------- + X : ndarray + Data matrix of shape (n_samples, n_features). + row_w : array-like, optional + Row weights. If None, uniform weights are assumed. Default is None. + ncp : int + Number of principal components to retain. Default is infinity. + + Returns + ------- + s : ndarray + Singular values. 
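# Illustrative sketch, not part of the patch: the core formula documented above for
# frechet_distance_exact, evaluated on two hypothetical Gaussians. The function in
# this patch additionally rescales/centers the inputs and normalises the result;
# this only evaluates || mu_1 - mu_2 ||_2^2 + Tr(S1 + S2 - 2 (S1 . S2)^(1/2)).
import numpy as np
from scipy import linalg

means1, cov1 = np.array([0.0, 0.0]), np.eye(2)
means2, cov2 = np.array([1.0, 0.0]), np.array([[2.0, 0.3], [0.3, 1.0]])

ssdiff = np.sum((means1 - means2) ** 2)
covmean = linalg.sqrtm(cov1 @ cov2)
dist = ssdiff + np.trace(cov1 + cov2 - 2 * np.real(covmean))
print(dist)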
+ U : ndarray + Left singular vectors. + V : ndarray + Right singular vectors. + + """ + if not isinstance(X, np.ndarray): + X = np.array(X, dtype=float) + else: + X = X.astype(float) + if row_w is None: + row_w = np.ones(X.shape[0]) / X.shape[0] + else: + row_w = np.array(row_w, dtype=float) + row_w /= row_w.sum() + ncp = int(min(ncp, X.shape[0] - 1, X.shape[1])) + # Apply weights to rows + X_weighted = X * np.sqrt(row_w[:, None]) + # Perform SVD + U, s, Vt = np.linalg.svd(X_weighted, full_matrices=False) + V = Vt.T + U = U[:, :ncp] + V = V[:, :ncp] + s = s[:ncp] + # Adjust signs to ensure consistency + mult = np.sign(np.sum(V, axis=0)) + mult[mult == 0] = 1 + U *= mult + V *= mult + # Rescale U by the square root of row weights + U /= np.sqrt(row_w[:, None]) + return s, U, V diff --git a/qolmat/utils/data.py b/qolmat/utils/data.py index 2adecf4e..0ea1823b 100644 --- a/qolmat/utils/data.py +++ b/qolmat/utils/data.py @@ -1,9 +1,11 @@ +"""Utils data for qolmat package.""" + import os import sys import zipfile from datetime import datetime from math import pi -from typing import List, Tuple, Union +from typing import Dict, List, Tuple, Union from urllib import request import numpy as np @@ -16,28 +18,33 @@ def read_csv_local(data_file_name: str, **kwargs) -> pd.DataFrame: - """Load csv files + """Load csv files. Parameters ---------- data_file_name : str - Filename. Has to be "beijing" or "conductors" - kwargs : dict + Filename. Has to be "beijing" or "conductors". + **kwargs : dict, optional + Additional keyword arguments passed to `pandas.read_csv`. Returns ------- df : pd.DataFrame dataframe + """ - df = pd.read_csv(os.path.join(ROOT_DIR, "data", f"{data_file_name}.csv"), **kwargs) + df = pd.read_csv( + os.path.join(ROOT_DIR, "data", f"{data_file_name}.csv"), **kwargs + ) return df def download_data_from_zip( zipname: str, urllink: str, datapath: str = "data/" ) -> List[pd.DataFrame]: - """ - Downloads and extracts ZIP files from a URL, then loads DataFrames from CSV files. + """Download and extracts ZIP files from a URL. + + It also loads DataFrames from CSV files. Parameters ---------- @@ -52,7 +59,9 @@ def download_data_from_zip( Returns ------- List[pd.DataFrame] - A list of DataFrames loaded from the CSV files within the extracted directory. + A list of DataFrames loaded from the CSV files + within the extracted directory. + """ path_zip = os.path.join(datapath, zipname) path_zip_ext = path_zip + ".zip" @@ -68,9 +77,11 @@ def download_data_from_zip( def get_dataframes_in_folder(path: str, extension: str) -> List[pd.DataFrame]: - """ - Loads all dataframes from files with a specified extension within a directory, including - subdirectories. Special handling for '.tsf' files which are converted and immediately returned. + """Load all dataframes from files. + + Loads all files with a specified extension within a directory, including + subdirectories. Special handling for '.tsf' files which are converted + and immediately returned. Parameters ---------- @@ -82,8 +93,10 @@ def get_dataframes_in_folder(path: str, extension: str) -> List[pd.DataFrame]: Returns ------- List[pd.DataFrame] - A list of pandas DataFrames loaded from the files matching the extension. - If a '.tsf' file is found, its converted DataFrame is returned immediately. + A list of pandas DataFrames loaded from the files + matching the extension. If a '.tsf' file is found, + its converted DataFrame is returned immediately. 
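# Illustrative sketch, not part of the patch: calling svdtriplet (added above) with
# uniform row weights and checking that, at full rank, (U * s) @ V.T reproduces X.
# The import path is the one introduced by this patch.
import numpy as np

from qolmat.utils.algebra import svdtriplet

X = np.array(
    [[1.0, 2.0, 0.0], [0.0, 1.0, 1.0], [2.0, 0.0, 1.0], [1.0, 1.0, 1.0]]
)
s, U, V = svdtriplet(X)  # row_w defaults to uniform weights
X_rec = (U * s) @ V.T  # undoes the row weighting: should match X
print(np.allclose(X_rec, X))  # expected: True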
+ """ list_df = [] for folder, _, files in os.walk(path): @@ -91,7 +104,9 @@ def get_dataframes_in_folder(path: str, extension: str) -> List[pd.DataFrame]: if extension in file: list_df.append(pd.read_csv(os.path.join(folder, file))) if ".tsf" in file: - loaded_data = convert_tsf_to_dataframe(os.path.join(folder, file)) + loaded_data = convert_tsf_to_dataframe( + os.path.join(folder, file) + ) return [loaded_data] return list_df @@ -103,8 +118,7 @@ def generate_artificial_ts( ratio_anomalies: float, amp_noise: float, ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: - """ - Generates time series data, anomalies, and noise based on given parameters. + """Generate TS data, anomalies, and noise based on given parameters. Parameters ---------- @@ -125,8 +139,8 @@ def generate_artificial_ts( Time series data with sine waves (X). Anomaly data with specified amplitudes at random positions (A). Gaussian noise added to the time series (E). - """ + """ mesh = np.arange(n_samples) X = np.ones(n_samples) for p in periods: @@ -135,7 +149,9 @@ def generate_artificial_ts( n_anomalies = int(n_samples * ratio_anomalies) anomalies = np.random.standard_exponential(size=n_anomalies) anomalies *= amp_anomalies * np.random.choice([-1, 1], size=n_anomalies) - ind_anomalies = np.random.choice(range(n_samples), size=n_anomalies, replace=False) + ind_anomalies = np.random.choice( + range(n_samples), size=n_anomalies, replace=False + ) A = np.zeros(n_samples) A[ind_anomalies] = anomalies @@ -148,21 +164,23 @@ def get_data( datapath: str = "data/", n_groups_max: int = sys.maxsize, ) -> pd.DataFrame: - """ - Download or generate data + """Download or generate data. Parameters ---------- + name_data: str, optional + name of the file, by default "Beijing" datapath : str, optional data path, by default "data/" - download : bool, optional - if True: download a public dataset, if False: generate random univariate time series, by - default True + n_groups_max : int, optional + max number of groups, by default sys.maxsize. 
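# Illustrative sketch, not part of the patch: calling generate_artificial_ts with
# hypothetical parameters and recombining its three outputs the way get_data does
# below (signal = X + A + E). The values are arbitrary.
from qolmat.utils.data import generate_artificial_ts

X, A, E = generate_artificial_ts(
    n_samples=1000,
    periods=[10, 100],
    amp_anomalies=0.5,
    ratio_anomalies=0.05,
    amp_noise=0.1,
)
signal = X + A + E  # sine components + sparse anomalies + Gaussian noise
print(signal.shape)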
+ Only used if name_data == "SNCF" Returns ------- pd.DataFrame requested data + """ url_zenodo = "https://zenodo.org/record/" if name_data == "Beijing": @@ -175,10 +193,13 @@ def get_data( df = read_csv_local("conductors") return df elif name_data == "Titanic": - path = "https://gist.githubusercontent.com/fyyying/4aa5b471860321d7b47fd881898162b7/raw/" + path = "https://gist.githubusercontent.com/" + "fyyying/4aa5b471860321d7b47fd881898162b7/raw/" "6907bb3a38bfbb6fccf3a8b1edfb90e39714d14f/titanic_dataset.csv" df = pd.read_csv(path) - df = df[["Survived", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]] + df = df[ + ["Survived", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"] + ] df["Age"] = pd.to_numeric(df["Age"], errors="coerce") df["Fare"] = pd.to_numeric(df["Fare"], errors="coerce") return df @@ -194,7 +215,9 @@ def get_data( n_samples, periods, amp_anomalies, ratio_anomalies, amp_noise ) signal = X + A + E - df = pd.DataFrame({"signal": signal, "index": range(n_samples), "station": city}) + df = pd.DataFrame( + {"signal": signal, "index": range(n_samples), "station": city} + ) df.set_index(["station", "index"], inplace=True) df["X"] = X @@ -206,7 +229,9 @@ def get_data( df = pd.read_parquet(path_file) sizes_stations = df.groupby("station")["val_in"].mean().sort_values() n_groups_max = min(len(sizes_stations), n_groups_max) - stations = sizes_stations.index.get_level_values("station").unique()[-n_groups_max:] + stations = sizes_stations.index.get_level_values("station").unique()[ + -n_groups_max: + ] df = df.loc[stations] return df elif name_data == "Beijing_online": @@ -227,20 +252,30 @@ def get_data( df = pd.read_csv(csv_url, index_col=0) return df elif name_data == "Monach_weather": - urllink = os.path.join(url_zenodo, "4654822/files/weather_dataset.zip?download=1") + urllink = os.path.join( + url_zenodo, "4654822/files/weather_dataset.zip?download=1" + ) zipname = "weather_dataset" - list_loaded_data = download_data_from_zip(zipname, urllink, datapath=datapath) + list_loaded_data = download_data_from_zip( + zipname, urllink, datapath=datapath + ) loaded_data = list_loaded_data[0] df_list: List[pd.DataFrame] = [] for k in range(len(loaded_data)): values = list(loaded_data["series_value"][k]) freq = "1D" time_index = pd.date_range( - start=pd.Timestamp("01/01/2010"), periods=len(values), freq=freq + start=pd.Timestamp("01/01/2010"), + periods=len(values), + freq=freq, ) df_list = df_list + [ pd.DataFrame( - {loaded_data.series_name[k] + " " + loaded_data.series_type[k]: values}, + { + loaded_data.series_name[k] + + " " + + loaded_data.series_type[k]: values + }, index=time_index, ) ] @@ -254,18 +289,26 @@ def get_data( "4659727/files/australian_electricity_demand_dataset.zip?download=1", ) zipname = "australian_electricity_demand_dataset" - list_loaded_data = download_data_from_zip(zipname, urllink, datapath=datapath) + list_loaded_data = download_data_from_zip( + zipname, urllink, datapath=datapath + ) loaded_data = list_loaded_data[0] df_list = [] for k in range(len(loaded_data)): values = list(loaded_data["series_value"][k]) freq = "30min" time_index = pd.date_range( - start=loaded_data.start_timestamp[k], periods=len(values), freq=freq + start=loaded_data.start_timestamp[k], + periods=len(values), + freq=freq, ) df_list = df_list + [ pd.DataFrame( - {loaded_data.series_name[k] + " " + loaded_data.state[k]: values}, + { + loaded_data.series_name[k] + + " " + + loaded_data.state[k]: values + }, index=time_index, ) ] @@ -278,7 +321,7 @@ def get_data( def 
preprocess_data_beijing(df: pd.DataFrame) -> pd.DataFrame: - """Preprocess data from the "Beijing" datset + """Preprocess data from the "Beijing" datset. Parameters ---------- @@ -289,25 +332,39 @@ def preprocess_data_beijing(df: pd.DataFrame) -> pd.DataFrame: ------- pd.DataFrame preprocessed dataframe + """ df["datetime"] = pd.to_datetime(df[["year", "month", "day", "hour"]]) df["station"] = "Beijing" df.set_index(["station", "datetime"], inplace=True) df.drop( - columns=["year", "month", "day", "hour", "No", "cbwd", "Iws", "Is", "Ir"], + columns=[ + "year", + "month", + "day", + "hour", + "No", + "cbwd", + "Iws", + "Is", + "Ir", + ], inplace=True, ) df.sort_index(inplace=True) df = df.groupby( - ["station", df.index.get_level_values("datetime").floor("d")], group_keys=False + ["station", df.index.get_level_values("datetime").floor("d")], + group_keys=False, ).mean() return df -def add_holes(df: pd.DataFrame, ratio_masked: float, mean_size: int) -> pd.DataFrame: - """ - Creates holes in a dataset with no missing value, starting from `df`. Only used in the - documentation to design examples. +def add_holes( + df: pd.DataFrame, ratio_masked: float, mean_size: int +) -> pd.DataFrame: + """Create holes in a dataset with no missing value, starting from `df`. + + Only used in the documentation to design examples. Parameters ---------- @@ -319,10 +376,12 @@ def add_holes(df: pd.DataFrame, ratio_masked: float, mean_size: int) -> pd.DataF ratio_masked : float Targeted global proportion of nans added in the returned dataset + Returns ------- pd.DataFrame dataframe with missing values + """ groups = df.index.names.difference(["datetime", "date", "index", None]) if groups != []: @@ -334,10 +393,16 @@ def add_holes(df: pd.DataFrame, ratio_masked: float, mean_size: int) -> pd.DataF 1, ratio_masked=ratio_masked, subset=df.columns ) - generator.dict_probas_out = {column: 1 / mean_size for column in df.columns} - generator.dict_ratios = {column: 1 / len(df.columns) for column in df.columns} + generator.dict_probas_out = { + column: 1 / mean_size for column in df.columns + } + generator.dict_ratios = { + column: 1 / len(df.columns) for column in df.columns + } if generator.groups: - mask = df.groupby(groups, group_keys=False).apply(generator.generate_mask) + mask = df.groupby(groups, group_keys=False).apply( + generator.generate_mask + ) else: mask = generator.generate_mask(df) @@ -351,8 +416,10 @@ def get_data_corrupted( mean_size: int = 90, ratio_masked: float = 0.2, ) -> pd.DataFrame: - """ - Returns a dataframe with controled corruption optained from the source `name_data` + """Corrupt data. + + Return a dataframe with controlled corruption obtained + from the source `name_data`. Parameters ---------- @@ -362,10 +429,12 @@ def get_data_corrupted( Mean size of the holes to be generated using a geometric law ratio_masked: float Percent of missing data in each column in the output dataframe + Returns ------- pd.DataFrame Dataframe with missing values + """ df = get_data(name_data) df = add_holes(df, mean_size=mean_size, ratio_masked=ratio_masked) @@ -373,8 +442,7 @@ def get_data_corrupted( def add_station_features(df: pd.DataFrame) -> pd.DataFrame: - """ - Create a station feature in the dataset + """Create a station feature in the dataset. 
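# Illustrative sketch, not part of the patch: punching controlled holes into a
# small, fully observed, purely numeric frame with add_holes (defined above).
# The sizes and ratios are arbitrary; the exact hole pattern is random.
import numpy as np
import pandas as pd

from qolmat.utils.data import add_holes

df_full = pd.DataFrame(
    {"a": np.sin(np.arange(200) / 5.0), "b": np.arange(200, dtype=float)}
)
df_holes = add_holes(df_full, ratio_masked=0.2, mean_size=3)
print(df_holes.isna().mean())  # roughly 20% of each column is now missing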
Parameters ---------- @@ -385,6 +453,7 @@ def add_station_features(df: pd.DataFrame) -> pd.DataFrame: ------- pd.DataFrame dataframe with missing values + """ df = df.copy() stations = df.index.get_level_values("station") @@ -393,9 +462,10 @@ def add_station_features(df: pd.DataFrame) -> pd.DataFrame: return df -def add_datetime_features(df: pd.DataFrame, col_time: str = "datetime") -> pd.DataFrame: - """ - Create a seasonal feature in the dataset with a cosine function +def add_datetime_features( + df: pd.DataFrame, col_time: str = "datetime" +) -> pd.DataFrame: + """Create a seasonal feature in the dataset with a cosine function. Parameters ---------- @@ -408,11 +478,14 @@ def add_datetime_features(df: pd.DataFrame, col_time: str = "datetime") -> pd.Da ------- pd.DataFrame dataframe with missing values + """ df = df.copy() time = df.index.get_level_values(col_time).to_series() days_in_year = time.dt.year.apply( - lambda x: 366 if ((x % 4 == 0) and (x % 100 != 0)) or (x % 400 == 0) else 365 + lambda x: 366 + if ((x % 4 == 0) and (x % 100 != 0)) or (x % 400 == 0) + else 365 ) ratio = time.dt.dayofyear.values / days_in_year.values df["time_cos"] = np.cos(2 * np.pi * ratio) @@ -421,13 +494,30 @@ def add_datetime_features(df: pd.DataFrame, col_time: str = "datetime") -> pd.Da def convert_tsf_to_dataframe( - full_file_path_and_name, - replace_missing_vals_with="NaN", - value_column_name="series_value", + full_file_path_and_name: str, + replace_missing_vals_with: Union[str, float, int] = "NaN", + value_column_name: str = "series_value", ): + """Convert a .tsf file to a dataframe. + + Parameters + ---------- + full_file_path_and_name : str + Filename + replace_missing_vals_with : Union[str, float, int], optional + Replace missing values with, by default "NaN" + value_column_name : str, optional + Name of the column containing the values, by default "series_value" + + Returns + ------- + _type_ + _description_ + + """ col_names = [] col_types = [] - all_data = {} + all_data: Dict[str, List] = {} line_count = 0 found_data_tag = False found_data_section = False @@ -443,21 +533,29 @@ def convert_tsf_to_dataframe( line_content = line.split(" ") if line.startswith("@attribute"): if len(line_content) != 3: - raise Exception("Invalid meta-data specification.") + raise Exception( + "Invalid meta-data specification." + ) col_names.append(line_content[1]) col_types.append(line_content[2]) else: if len(line_content) != 2: - raise Exception("Invalid meta-data specification.") + raise Exception( + "Invalid meta-data specification." + ) else: if len(col_names) == 0: - raise Exception("Attribute section must come before data.") + raise Exception( + "Attribute section must come before data." + ) found_data_tag = True elif not line.startswith("#"): if len(col_names) == 0: - raise Exception(" Attribute section must come before data.") + raise Exception( + " Attribute section must come before data." + ) elif not found_data_tag: raise Exception("Missing @data tag.") else: @@ -472,25 +570,35 @@ def convert_tsf_to_dataframe( full_info = line.split(":") if len(full_info) != (len(col_names) + 1): - raise Exception("Missing attributes/values in series.") + raise Exception( + "Missing attributes/values in series." + ) series = full_info[len(full_info) - 1] - series = series.split(",") + series = series.split(",") # type: ignore if len(series) == 0: - raise Exception(" Missing values should be indicated with ? symbol") + raise Exception( + " Missing values should be indicated " + "with ? 
symbol" + ) numeric_series = [] for val in series: if val == "?": - numeric_series.append(replace_missing_vals_with) + numeric_series.append( + replace_missing_vals_with + ) else: - numeric_series.append(float(val)) + numeric_series.append(float(val)) # type: ignore - if numeric_series.count(replace_missing_vals_with) == len(numeric_series): + if numeric_series.count( + replace_missing_vals_with + ) == len(numeric_series): raise Exception( - "At least one numeric value should be there in a series." + "At least one numeric value should be " + "there in a series." ) all_series.append(pd.Series(numeric_series).array) @@ -500,9 +608,12 @@ def convert_tsf_to_dataframe( if col_types[i] == "numeric": att_val = int(full_info[i]) elif col_types[i] == "string": - att_val = str(full_info[i]) + att_val = str(full_info[i]) # type: ignore elif col_types[i] == "date": - att_val = datetime.strptime(full_info[i], "%Y-%m-%d %H-%M-%S") + att_val = datetime.strptime( + full_info[i], + "%Y-%m-%d %H-%M-%S", # type: ignore + ) else: raise Exception("Invalid attribute type.") diff --git a/qolmat/utils/exceptions.py b/qolmat/utils/exceptions.py index 513e843b..baddfb38 100644 --- a/qolmat/utils/exceptions.py +++ b/qolmat/utils/exceptions.py @@ -1,7 +1,11 @@ +"""Exceptions for qolmat package.""" + from typing import Any, List, Tuple, Type class PyTorchExtraNotInstalled(Exception): + """Raise when pytorch extra is not installed.""" + def __init__(self): super().__init__( """Please install torch xx.xx.xx @@ -10,6 +14,8 @@ def __init__(self): class SignalTooShort(Exception): + """Raise when the signal is too short.""" + def __init__(self, period: int, n_cols: int): super().__init__( f"""`period` must be smaller than the signals duration. @@ -18,6 +24,8 @@ def __init__(self, period: int, n_cols: int): class NoMissingValue(Exception): + """Raise an error when there is no missing value.""" + def __init__(self, subset_without_nans: List[str]): super().__init__( f"No missing value in the columns {subset_without_nans}! " @@ -26,47 +34,78 @@ def __init__(self, subset_without_nans: List[str]): class SubsetIsAString(Exception): + """Raise an error when the subset is a string.""" + def __init__(self, subset: Any): - super().__init__(f"Provided subset `{subset}` should be None or a list!") + super().__init__( + f"Provided subset `{subset}` should be None or a list!" + ) class NotDimension2(Exception): + """Raise an error when the matrix is not of dim 2.""" + def __init__(self, shape: Tuple[int, ...]): - super().__init__(f"Provided matrix is of shape {shape}, which is not of dimension 2!") + super().__init__( + f"Provided matrix is of shape {shape}, " + "which is not of dimension 2!" + ) class NotDataFrame(Exception): + """Raise an error when the input is not a dataframe.""" + def __init__(self, X_type: Type[Any]): - super().__init__(f"Input musr be a dataframe, not a {X_type}") + super().__init__(f"Input must be a dataframe, not a {X_type}") class NotEnoughSamples(Exception): + """Raise an error when there is no not enough samples.""" + def __init__(self, max_num_row: int, min_n_rows: int): super().__init__( - f"Not enough valid patterns found. Largest found pattern has {max_num_row} rows, when " + f"Not enough valid patterns found. " + f"Largest found pattern has {max_num_row} rows, when " f"they should have at least min_n_rows={min_n_rows}." 
) class EstimatorNotDefined(Exception): + """Raise an error when the estimator is not defined.""" + def __init__(self): - super().__init__("The underlying estimator should be defined beforehand!") + super().__init__( + "The underlying estimator should be defined beforehand!" + ) class SingleSample(Exception): + """Raise an error when there is a single sample.""" + def __init__(self): - super().__init__("""This imputer cannot be fitted on a single sample!""") + super().__init__( + """This imputer cannot be fitted on a single sample!""" + ) class IllConditioned(Exception): + """Raise an error when the covariance matrix is ill-conditioned.""" + def __init__(self, min_sv: float, min_std: float): super().__init__( - f"The covariance matrix is ill-conditioned, indicating high-colinearity: the smallest " - f"singular value of the data matrix is smaller than the threshold min_std ({min_sv} < " - f"{min_std}). Consider removing columns of decreasing the threshold." + f"The covariance matrix is ill-conditioned, " + "indicating high-colinearity: " + "the smallest singular value of the data matrix is smaller " + f"than the threshold min_std ({min_sv} < {min_std}). " + f"Consider removing columns of decreasing the threshold." ) class TypeNotHandled(Exception): + """Raise an error when the type is not handled.""" + def __init__(self, col: str, type_col: str): - super().__init__(f"The column `{col}` is of type `{type_col}`, which is not handled!") + super().__init__( + f"The column `{col}` is of type `{type_col}`, " + "which is not handled!" + ) diff --git a/qolmat/utils/plot.py b/qolmat/utils/plot.py index c6700e13..e9809425 100644 --- a/qolmat/utils/plot.py +++ b/qolmat/utils/plot.py @@ -1,18 +1,17 @@ -""" -Useful drawing functions -""" +"""Useful drawing functions.""" from __future__ import annotations -from typing import Dict, List, Any, Optional, Tuple, Union + +from typing import Any, Dict, List, Optional, Tuple, Union import matplotlib as mpl import matplotlib.pyplot as plt import matplotlib.ticker as plticker import numpy as np -from numpy.typing import NDArray import pandas as pd import scipy from mpl_toolkits.axes_grid1 import make_axes_locatable +from numpy.typing import NDArray plt.rcParams["axes.spines.right"] = False plt.rcParams["axes.spines.top"] = False @@ -23,18 +22,20 @@ tab10 = plt.get_cmap("tab10") -def plot_matrices(list_matrices: List[np.ndarray], title: Optional[str] = None) -> None: - """Plot RPCA matrices +def plot_matrices( + list_matrices: List[np.ndarray], title: Optional[str] = None +) -> None: + """Plot RPCA matrices. Parameters ---------- list_matrices : List[np.ndarray] - List containing, in the right order, the observations matrix, the low-rank matrix and the - sparse matrix + List containing, in the right order, the observations matrix, + the low-rank matrix and the sparse matrix title : Optional[str], optional if present, title of the saved figure, by default None - """ + """ suptitles = ["Observations", "Low-rank", "Sparse"] fig, ax = plt.subplots(1, 3, figsize=(10, 3)) @@ -62,21 +63,21 @@ def plot_signal( ylabel: Optional[str] = None, dates: Optional[List] = None, ) -> None: - """Plot RPCA results for time series + """Plot RPCA results for time series. 
Parameters ---------- list_signals : List[List] - List containing, in the right order, the observed time series, the cleaned signal and - the anomalies + List containing, in the right order, the observed time series, + the cleaned signal and the anomalies title : Optional[str], optional if present, title of the saved figure, by default None ylabel : Optional[str], optional ylabel, by default None dates : Optional[List], optional dates of the time series (xlabel), by default None - """ + """ suptitles = ["Observations", "Cleaned", "Anomalies"] colors = ["black", "darkblue", "crimson"] fontsize = 15 @@ -106,7 +107,7 @@ def plot_images( dims: Tuple[int, int], filename: Optional[str] = None, ) -> None: - """Plot multiple images in 3 columns for original, background and "foreground" + """Plot multiple images for original, background and "foreground". Parameters ---------- @@ -122,8 +123,8 @@ def plot_images( dimensions of the reduction filename : Optional[str], optional filename for saving figure, by default None - """ + """ f = plt.figure(figsize=(15, 10)) r = len(index_array) @@ -163,8 +164,7 @@ def make_ellipses( n_std: float = 2, color: Union[str, Any, Tuple[float, float, float]] = "None", ): - """ - Create a plot of the covariance confidence ellipse of *x* and *y*. + """Create a plot of the covariance confidence ellipse of *x* and *y*. Parameters ---------- @@ -186,16 +186,21 @@ def make_ellipses( Returns ------- matplotlib.patches.Ellipse - """ + """ pearson = cov[0, 1] / np.sqrt(cov[0, 0] * cov[1, 1]) ell_radius_x = np.sqrt(1 + pearson) * 2.5 ell_radius_y = np.sqrt(1 - pearson) * 2.5 - ell = mpl.patches.Ellipse((0, 0), width=ell_radius_x, height=ell_radius_y, facecolor=color) + ell = mpl.patches.Ellipse( + (0, 0), width=ell_radius_x, height=ell_radius_y, facecolor=color + ) scale_x = np.sqrt(cov[0, 0]) * n_std scale_y = np.sqrt(cov[1, 1]) * n_std transf = ( - mpl.transforms.Affine2D().rotate_deg(45).scale(scale_x, scale_y).translate(mean_x, mean_y) + mpl.transforms.Affine2D() + .rotate_deg(45) + .scale(scale_x, scale_y) + .translate(mean_x, mean_y) ) ell.set_transform(transf + ax.transData) ax.add_patch(ell) @@ -211,8 +216,7 @@ def make_ellipses_from_data( n_std: float = 2, color: Union[str, Any, Tuple[float, float, float]] = "None", ): - """ - Create a plot of the covariance confidence ellipse of *x* and *y*. + """Create a plot of the covariance confidence ellipse of *x* and *y*. Parameters ---------- @@ -231,6 +235,7 @@ def make_ellipses_from_data( Returns ------- matplotlib.patches.Ellipse + """ if x.size != y.size: raise ValueError("x and y must be the same size") @@ -248,10 +253,14 @@ def compare_covariances( col_y: str, ax: mpl.axes.Axes, label: str = "", - color: Union[None, str, Tuple[float, float, float], Tuple[float, float, float, float]] = None, + color: Union[ + None, + str, + Tuple[float, float, float], + Tuple[float, float, float, float], + ] = None, ): - """ - Covariance plot: scatter plot with ellipses + """Covariance plot: scatter plot with ellipses. 
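# Illustrative sketch, not part of the patch: the ellipse geometry that
# make_ellipses (above) derives from a 2x2 covariance matrix, evaluated with
# NumPy only on a hypothetical covariance.
import numpy as np

cov = np.array([[2.0, 0.6], [0.6, 1.0]])
n_std = 2
pearson = cov[0, 1] / np.sqrt(cov[0, 0] * cov[1, 1])
radius_x = np.sqrt(1 + pearson) * 2.5  # unit-ellipse radii before scaling
radius_y = np.sqrt(1 - pearson) * 2.5
scale_x = np.sqrt(cov[0, 0]) * n_std  # stretch applied after the 45 deg rotation
scale_y = np.sqrt(cov[1, 1]) * n_std
print(pearson, radius_x, radius_y, scale_x, scale_y)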
Parameters ---------- @@ -265,12 +274,26 @@ def compare_covariances( variable y, column's name of dataframe df2 to compare with ax : matplotlib.axes._subplots.AxesSubplot matplotlib ax handles + label: str + label of the plot + color: Union[None, str, Tuple[float, float, float], + Tuple[float, float, float, float]] + color of the ellipse + """ df1 = df_1.dropna() df2 = df_2.dropna() if color is None: color = tab10(0) - ax.scatter(df2[col_x], df2[col_y], marker=".", color=color, s=2, alpha=0.7, label="imputed") + ax.scatter( + df2[col_x], + df2[col_y], + marker=".", + color=color, + s=2, + alpha=0.7, + label="imputed", + ) ax.scatter( df1[col_x], df1[col_y], @@ -293,7 +316,9 @@ def multibar( colors: Any = None, decimals: float = 0, ): - """Create a multi-bar graph to represent the values of the different dataframe columns. + """Create a multi-bar graph. + + It represents the values of the different dataframe columns. Parameters ---------- @@ -307,8 +332,8 @@ def multibar( color in multibar plot, by default None decimals : float, optional the decimals numbers, by default 0 - """ + """ if ax is None: ax = plt.gca() if colors is None: @@ -346,8 +371,10 @@ def multibar( plt.legend(loc=(1, 0)) -def plot_imputations(df: pd.DataFrame, dict_df_imputed: Dict[str, pd.DataFrame]): - """Plot original and imputed dataframes for each imputers +def plot_imputations( + df: pd.DataFrame, dict_df_imputed: Dict[str, pd.DataFrame] +): + """Plot original and imputed dataframes for each imputers. Parameters ---------- @@ -355,6 +382,7 @@ def plot_imputations(df: pd.DataFrame, dict_df_imputed: Dict[str, pd.DataFrame]) original dataframe dict_df_imputed : Dict[str, pd.DataFrame] dictionnary of imputed dataframe for each imputers + """ n_columns = len(df.columns) n_imputers = len(dict_df_imputed) @@ -369,7 +397,9 @@ def plot_imputations(df: pd.DataFrame, dict_df_imputed: Dict[str, pd.DataFrame]) plt.plot(values_orig, ".", color="black", label="original") values_imp = df_imputed[col].copy() values_imp[values_orig.notna()] = np.nan - plt.plot(values_imp, ".", color=tab10(0), label=name_imputer, alpha=1) + plt.plot( + values_imp, ".", color=tab10(0), label=name_imputer, alpha=1 + ) plt.ylabel(col, fontsize=16) if i_plot % n_columns == 0: plt.legend(loc=[1, 0], fontsize=18) diff --git a/qolmat/utils/utils.py b/qolmat/utils/utils.py index ce8f7865..3f445e7f 100644 --- a/qolmat/utils/utils.py +++ b/qolmat/utils/utils.py @@ -1,23 +1,24 @@ -from typing import List, Optional, Tuple, Union -import warnings +"""Utils for qolmat package.""" + +from typing import List, Tuple, Union import numpy as np import pandas as pd - from numpy.typing import NDArray from sklearn.base import check_array -from qolmat.utils.exceptions import NotDimension2, SignalTooShort +from qolmat.utils.exceptions import NotDimension2 HyperValue = Union[int, float, str] def _get_numerical_features(df1: pd.DataFrame) -> List[str]: - """Get numerical features from dataframe + """Get numerical features from dataframe. Parameters ---------- df1 : pd.DataFrame + Input dataframe. Returns ------- @@ -28,6 +29,7 @@ def _get_numerical_features(df1: pd.DataFrame) -> List[str]: ------ Exception No numerical feature is found + """ cols_numerical = df1.select_dtypes(include=np.number).columns.tolist() if len(cols_numerical) == 0: @@ -37,11 +39,12 @@ def _get_numerical_features(df1: pd.DataFrame) -> List[str]: def _get_categorical_features(df1: pd.DataFrame) -> List[str]: - """Get categorical features from dataframe + """Get categorical features from dataframe. 
Parameters ---------- df1 : pd.DataFrame + Input dataframe. Returns ------- @@ -52,10 +55,12 @@ def _get_categorical_features(df1: pd.DataFrame) -> List[str]: ------ Exception No categorical feature is found - """ + """ cols_numerical = df1.select_dtypes(include=np.number).columns.tolist() - cols_categorical = [col for col in df1.columns.to_list() if col not in cols_numerical] + cols_categorical = [ + col for col in df1.columns.to_list() if col not in cols_numerical + ] if len(cols_categorical) == 0: raise Exception("No categorical feature is found.") else: @@ -63,9 +68,10 @@ def _get_categorical_features(df1: pd.DataFrame) -> List[str]: def _validate_input(X: NDArray) -> pd.DataFrame: - """ - Checks that the input X can be converted into a DataFrame, and returns the corresponding - dataframe. + """Calidate the input array. + + Checks that the input X can be converted into a DataFrame, + and returns the corresponding dataframe. Parameters ---------- @@ -75,8 +81,9 @@ def _validate_input(X: NDArray) -> pd.DataFrame: Returns ------- pd.DataFrame - Formatted dataframe, if the input had no column names then the dataframe columns are - integers + Formatted dataframe, if the input had no column names + then the dataframe columns are integers + """ check_array(X, force_all_finite="allow-nan", dtype=None) if not isinstance(X, pd.DataFrame): @@ -85,7 +92,7 @@ def _validate_input(X: NDArray) -> pd.DataFrame: raise ValueError if len(X_np.shape) == 1: X_np = X_np.reshape(-1, 1) - df = pd.DataFrame(X_np, columns=[i for i in range(X_np.shape[1])]) + df = pd.DataFrame(X_np, columns=list(range(X_np.shape[1]))) df = df.infer_objects() else: df = X @@ -103,7 +110,7 @@ def progress_bar( length: int = 100, fill: str = "█", ): - """Call in a loop to create terminal progress bar + """Call in a loop to create terminal progress bar. Parameters ---------- @@ -121,8 +128,11 @@ def progress_bar( character length of bar, by default 100 fill : str bar fill character, by default "█" + """ - percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total))) + percent = ("{0:." + str(decimals) + "f}").format( + 100 * (iteration / float(total)) + ) filled_length = int(length * iteration // total) bar = fill * filled_length + "-" * (length - filled_length) print(f"\r{prefix} |{bar}| {percent}% {suffix}", end="\r") @@ -131,7 +141,7 @@ def progress_bar( def acf(values: pd.Series, lag_max: int = 30) -> pd.Series: - """Correlation series of dataseries + """Correlation series of dataseries. Parameters ---------- @@ -144,6 +154,7 @@ def acf(values: pd.Series, lag_max: int = 30) -> pd.Series: ------- pd.Series correlation series of value + """ acf = pd.Series(0, index=range(lag_max)) for lag in range(lag_max): @@ -152,8 +163,7 @@ def acf(values: pd.Series, lag_max: int = 30) -> pd.Series: def impute_nans(M: NDArray, method: str = "zeros") -> NDArray: - """ - Impute the M's nan with the specified method + """Impute the M's nan with the specified method. 
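# Illustrative sketch, not part of the patch: column-wise NaN imputation with
# impute_nans using the "median" strategy on a small made-up array.
import numpy as np

from qolmat.utils.utils import impute_nans

M = np.array([[1.0, np.nan], [3.0, 2.0], [np.nan, 4.0]])
M_imputed = impute_nans(M, method="median")
print(M_imputed)  # NaNs replaced by each column's median (2.0 and 3.0 here)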
Parameters ---------- @@ -166,6 +176,7 @@ def impute_nans(M: NDArray, method: str = "zeros") -> NDArray: ------- NDArray Imputed Array + Raises ------ ValueError @@ -180,9 +191,13 @@ def impute_nans(M: NDArray, method: str = "zeros") -> NDArray: isna = np.isnan(values) nna = np.sum(isna) if method == "mean": - value_imputation = np.nanmean(M) if nna == n_rows else np.nanmean(values) + value_imputation = ( + np.nanmean(M) if nna == n_rows else np.nanmean(values) + ) elif method == "median": - value_imputation = np.nanmedian(M) if nna == n_rows else np.nanmedian(values) + value_imputation = ( + np.nanmedian(M) if nna == n_rows else np.nanmedian(values) + ) elif method == "zeros": value_imputation = 0 else: @@ -193,8 +208,7 @@ def impute_nans(M: NDArray, method: str = "zeros") -> NDArray: def linear_interpolation(X: NDArray) -> NDArray: - """ - Impute missing data with a linear interpolation, column-wise + """Impute missing data with a linear interpolation, column-wise. Parameters ---------- @@ -205,6 +219,7 @@ def linear_interpolation(X: NDArray) -> NDArray: ------- X_interpolated : NDArray imputed array, by linear interpolation + """ n_rows, n_cols = X.shape indices = np.arange(n_rows) @@ -224,12 +239,12 @@ def linear_interpolation(X: NDArray) -> NDArray: def fold_signal(X: NDArray, period: int) -> NDArray: - """ - Reshape a time series into a 2D-array + """Reshape a time series into a 2D-array. Parameters ---------- X : NDArray + Input array to be reshaped. period : int Period used to fold the signal of the 2D-array @@ -242,6 +257,7 @@ def fold_signal(X: NDArray, period: int) -> NDArray: ------ ValueError if X is not a 1D array + """ if len(X.shape) != 2: raise NotDimension2(X.shape) @@ -257,8 +273,20 @@ def fold_signal(X: NDArray, period: int) -> NDArray: def prepare_data(X: NDArray, period: int = 1) -> NDArray: - """ - Transform signal to 2D-array in case of 1D-array. + """Reshape a time series into a 2D-array. + + Parameters + ---------- + X : NDArray + Input array to be reshaped. + period : int, optional + Period used to fold the signal. Defaults to 1. + + Returns + ------- + NDArray + Reshaped array. + """ if len(X.shape) == 1: X = X.reshape(-1, 1) @@ -267,27 +295,43 @@ def prepare_data(X: NDArray, period: int = 1) -> NDArray: return X_fold -def get_shape_original(M: NDArray, shape: tuple) -> NDArray: +def get_shape_original(M: NDArray, shape: Tuple[int, int]) -> NDArray: """Shapes an output matrix from the RPCA algorithm into the original shape. Parameters ---------- M : NDArray Matrix to reshape - X : NDArray - Matrix of the desired shape + shape : Tuple[int, int] + Desired shape Returns ------- NDArray Reshaped matrix + """ - size = np.prod(shape) + size: int = int(np.prod(shape)) M_flat = M.flatten()[:size] return M_flat.reshape(shape) def create_lag_matrices(X: NDArray, p: int) -> Tuple[NDArray, NDArray]: + """Create lag matrices for the VAR(p). + + Parameters + ---------- + X : NDArray + Input matrix + p : int + Number of lags + + Returns + ------- + Tuple[NDArray, NDArray] + Z and Y + + """ n_rows, _ = X.shape n_rows_new = n_rows - p list_X_lag = [np.ones((n_rows_new, 1))] @@ -301,8 +345,184 @@ def create_lag_matrices(X: NDArray, p: int) -> Tuple[NDArray, NDArray]: def nan_mean_cov(X: NDArray) -> Tuple[NDArray, NDArray]: + """Compute mean and covariance matrix. 
+ + Parameters + ---------- + X : NDArray + Input matrix + + Returns + ------- + Tuple[NDArray, NDArray] + Means and covariance matrix + + """ _, n_variables = X.shape means = np.nanmean(X, axis=0) cov = np.ma.cov(np.ma.masked_invalid(X), rowvar=False).data cov = cov.reshape(n_variables, n_variables) return means, cov + + +def moy_p(V, weights): + """Compute the weighted mean of a vector, ignoring NaNs. + + Parameters + ---------- + V : array-like + Input vector with possible NaN values. + weights : array-like + Weights corresponding to each element in V. + + Returns + ------- + float + Weighted mean of non-NaN elements. + + """ + mask = ~np.isnan(V) + total_weight = np.sum(weights[mask]) + if total_weight == 0: + return 0.0 # or use np.finfo(float).eps for a small positive value + return np.sum(V[mask] * weights[mask]) / total_weight + + +def tab_disjonctif_NA(df): + """Create a disjunctive (one-hot encoded). + + Parameters + ---------- + df : DataFrame + Input DataFrame with categorical and numeric variables. + + Returns + ------- + DataFrame + Disjunctive table with one-hot encoding. + + """ # noqa: E501 + df_encoded_list = [] + for col in df.columns: + if df[col].dtype.name == "category" or df[col].dtype == object: + df[col] = df[col].astype("category") + # Include '__MISSING__' as a category if not already present + if "__MISSING__" not in df[col].cat.categories: + df[col] = df[col].cat.add_categories(["__MISSING__"]) + # Fill missing values with '__MISSING__' + df[col] = df[col].fillna("__MISSING__") + # One-hot encode the categorical variable + encoded = pd.get_dummies( + df[col], + prefix=col, + prefix_sep="_", + dummy_na=False, + dtype=float, + ) + df_encoded_list.append(encoded) + else: + # Numeric column; keep as is + df_encoded_list.append(df[[col]]) + # Concatenate all encoded columns + df_encoded = pd.concat(df_encoded_list, axis=1) + return df_encoded + + +def tab_disjonctif_prop(df, seed=None): + """Perform probabilistic imputation for categorical columns using observed + value distributions, without creating a separate missing category. + + Parameters + ---------- + df : DataFrame + DataFrame with categorical columns to impute. + seed : int, optional + Random seed for reproducibility. Default is None. + + Returns + ------- + DataFrame + Disjunctive coded DataFrame with missing values probabilistically + imputed. 
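# Illustrative sketch, not part of the patch: moy_p (above) computes a weighted
# mean while skipping NaNs, so the NaN entry below is simply ignored.
import numpy as np

from qolmat.utils.utils import moy_p

values = np.array([1.0, np.nan, 3.0])
weights = np.array([0.25, 0.50, 0.25])
print(moy_p(values, weights))  # (1 * 0.25 + 3 * 0.25) / 0.5 = 2.0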
+ + """ # noqa: D205 + if seed is not None: + np.random.seed(seed) + df = df.copy() + df_encoded_list = [] + for col in df.columns: + if df[col].dtype.name == "category" or df[col].dtype == object: + # Ensure categories are strings + df[col] = df[col].cat.rename_categories( + df[col].cat.categories.astype(str) + ) + observed = df[col][df[col].notna()] + categories = df[col].cat.categories.tolist() + # Get observed frequencies + freqs = observed.value_counts(normalize=True) + # Impute missing values based on observed frequencies + missing_indices = df[col][df[col].isna()].index + if len(missing_indices) > 0: + imputed_values = np.random.choice( + freqs.index, size=len(missing_indices), p=freqs.values + ) + df.loc[missing_indices, col] = imputed_values + # One-hot encode without creating missing category + encoded = pd.get_dummies( + df[col], + prefix=col, + prefix_sep="_", + dummy_na=False, + dtype=float, + ) + col_names = [f"{col}_{cat}" for cat in categories] + encoded = encoded.reindex(columns=col_names, fill_value=0.0) + df_encoded_list.append(encoded) + else: + df_encoded_list.append(df[[col]]) + df_encoded = pd.concat(df_encoded_list, axis=1) + return df_encoded + + +def find_category(df_original, tab_disj): + """Reconstruct the original categorical variables from the disjunctive. + + Parameters + ---------- + df_original : DataFrame + Original DataFrame with categorical variables. + tab_disj : DataFrame + Disjunctive table after imputation. + + Returns + ------- + DataFrame + Reconstructed DataFrame with imputed categorical variables. + + """ + df_reconstructed = df_original.copy() + start_idx = 0 + for col in df_original.columns: + if ( + df_original[col].dtype.name == "category" + or df_original[col].dtype == object + ): # noqa: E501 + categories = df_original[col].cat.categories.tolist() + if "__MISSING__" in categories: + missing_cat_index = categories.index("__MISSING__") + else: + missing_cat_index = None + num_categories = len(categories) + sub_tab = tab_disj.iloc[:, start_idx : start_idx + num_categories] + if missing_cat_index is not None: + sub_tab.iloc[:, missing_cat_index] = -np.inf + # Find the category with the maximum value for each row + max_indices = sub_tab.values.argmax(axis=1) + df_reconstructed[col] = [categories[idx] for idx in max_indices] + # Replace '__MISSING__' back to NaN + df_reconstructed[col].replace("__MISSING__", np.nan, inplace=True) + start_idx += num_categories + else: + # For numeric variables, keep as is + start_idx += 1 # Increment start_idx by 1 for numeric columns + return df_reconstructed \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index 2cf67a1c..00000000 --- a/setup.py +++ /dev/null @@ -1,91 +0,0 @@ -import codecs - -from setuptools import find_packages, setup - -DISTNAME = "qolmat" -VERSION = "0.1.8" -DESCRIPTION = "A Python library for optimal data imputation." 
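# Illustrative sketch, not part of the patch: one-hot encoding with an explicit
# "__MISSING__" level via tab_disjonctif_NA, then mapping a disjunctive table back
# to labels with find_category (both defined above). In practice find_category is
# applied to an imputed disjunctive table; the data below are made up. Note that,
# as written, tab_disjonctif_NA modifies the categorical columns of `df` in place.
import numpy as np
import pandas as pd

from qolmat.utils.utils import find_category, tab_disjonctif_NA

df = pd.DataFrame(
    {"color": pd.Categorical(["red", np.nan, "blue"]), "size": [1.0, 2.0, 3.0]}
)
tab = tab_disjonctif_NA(df)
print(list(tab.columns))  # one 0/1 column per colour level plus "__MISSING__"
df_back = find_category(df, tab)
print(df_back)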
-LONG_DESCRIPTION_CONTENT_TYPE = "text/x-rst" -with codecs.open("README.rst", encoding="utf-8-sig") as f: - LONG_DESCRIPTION = f.read() - - -LICENSE = "new BSD" -MAINTAINER = "Julien ROUSSEL, Anh Khoa NGO HO, Charles-Henri PRAT, Guillaume SAËS" -MAINTAINER_EMAIL = ( - "jroussel@quantmetry.com, " - "akngoho@quantmetry.com, " - "chprat@quantmetry.com, " - "gsaes@quantmetry.com, " - "mabidi.quantmetry.com" -) -URL = "https://github.com/Quantmetry/qolmat" -DOWNLOAD_URL = "https://pypi.org/project/qolmat/#files" -PROJECT_URLS = { - "Bug Tracker": "https://github.com/Quantmetry/qolmat", - "Documentation": "https://qolmat.readthedocs.io/en/latest/", - "Source Code": "https://github.com/Quantmetry/qolmat", -} - -PYTHON_REQUIRES = ">=3.8" -PACKAGES = find_packages() -INSTALL_REQUIRES = [ - "category_encoders", - "dcor>=0.6", - "hyperopt", - "numpy>=1.21", - "packaging", - "pandas>=1.3", - "scikit-learn", - "scipy", - "statsmodels>=0.14", - "typing-extensions", -] -EXTRAS_REQUIRE = { - "tests": ["flake8", "mypy", "pandas", "pytest", "pytest-cov", "typed-ast"], - "docs": [ - "numpydoc", - "sphinx", - "sphinx-gallery", - "sphinx_rtd_theme", - ], - "pytorch": [ - "torch==2.0.1", - ], -} - -CLASSIFIERS = [ - "Intended Audience :: Science/Research", - "Intended Audience :: Developers", - "License :: OSI Approved", - "Topic :: Software Development", - "Topic :: Scientific/Engineering", - "Operating System :: Microsoft :: Windows", - "Operating System :: POSIX", - "Operating System :: Unix", - "Operating System :: MacOS", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", -] - -PACKAGE_DATA = {"qolmat": ["data/beijing.csv", "data/conductors.csv"]} - -setup( - name=DISTNAME, - version=VERSION, - license=LICENSE, - description=DESCRIPTION, - long_description=LONG_DESCRIPTION, - long_description_content_type=LONG_DESCRIPTION_CONTENT_TYPE, - url=URL, - download_url=DOWNLOAD_URL, - project_urls=PROJECT_URLS, - packages=PACKAGES, - python_requires=PYTHON_REQUIRES, - install_requires=INSTALL_REQUIRES, - extras_require=EXTRAS_REQUIRE, - classifiers=CLASSIFIERS, - zip_safe=False, - package_data=PACKAGE_DATA, -) diff --git a/tests/analysis/test_holes_characterization.py b/tests/analysis/test_holes_characterization.py index c794b94e..a77ecbb1 100644 --- a/tests/analysis/test_holes_characterization.py +++ b/tests/analysis/test_holes_characterization.py @@ -11,7 +11,9 @@ @pytest.fixture def mcar_df() -> pd.DataFrame: rng = np.random.default_rng(42) - matrix = rng.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200) + matrix = rng.multivariate_normal( + mean=[0, 0], cov=[[1, 0], [0, 1]], size=200 + ) df = pd.DataFrame(data=matrix, columns=["Column_1", "Column_2"]) hole_gen = UniformHoleGenerator( n_splits=1, random_state=42, subset=["Column_2"], ratio_masked=0.2 @@ -23,7 +25,9 @@ def mcar_df() -> pd.DataFrame: @pytest.fixture def mar_hm_df() -> pd.DataFrame: rng = np.random.default_rng(42) - matrix = rng.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200) + matrix = rng.multivariate_normal( + mean=[0, 0], cov=[[1, 0], [0, 1]], size=200 + ) quantile_95 = norm.ppf(0.975) df = pd.DataFrame(matrix, columns=["Column_1", "Column_2"]) @@ -37,7 +41,9 @@ def mar_hm_df() -> pd.DataFrame: @pytest.fixture def mar_hc_df() -> pd.DataFrame: rng = np.random.default_rng(42) - matrix = rng.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200) + matrix = rng.multivariate_normal( + mean=[0, 0], cov=[[1, 0], [0, 1]], size=200 + 
) quantile_95 = norm.ppf(0.975) df = pd.DataFrame(matrix, columns=["Column_1", "Column_2"]) @@ -49,7 +55,8 @@ def mar_hc_df() -> pd.DataFrame: @pytest.mark.parametrize( - "df_input, expected", [("mcar_df", True), ("mar_hm_df", False), ("mar_hc_df", True)] + "df_input, expected", + [("mcar_df", True), ("mar_hm_df", False), ("mar_hc_df", True)], ) def test_little_mcar_test(df_input: pd.DataFrame, expected: bool, request): mcar_test_little = LittleTest(random_state=42) diff --git a/tests/benchmark/test_comparator.py b/tests/benchmark/test_comparator.py index bddb29a3..02971bbb 100644 --- a/tests/benchmark/test_comparator.py +++ b/tests/benchmark/test_comparator.py @@ -1,8 +1,8 @@ -import pytest +from unittest.mock import MagicMock, patch + import numpy as np import pandas as pd -from unittest.mock import patch, MagicMock from qolmat.benchmark.comparator import Comparator generator_holes_mock = MagicMock() @@ -20,7 +20,9 @@ imputer_mock = MagicMock() expected_get_errors = pd.Series( [1.0, 1.0, 1.0, 1.0], - index=pd.MultiIndex.from_tuples([("mae", "A"), ("mae", "B"), ("mse", "A"), ("mse", "B")]), + index=pd.MultiIndex.from_tuples( + [("mae", "A"), ("mae", "B"), ("mse", "A"), ("mse", "B")] + ), ) @@ -28,10 +30,14 @@ def test_get_errors(mock_get_metric): df_origin = pd.DataFrame({"A": [1, np.nan, 3], "B": [np.nan, 5, 6]}) df_imputed = pd.DataFrame({"A": [1, 2, 4], "B": [4, 5, 7]}) - df_mask = pd.DataFrame({"A": [False, False, True], "B": [False, False, True]}) + df_mask = pd.DataFrame( + {"A": [False, False, True], "B": [False, False, True]} + ) - mock_get_metric.return_value = lambda df_origin, df_imputed, df_mask: pd.Series( - [1.0, 1.0], index=["A", "B"] + mock_get_metric.return_value = ( + lambda df_origin, df_imputed, df_mask: pd.Series( + [1.0, 1.0], index=["A", "B"] + ) ) errors = comparator.get_errors(df_origin, df_imputed, df_mask) pd.testing.assert_series_equal(errors, expected_get_errors) @@ -65,7 +71,10 @@ def test_compare(mock_evaluate_errors_sample): errors_imputer1 = pd.Series([0.1, 0.2], index=["mae", "mse"]) errors_imputer2 = pd.Series([0.3, 0.4], index=["mae", "mse"]) - mock_evaluate_errors_sample.side_effect = [errors_imputer1, errors_imputer2] + mock_evaluate_errors_sample.side_effect = [ + errors_imputer1, + errors_imputer2, + ] df_errors = comparator.compare(df_test) assert mock_evaluate_errors_sample.call_count == 2 diff --git a/tests/benchmark/test_hyperparameters.py b/tests/benchmark/test_hyperparameters.py index 5c6ff85a..cf5b567d 100644 --- a/tests/benchmark/test_hyperparameters.py +++ b/tests/benchmark/test_hyperparameters.py @@ -1,20 +1,24 @@ -from typing import Callable, Dict, List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Union +import hyperopt as ho import numpy as np import pandas as pd -import pytest from qolmat.benchmark import hyperparameters -from qolmat.benchmark.hyperparameters import HyperValue # from hyperparameters import HyperValue -from qolmat.benchmark.missing_patterns import _HoleGenerator, EmpiricalHoleGenerator -from qolmat.imputations.imputers import _Imputer, ImputerRpcaNoisy - -import hyperopt as ho +from qolmat.benchmark.missing_patterns import ( + EmpiricalHoleGenerator, + _HoleGenerator, +) +from qolmat.imputations.imputers import ImputerRpcaNoisy, _Imputer -df_origin = pd.DataFrame({"col1": [0, np.nan, 2, 4, np.nan], "col2": [-1, np.nan, 0.5, 1, 1.5]}) -df_imputed = pd.DataFrame({"col1": [0, 1, 2, 3.5, 4], "col2": [-1.5, 0, 1.5, 2, 1.5]}) +df_origin = pd.DataFrame( + {"col1": [0, np.nan, 2, 4, np.nan], "col2": [-1, 
np.nan, 0.5, 1, 1.5]} +) +df_imputed = pd.DataFrame( + {"col1": [0, 1, 2, 3.5, 4], "col2": [-1.5, 0, 1.5, 2, 1.5]} +) df_mask = pd.DataFrame( { "col1": [False, False, True, False, False], @@ -24,7 +28,9 @@ df_corrupted = df_origin.copy() df_corrupted[df_mask] = np.nan -imputer_rpca = ImputerRpcaNoisy(tau=2, random_state=42, columnwise=True, period=1) +imputer_rpca = ImputerRpcaNoisy( + tau=2, random_state=42, columnwise=True, period=1 +) dict_imputers_rpca = {"rpca": imputer_rpca} generator_holes = EmpiricalHoleGenerator(n_splits=1, ratio_masked=0.5) dict_config_opti = { @@ -41,27 +47,38 @@ class ImputerTest(_Imputer): + """Group tests for Imputer.""" + def __init__( self, groups: Tuple[str, ...] = (), random_state: Union[None, int, np.random.RandomState] = None, value: float = 0, ) -> None: - super().__init__(groups=groups, columnwise=True, random_state=random_state) + """Init function.""" + super().__init__( + groups=groups, columnwise=True, random_state=random_state + ) self.value = value - def _transform_element(self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0): + def _transform_element( + self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0 + ): df_out = df.copy() df_out = df_out.fillna(self.value) return df_out class HoleGeneratorTest(_HoleGenerator): + """Group tests for HoleGenerator.""" + def __init__(self, mask: pd.Series, subset: Optional[List[str]] = None): + """Init HoleGenerator.""" super().__init__(n_splits=1, subset=subset) self.mask = mask def generate_mask(self, X: pd.DataFrame) -> pd.DataFrame: + """Generate mask.""" df_out = X.copy() for col in df_out: df_out[col] = self.mask @@ -69,19 +86,27 @@ def generate_mask(self, X: pd.DataFrame) -> pd.DataFrame: def test_hyperparameters_get_objective() -> None: + """Test get_objective.""" imputer = ImputerTest() - generator = HoleGeneratorTest(pd.Series([False, False, True, True]), subset=["some_col"]) + generator = HoleGeneratorTest( + pd.Series([False, False, True, True]), subset=["some_col"] + ) metric = "mse" names_hyperparams = ["value"] df = pd.DataFrame({"some_col": [np.nan, 0, 3, 5]}) - fun_obj = hyperparameters.get_objective(imputer, df, generator, metric, names_hyperparams) + fun_obj = hyperparameters.get_objective( + imputer, df, generator, metric, names_hyperparams + ) assert fun_obj([4]) == 1 assert fun_obj([0]) == (3**2 + 5**2) / 2 def test_hyperparameters_optimize(): + """Test optimize.""" imputer = ImputerTest() - generator = HoleGeneratorTest(pd.Series([False, False, True, True]), subset=["some_col"]) + generator = HoleGeneratorTest( + pd.Series([False, False, True, True]), subset=["some_col"] + ) metric = "mse" dict_config_opti = {"value": ho.hp.uniform("value", 0, 10)} df = pd.DataFrame({"some_col": [np.nan, 0, 3, 5]}) diff --git a/tests/benchmark/test_metrics.py b/tests/benchmark/test_metrics.py index 0c768054..26fa0f7c 100644 --- a/tests/benchmark/test_metrics.py +++ b/tests/benchmark/test_metrics.py @@ -2,12 +2,11 @@ # # Evaluation metrics # # ###################### -from math import exp import numpy as np -from numpy import random as npr import pandas as pd import pytest import scipy +from numpy import random as npr from qolmat.benchmark import metrics from qolmat.utils.exceptions import NotEnoughSamples @@ -16,9 +15,13 @@ {"col1": [0, np.nan, 2, 3, np.nan], "col2": [-1, np.nan, 0.5, 1, 1.5]} ) -df_complete = pd.DataFrame({"col1": [0, 2, 2, 3, 4], "col2": [-1, -2, 0.5, 1, 1.5]}) +df_complete = pd.DataFrame( + {"col1": [0, 2, 2, 3, 4], "col2": [-1, -2, 0.5, 1, 1.5]} +) -df_imputed = 
pd.DataFrame({"col1": [0, 1, 2, 3.5, 4], "col2": [-1.5, 0, 1.5, 2, 1.5]}) +df_imputed = pd.DataFrame( + {"col1": [0, 1, 2, 3.5, 4], "col2": [-1.5, 0, 1.5, 2, 1.5]} +) df_mask = pd.DataFrame( { @@ -31,7 +34,9 @@ @pytest.mark.parametrize("df1", [df_incomplete]) @pytest.mark.parametrize("df2", [df_imputed]) @pytest.mark.parametrize("df_mask", [df_mask]) -def test_mean_squared_error(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> None: +def test_mean_squared_error( + df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame +) -> None: assert metrics.mean_squared_error(df1, df1, df_mask).equals( pd.Series([0.0, 0.0], index=["col1", "col2"]) ) @@ -59,7 +64,9 @@ def test_root_mean_squared_error( @pytest.mark.parametrize("df1", [df_incomplete]) @pytest.mark.parametrize("df2", [df_imputed]) @pytest.mark.parametrize("df_mask", [df_mask]) -def test_mean_absolute_error(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> None: +def test_mean_absolute_error( + df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame +) -> None: assert metrics.mean_absolute_error(df1, df1, df_mask).equals( pd.Series([0.0, 0.0], index=["col1", "col2"]) ) @@ -90,9 +97,9 @@ def test_mean_absolute_percentage_error( def test_weighted_mean_absolute_percentage_error( df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame ) -> None: - assert metrics.weighted_mean_absolute_percentage_error(df1, df1, df_mask).equals( - pd.Series([0.0, 0.0], index=["col1", "col2"]) - ) + assert metrics.weighted_mean_absolute_percentage_error( + df1, df1, df_mask + ).equals(pd.Series([0.0, 0.0], index=["col1", "col2"])) result = metrics.weighted_mean_absolute_percentage_error(df1, df2, df_mask) expected = pd.Series([0.1, 1.0], index=["col1", "col2"]) np.testing.assert_allclose(result, expected, atol=1e-3) @@ -101,7 +108,9 @@ def test_weighted_mean_absolute_percentage_error( @pytest.mark.parametrize("df1", [df_incomplete]) @pytest.mark.parametrize("df2", [df_imputed]) @pytest.mark.parametrize("df_mask", [df_mask]) -def test_accuracy(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> None: +def test_accuracy( + df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame +) -> None: result = metrics.accuracy(df1, df1, df_mask) expected = pd.Series([1.0, 1.0], index=["col1", "col2"]) pd.testing.assert_series_equal(result, expected) @@ -113,17 +122,23 @@ def test_accuracy(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) - @pytest.mark.parametrize("df1", [df_incomplete]) @pytest.mark.parametrize("df2", [df_imputed]) @pytest.mark.parametrize("df_mask", [df_mask]) -def test_wasserstein_distance(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> None: +def test_wasserstein_distance( + df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame +) -> None: dist = metrics.dist_wasserstein(df1, df1, df_mask, method="columnwise") assert dist.equals(pd.Series([0.0, 0.0], index=["col1", "col2"])) dist = metrics.dist_wasserstein(df1, df2, df_mask, method="columnwise") - assert dist.round(3).equals(pd.Series([0.250, 0.833], index=["col1", "col2"])) + assert dist.round(3).equals( + pd.Series([0.250, 0.833], index=["col1", "col2"]) + ) @pytest.mark.parametrize("df1", [df_incomplete]) @pytest.mark.parametrize("df2", [df_imputed]) @pytest.mark.parametrize("df_mask", [df_mask]) -def test_kl_divergence(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> None: +def test_kl_divergence( + df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame +) -> None: result = 
metrics.kl_divergence(df1, df1, df_mask, method="columnwise") expected = pd.Series([0.0, 0.0], index=["col1", "col2"]) pd.testing.assert_series_equal(result, expected, atol=1e-3) @@ -133,7 +148,9 @@ def test_kl_divergence(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFra pd.testing.assert_series_equal(result, expected, atol=1e-3) df_nonan = df1.notna() - result = metrics.kl_divergence(df1, df2, df_nonan, method="gaussian", min_n_rows=2) + result = metrics.kl_divergence( + df1, df2, df_nonan, method="gaussian", min_n_rows=2 + ) expected = pd.Series([1.029], index=["All"]) pd.testing.assert_series_equal(result, expected, atol=1e-3) @@ -190,26 +207,38 @@ def test_sum_pairwise_distances( @pytest.mark.parametrize("df1", [df_incomplete]) @pytest.mark.parametrize("df2", [df_imputed]) @pytest.mark.parametrize("df_mask", [df_mask]) -def test_sum_energy_distances(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> None: +def test_sum_energy_distances( + df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame +) -> None: sum_distances_df1 = np.sum( scipy.spatial.distance.cdist( - df1[df_mask].fillna(0.0), df1[df_mask].fillna(0.0), metric="cityblock" + df1[df_mask].fillna(0.0), + df1[df_mask].fillna(0.0), + metric="cityblock", ) ) sum_distances_df2 = np.sum( scipy.spatial.distance.cdist( - df2[df_mask].fillna(0.0), df2[df_mask].fillna(0.0), metric="cityblock" + df2[df_mask].fillna(0.0), + df2[df_mask].fillna(0.0), + metric="cityblock", ) ) sum_distances_df1_df2 = np.sum( scipy.spatial.distance.cdist( - df1[df_mask].fillna(0.0), df2[df_mask].fillna(0.0), metric="cityblock" + df1[df_mask].fillna(0.0), + df2[df_mask].fillna(0.0), + metric="cityblock", ) ) - energy_distance_scipy = 2 * sum_distances_df1_df2 - sum_distances_df1 - sum_distances_df2 + energy_distance_scipy = ( + 2 * sum_distances_df1_df2 - sum_distances_df1 - sum_distances_df2 + ) energy_distance_qolmat = metrics.sum_energy_distances(df1, df2, df_mask) - assert energy_distance_qolmat.equals(pd.Series(energy_distance_scipy, index=["All"])) + assert energy_distance_qolmat.equals( + pd.Series(energy_distance_scipy, index=["All"]) + ) @pytest.mark.parametrize("df1", [df_incomplete]) @@ -218,20 +247,23 @@ def test_sum_energy_distances(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd. 
def test_mean_difference_correlation_matrix_numerical_features( df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame ) -> None: - assert metrics.mean_difference_correlation_matrix_numerical_features(df1, df1, df_mask).equals( - pd.Series([0.0, 0.0], index=["col1", "col2"]) - ) + assert metrics.mean_difference_correlation_matrix_numerical_features( + df1, df1, df_mask + ).equals(pd.Series([0.0, 0.0], index=["col1", "col2"])) assert metrics.mean_difference_correlation_matrix_numerical_features( df1, df1, df_mask, False ).equals(pd.Series([0.0, 0.0], index=["col1", "col2"])) - assert metrics.mean_difference_correlation_matrix_numerical_features(df1, df2, df_mask).equals( - pd.Series([0.0, 0.0], index=["col1", "col2"]) - ) + assert metrics.mean_difference_correlation_matrix_numerical_features( + df1, df2, df_mask + ).equals(pd.Series([0.0, 0.0], index=["col1", "col2"])) df_incomplete_cat = pd.DataFrame( - {"col1": ["a", np.nan, "a", "b", np.nan], "col2": ["c", np.nan, "d", "b", "d"]} + { + "col1": ["a", np.nan, "a", "b", np.nan], + "col2": ["c", np.nan, "d", "b", "d"], + } ) df_imputed_cat = pd.DataFrame( @@ -279,7 +311,10 @@ def test_mean_difference_correlation_matrix_categorical_features( df_incomplete_cat_num = pd.DataFrame( - {"col1": ["a", np.nan, "a", "b", np.nan], "col2": [-1, np.nan, 0.5, 1, 1.5]} + { + "col1": ["a", np.nan, "a", "b", np.nan], + "col2": [-1, np.nan, 0.5, 1, 1.5], + } ) df_imputed_cat_num = pd.DataFrame( @@ -287,7 +322,10 @@ def test_mean_difference_correlation_matrix_categorical_features( ) df_mask_cat_num = pd.DataFrame( - {"col1": [True, False, True, True, False], "col2": [True, False, True, True, False]} + { + "col1": [True, False, True, True, False], + "col2": [True, False, True, True, False], + } ) @@ -318,7 +356,9 @@ def test_exception_raise_different_shapes( df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame ) -> None: with pytest.raises(Exception): - metrics.mean_difference_correlation_matrix_numerical_features(df1, df2, df_mask) + metrics.mean_difference_correlation_matrix_numerical_features( + df1, df2, df_mask + ) with pytest.raises(Exception): metrics.frechet_distance_base(df1, df2) @@ -332,7 +372,9 @@ def test_exception_raise_no_numerical_column_found( with pytest.raises(Exception): metrics.kolmogorov_smirnov_test(df1, df2, df_mask) with pytest.raises(Exception): - metrics.mean_difference_correlation_matrix_numerical_features(df1, df2, df_mask) + metrics.mean_difference_correlation_matrix_numerical_features( + df1, df2, df_mask + ) @pytest.mark.parametrize("df1", [df_incomplete]) @@ -346,7 +388,10 @@ def test_exception_raise_no_categorical_column_found( df_incomplete_cat_num_bad = pd.DataFrame( - {"col1": ["a", np.nan, "c", "b", np.nan], "col2": [-1, np.nan, 0.5, 0.5, 1.5]} + { + "col1": ["a", np.nan, "c", "b", np.nan], + "col2": [-1, np.nan, 0.5, 0.5, 1.5], + } ) @@ -376,14 +421,19 @@ def test_pattern_based_weighted_mean_metric( rng = npr.default_rng(123) -df_gauss1 = pd.DataFrame(rng.multivariate_normal([0, 0], [[1, 0.2], [0.2, 2]], size=100)) -df_gauss2 = pd.DataFrame(rng.multivariate_normal([0, 1], [[1, 0.2], [0.2, 2]], size=100)) +df_gauss1 = pd.DataFrame( + rng.multivariate_normal([0, 0], [[1, 0.2], [0.2, 2]], size=100) +) +df_gauss2 = pd.DataFrame( + rng.multivariate_normal([0, 1], [[1, 0.2], [0.2, 2]], size=100) +) df_mask_gauss = pd.DataFrame(np.full_like(df_gauss1, True)) def test_pattern_mae_comparison(mocker) -> None: - - mock_metric = mocker.patch("qolmat.benchmark.metrics.accuracy_1D", return_value=0) + mock_metric = 
mocker.patch( + "qolmat.benchmark.metrics.accuracy_1D", return_value=0 + ) df_nonan = df_incomplete.notna() metrics.pattern_based_weighted_mean_metric( diff --git a/tests/benchmark/test_missing_patterns.py b/tests/benchmark/test_missing_patterns.py index 0fa06e69..4cd29455 100644 --- a/tests/benchmark/test_missing_patterns.py +++ b/tests/benchmark/test_missing_patterns.py @@ -4,7 +4,9 @@ from qolmat.benchmark import missing_patterns as mp -df_complet = pd.DataFrame({"col1": [i for i in range(100)], "col2": [2 * i for i in range(100)]}) +df_complet = pd.DataFrame( + {"col1": list(range(100)), "col2": [2 * i for i in range(100)]} +) df_incomplet = df_complet.copy() df_incomplet.iloc[99, :] = np.nan @@ -20,9 +22,15 @@ df_incomplet_group.index = df_incomplet_group.index.set_names("group") list_generators = { - "geo": mp.GeometricHoleGenerator(n_splits=2, ratio_masked=0.1, random_state=42), - "unif": mp.UniformHoleGenerator(n_splits=2, ratio_masked=0.1, random_state=42), - "multi": mp.MultiMarkovHoleGenerator(n_splits=2, ratio_masked=0.1, random_state=42), + "geo": mp.GeometricHoleGenerator( + n_splits=2, ratio_masked=0.1, random_state=42 + ), + "unif": mp.UniformHoleGenerator( + n_splits=2, ratio_masked=0.1, random_state=42 + ), + "multi": mp.MultiMarkovHoleGenerator( + n_splits=2, ratio_masked=0.1, random_state=42 + ), "group": mp.GroupedHoleGenerator( n_splits=2, ratio_masked=0.1, random_state=42, groups=("group",) ), @@ -38,7 +46,9 @@ (df_incomplet_group, list_generators["group"]), ], ) -def test_SamplerHoleGenerator_split(df: pd.DataFrame, generator: mp._HoleGenerator) -> None: +def test_SamplerHoleGenerator_split( + df: pd.DataFrame, generator: mp._HoleGenerator +) -> None: mask = generator.split(df)[0] col1_holes = mask["col1"].sum() col2_holes = mask["col2"].sum() @@ -57,7 +67,9 @@ def test_SamplerHoleGenerator_split(df: pd.DataFrame, generator: mp._HoleGenerat (df_incomplet_group, list_generators["group"]), ], ) -def test_SamplerHoleGenerator_reproducible(df: pd.DataFrame, generator: mp._HoleGenerator) -> None: +def test_SamplerHoleGenerator_reproducible( + df: pd.DataFrame, generator: mp._HoleGenerator +) -> None: generator.random_state = 42 mask1 = generator.split(df)[0] generator.random_state = 43 @@ -81,7 +93,9 @@ def test_SamplerHoleGenerator_reproducible(df: pd.DataFrame, generator: mp._Hole def test_SamplerHoleGenerator_without_real_nans( df: pd.DataFrame, generator: mp._HoleGenerator ) -> None: - real_nan = np.random.choice([True, False], size=df.size, p=[0.4, 0.6]).reshape(100, 2) + real_nan = np.random.choice( + [True, False], size=df.size, p=[0.4, 0.6] + ).reshape(100, 2) df[real_nan] = np.nan mask = generator.split(df)[0] @@ -92,5 +106,9 @@ def test_SamplerHoleGenerator_without_real_nans( loc_real_nans_col2 = np.where(df["col2"].isna())[0] loc_mask_col2 = np.where(mask["col2"])[0] - np.testing.assert_allclose(len(set(loc_real_nans_col1) & set(loc_mask_col1)), 0) - np.testing.assert_allclose(len(set(loc_real_nans_col2) & set(loc_mask_col2)), 0) + np.testing.assert_allclose( + len(set(loc_real_nans_col1) & set(loc_mask_col1)), 0 + ) + np.testing.assert_allclose( + len(set(loc_real_nans_col2) & set(loc_mask_col2)), 0 + ) diff --git a/tests/imputations/rpca/test_rpca.py b/tests/imputations/rpca/test_rpca.py index 1430dc4e..34672ef2 100644 --- a/tests/imputations/rpca/test_rpca.py +++ b/tests/imputations/rpca/test_rpca.py @@ -1,32 +1,21 @@ from typing import Tuple + import numpy as np -import pandas as pd -import pytest from numpy.typing import NDArray -from pytest_mock.plugin 
import MockerFixture -from qolmat.imputations.rpca.rpca import RPCA - -# X_incomplete = np.array([[1, np.nan], [4, 2], [np.nan, 4]]) - -# X_exp_nrows_1_prepare_data = np.array([1.0, np.nan, 4.0, 2.0, np.nan, 4.0]) -# X_exp_nrows_6_prepare_data = np.concatenate( -# [X_incomplete.reshape(-1, 6).flatten(), np.ones((1, 94)).flatten() * np.nan] -# ) - -# period = 100 -# max_iter = 256 -# mu = 0.5 -# tau = 0.5 -# lam = 1 +from qolmat.imputations.rpca.rpca import RPCA class RPCAMock(RPCA): + """Mock for RPCA.""" + def __init__(self): + """Mock for init RPCA.""" super().__init__() self.Q = None def decompose(self, D: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]: + """Mock for decompose function.""" self.call_count = 1 return D, D diff --git a/tests/imputations/rpca/test_rpca_noisy.py b/tests/imputations/rpca/test_rpca_noisy.py index 78f62e41..d20aeaba 100644 --- a/tests/imputations/rpca/test_rpca_noisy.py +++ b/tests/imputations/rpca/test_rpca_noisy.py @@ -4,7 +4,6 @@ import pytest from numpy.typing import NDArray -from qolmat.imputations.rpca import rpca_utils from qolmat.imputations.rpca.rpca_noisy import RpcaNoisy from qolmat.utils import utils from qolmat.utils.data import generate_artificial_ts @@ -57,7 +56,9 @@ def test_check_cost_function_minimized_warning( ): """Test warning when the cost function is not minimized.""" with pytest.warns(UserWarning): - RpcaNoisy()._check_cost_function_minimized(obs, lr, ano, omega, lam, tau) + RpcaNoisy()._check_cost_function_minimized( + obs, lr, ano, omega, lam, tau + ) @pytest.mark.parametrize( @@ -85,7 +86,9 @@ def test_check_cost_function_minimized_no_warning( ): """Test no warning when the cost function is minimized.""" with warnings.catch_warnings(record=True) as record: - RpcaNoisy()._check_cost_function_minimized(obs, lr, ano, omega, lam, tau) + RpcaNoisy()._check_cost_function_minimized( + obs, lr, ano, omega, lam, tau + ) assert len(record) == 0 @@ -108,7 +111,9 @@ def test_rpca_decompose_rpca_shape(norm: str): rank = 2 rpca = RpcaNoisy(rank=rank, norm=norm) Omega = ~np.isnan(X_test) - M_result, A_result, L_result, Q_result = rpca.decompose_with_basis(X_test, Omega) + M_result, A_result, L_result, Q_result = rpca.decompose_with_basis( + X_test, Omega + ) n_rows, n_cols = X_test.shape assert M_result.shape == (n_rows, n_cols) assert A_result.shape == (n_rows, n_cols) @@ -143,7 +148,9 @@ def test_rpca_noisy_zero_tau(X: NDArray, lam: float, X_interpolated: NDArray): "X, tau, X_interpolated", [(X_incomplete, 0.4, X_interpolated), (X_incomplete, 2.4, X_interpolated)], ) -def test_rpca_noisy_zero_lambda(X: NDArray, tau: float, X_interpolated: NDArray): +def test_rpca_noisy_zero_lambda( + X: NDArray, tau: float, X_interpolated: NDArray +): """Test RPCA noisy results if lambda equals zero.""" rpca = RpcaNoisy(tau=tau, lam=0, norm="L2") Omega = ~np.isnan(X) @@ -154,7 +161,9 @@ def test_rpca_noisy_zero_lambda(X: NDArray, tau: float, X_interpolated: NDArray) def test_rpca_noisy_decompose_rpca(synthetic_temporal_data): """Test RPCA noisy results for time series data. - Check if the cost function is smaller at the end than at the start.""" + + Check if the cost function is smaller at the end than at the start. 
+ """ signal = synthetic_temporal_data period = 100 tau = 1 @@ -166,24 +175,27 @@ def test_rpca_noisy_decompose_rpca(synthetic_temporal_data): low_rank_init = D anomalies_init = np.zeros(D.shape) - cost_init = RpcaNoisy.cost_function(D, low_rank_init, anomalies_init, Omega, tau, lam) + cost_init = RpcaNoisy.cost_function( + D, low_rank_init, anomalies_init, Omega, tau, lam + ) - X_result, A_result, _, _ = RpcaNoisy.minimise_loss(D, Omega, rank, tau, lam) - cost_result = RpcaNoisy.cost_function(D, X_result, A_result, Omega, tau, lam) + X_result, A_result, _, _ = RpcaNoisy.minimise_loss( + D, Omega, rank, tau, lam + ) + cost_result = RpcaNoisy.cost_function( + D, X_result, A_result, Omega, tau, lam + ) assert cost_result <= cost_init - # assert np.linalg.norm(X_input_rpca, "nuc") >= 1 / 2 * np.linalg.norm( - # X_input_rpca - X_result.reshape(period, -1) - A_result.reshape(period, -1), - # "fro", - # ) ** 2 + tau * np.linalg.norm(X_result.reshape(period, -1), "nuc") + lam * np.sum( - # np.abs(A_result.reshape(period, -1)) - # ) +def test_rpca_noisy_temporal_signal_temporal_regularisations( + synthetic_temporal_data, +): + """Test RPCA noisy results for TS data with temporal regularisations. -def test_rpca_noisy_temporal_signal_temporal_regularisations(synthetic_temporal_data): - """Test RPCA noisy results for time series data with temporal regularisations. - Check if the cost function is smaller at the end than at the start.""" + Check if the cost function is smaller at the end than at the start. + """ signal = synthetic_temporal_data period = 10 tau = 1 diff --git a/tests/imputations/rpca/test_rpca_pcp.py b/tests/imputations/rpca/test_rpca_pcp.py index c7ab69e5..de997d90 100644 --- a/tests/imputations/rpca/test_rpca_pcp.py +++ b/tests/imputations/rpca/test_rpca_pcp.py @@ -85,6 +85,7 @@ def test_rpca_rpca_pcp_get_params_scale(X: NDArray): @pytest.mark.parametrize("X, mu", [(X_complete, small_mu)]) def test_rpca_rpca_pcp_zero_lambda_small_mu(X: NDArray, mu: float): """Test RPCA PCP results if lambda equals zero. + The problem is ill-conditioned and the result depends on the parameter mu; case when mu is small. """ @@ -98,6 +99,7 @@ def test_rpca_rpca_pcp_zero_lambda_small_mu(X: NDArray, mu: float): @pytest.mark.parametrize("X, mu", [(X_complete, large_mu)]) def test_rpca_rpca_pcp_zero_lambda_large_mu(X: NDArray, mu: float): """Test RPCA PCP results if lambda equals zero. + The problem is ill-conditioned and the result depends on the parameter mu; case when mu is large. """ @@ -120,7 +122,9 @@ def test_rpca_rpca_pcp_large_lambda_small_mu(X: NDArray, mu: float): def test_rpca_temporal_signal(synthetic_temporal_data): """Test RPCA PCP results for time series data. - Check if the cost function is smaller at the end than at the start.""" + + Check if the cost function is smaller at the end than at the start. 
+ """ signal = synthetic_temporal_data period = 100 lam = 0.1 @@ -130,6 +134,6 @@ def test_rpca_temporal_signal(synthetic_temporal_data): Omega = ~np.isnan(D) D_interpolated = utils.linear_interpolation(D) X_result, A_result = rpca.decompose(D, Omega) - assert np.linalg.norm(D_interpolated, "nuc") >= np.linalg.norm(X_result, "nuc") + lam * np.sum( - np.abs(A_result) - ) + assert np.linalg.norm(D_interpolated, "nuc") >= np.linalg.norm( + X_result, "nuc" + ) + lam * np.sum(np.abs(A_result)) diff --git a/tests/imputations/rpca/test_rpca_utils.py b/tests/imputations/rpca/test_rpca_utils.py index 775c9d98..120bff83 100644 --- a/tests/imputations/rpca/test_rpca_utils.py +++ b/tests/imputations/rpca/test_rpca_utils.py @@ -1,14 +1,14 @@ import numpy as np -from numpy.typing import NDArray import pytest +from numpy.typing import NDArray + from qolmat.imputations.rpca.rpca_utils import ( approx_rank, + l1_norm, soft_thresholding, svd_thresholding, - l1_norm, toeplitz_matrix, ) -from qolmat.utils.utils import fold_signal X_incomplete = np.array( [ @@ -20,7 +20,9 @@ ] ) -X_complete = np.array([[1, 7, 4, 4], [5, 2, 4, 4], [-3, 3, 3, 3], [2, -1, 5, 5], [2, 1, 5, 5]]) +X_complete = np.array( + [[1, 7, 4, 4], [5, 2, 4, 4], [-3, 3, 3, 3], [2, -1, 5, 5], [2, 1, 5, 5]] +) @pytest.mark.parametrize("X", [X_complete]) diff --git a/tests/imputations/test_em_sampler.py b/tests/imputations/test_em_sampler.py index 832737dc..21e2ffd0 100644 --- a/tests/imputations/test_em_sampler.py +++ b/tests/imputations/test_em_sampler.py @@ -1,22 +1,29 @@ from typing import List, Literal + import numpy as np import pytest +import scipy from numpy.typing import NDArray from scipy import linalg -import scipy from sklearn.datasets import make_spd_matrix -from qolmat.utils import utils - from qolmat.imputations import em_sampler -from qolmat.utils.exceptions import IllConditioned +from qolmat.utils import utils np.random.seed(42) A: NDArray = np.array([[3, 1, 0], [1, 1, 0], [0, 0, 1]], dtype=float) -A_inverse: NDArray = np.array([[0.5, -0.5, 0], [-0.5, 1.5, 0], [0, 0, 1]], dtype=float) +A_inverse: NDArray = np.array( + [[0.5, -0.5, 0], [-0.5, 1.5, 0], [0, 0, 1]], dtype=float +) X_missing = np.array( - [[1, np.nan, 1], [2, np.nan, 3], [1, 4, np.nan], [-1, 2, 1], [1, 1, np.nan]], + [ + [1, np.nan, 1], + [2, np.nan, 3], + [1, 4, np.nan], + [-1, 2, 1], + [1, 1, np.nan], + ], dtype=float, ) mask: NDArray = np.isnan(X_missing) @@ -40,7 +47,6 @@ def generate_multinormal_predefined_mean_cov(d=3, n=500): mask[ind, j] = True X_missing = X.copy() X_missing[mask] = np.nan - # return {"mean": mean, "covariance": covariance, "X": X, "X_missing": X_missing} return X, X_missing, mean, covariance @@ -93,16 +99,20 @@ def test_gradient_conjugue( """Test the conjugate gradient algorithm.""" X_first_guess = utils.impute_nans(X_missing) X_result = em_sampler._conjugate_gradient(A, X_first_guess, mask) - X_expected = np.array([[1, -1, 1], [2, -2, 3], [1, 4, 0], [-1, 2, 1], [1, 1, 0]], dtype=float) + X_expected = np.array( + [[1, -1, 1], [2, -2, 3], [1, 4, 0], [-1, 2, 1], [1, 1, 0]], dtype=float + ) - assert np.sum(X_result * (X_result @ A)) <= np.sum(X_first_guess * (X_first_guess @ A)) + assert np.sum(X_result * (X_result @ A)) <= np.sum( + X_first_guess * (X_first_guess @ A) + ) assert np.allclose(X_missing[~mask], X_result[~mask]) assert ((X_result @ A)[mask] == 0).all() np.testing.assert_allclose(X_result, X_expected, atol=1e-5) def test_get_lag_p(): - """Test if it can retrieve the lag p""" + """Test if it can retrieve the lag p.""" X, _, _, _ = 
generate_varp_process(d=3, n=1000, p=2) varpem = em_sampler.VARpEM() varpem.fit(X) @@ -120,7 +130,8 @@ def test_fit_calls(mocker, X_missing: NDArray) -> None: """Test number of calls of some methods in MultiNormalEM.""" max_iter_em = 3 mock_sample_ou = mocker.patch( - "qolmat.imputations.em_sampler.MultiNormalEM._sample_ou", return_value=X_missing + "qolmat.imputations.em_sampler.MultiNormalEM._sample_ou", + return_value=X_missing, ) mock_maximize_likelihood = mocker.patch( "qolmat.imputations.em_sampler.MultiNormalEM._maximize_likelihood", @@ -152,7 +163,11 @@ def test_fit_calls(mocker, X_missing: NDArray) -> None: @pytest.mark.parametrize( "means, covs, logliks", [ - ([np.array([1, 2, 3, 3])] * 15, [np.array([1, 2, 3, 3])] * 15, [1] * 15), + ( + [np.array([1, 2, 3, 3])] * 15, + [np.array([1, 2, 3, 3])] * 15, + [1] * 15, + ), ( [np.array([1, 2, 3, 3])] * 15, [np.random.uniform(low=0, high=100, size=(1, 4))[0]] * 15, @@ -180,7 +195,7 @@ def test_em_sampler_check_convergence_true( em.dict_criteria_stop["means"] = means em.dict_criteria_stop["covs"] = covs em.dict_criteria_stop["logliks"] = logliks - assert em._check_convergence() == True + assert em._check_convergence() @pytest.mark.parametrize( @@ -197,7 +212,7 @@ def test_em_sampler_check_convergence_false( em.dict_criteria_stop["means"] = means em.dict_criteria_stop["covs"] = covs em.dict_criteria_stop["logliks"] = logliks - assert em._check_convergence() == True + assert em._check_convergence() @pytest.mark.parametrize( @@ -231,7 +246,9 @@ def test_sample_ou_2d(model): assert abs(mean_est - mean_theo) < np.sqrt(var_theo / n_samples) * q_alpha ratio_inf = scipy.stats.chi2.ppf(alpha / 2, n_samples) / (n_samples - 1) - ratio_sup = scipy.stats.chi2.ppf(1 - alpha / 2, n_samples) / (n_samples - 1) + ratio_sup = scipy.stats.chi2.ppf(1 - alpha / 2, n_samples) / ( + n_samples - 1 + ) ratio = var_est / var_theo @@ -261,7 +278,7 @@ def test_varem_sampler_check_convergence_true( em.dict_criteria_stop["B"] = list_B em.dict_criteria_stop["S"] = list_S em.dict_criteria_stop["logliks"] = logliks - assert em._check_convergence() == True + assert em._check_convergence() @pytest.mark.parametrize( @@ -278,12 +295,14 @@ def test_varem_sampler_check_convergence_false( em.dict_criteria_stop["B"] = list_B em.dict_criteria_stop["S"] = list_S em.dict_criteria_stop["logliks"] = logliks - assert em._check_convergence() == True + assert em._check_convergence() def test_illconditioned_multinormalem() -> None: """Test that data with colinearity raises an exception.""" - X = np.array([[1, np.nan, 8, 1], [3, 1, 4, 2], [2, 3, np.nan, 1]], dtype=float) + X = np.array( + [[1, np.nan, 8, 1], [3, 1, 4, 2], [2, 3, np.nan, 1]], dtype=float + ) model = em_sampler.MultiNormalEM() with pytest.warns(UserWarning): _ = model.fit_transform(X) @@ -293,7 +312,7 @@ def test_illconditioned_multinormalem() -> None: def test_no_more_nan_multinormalem() -> None: - """Test there are no more missing values after the MultiNormalEM algorithm.""" + """Test there are no more missing values after the MultiNormalEM algo.""" X = np.array([[1, np.nan], [3, 1], [np.nan, 3]], dtype=float) model = em_sampler.MultiNormalEM() X_imp = model.fit_transform(X) @@ -310,9 +329,11 @@ def test_no_more_nan_varpem() -> None: assert np.sum(np.isnan(X_imputed)) == 0 -def test_fit_parameters_multinormalem(): - """Test the fit MultiNormalEM provides good parameters estimates (no imputation).""" - X, X_missing, mean, covariance = generate_multinormal_predefined_mean_cov(d=2, n=10000) +def 
test_fit_parameters_multinormalem_no_imputation(): + """Test fit MultiNormalEM provides good parameters estimates.""" + X, X_missing, mean, covariance = generate_multinormal_predefined_mean_cov( + d=2, n=10000 + ) em = em_sampler.MultiNormalEM() em.fit_parameters(X) np.testing.assert_allclose(em.means, mean, atol=1e-1) @@ -320,8 +341,10 @@ def test_fit_parameters_multinormalem(): def test_mean_covariance_multinormalem(): - """Test the MultiNormalEM provides good mean and covariance estimations.""" - X, X_missing, mean, covariance = generate_multinormal_predefined_mean_cov(d=2, n=1000) + """Test MultiNormalEM provides good mean and covariance estimations.""" + X, X_missing, mean, covariance = generate_multinormal_predefined_mean_cov( + d=2, n=1000 + ) em = em_sampler.MultiNormalEM() X_imputed = em.fit_transform(X_missing) @@ -333,11 +356,14 @@ def test_mean_covariance_multinormalem(): np.testing.assert_allclose(em.means, mean, rtol=1e-1, atol=1e-1) np.testing.assert_allclose(em.cov, covariance, rtol=1e-1, atol=1e-1) np.testing.assert_allclose(mean_imputed, mean, rtol=1e-1, atol=1e-1) - np.testing.assert_allclose(covariance_imputed, covariance, rtol=1e-1, atol=1e-1) + np.testing.assert_allclose( + covariance_imputed, covariance, rtol=1e-1, atol=1e-1 + ) def test_multinormal_em_minimize_llik(): - X, X_missing, mean, covariance = generate_multinormal_predefined_mean_cov(d=2, n=1000) + """Test that the loglikelihood of the imputed data is lower.""" + X, X_missing, _, _ = generate_multinormal_predefined_mean_cov(d=2, n=1000) imputer = em_sampler.MultiNormalEM(method="mle", random_state=11) X_imputed = imputer.fit_transform(X_missing) llikelihood_imputed = imputer.get_loglikelihood(X_imputed) @@ -354,6 +380,7 @@ def test_multinormal_em_minimize_llik(): @pytest.mark.parametrize("method", ["sample", "mle"]) def test_multinormal_em_fit_transform(method: Literal["mle", "sample"]): + """Test fit_transform method returns the same result as the fit method.""" imputer = em_sampler.MultiNormalEM(method=method, random_state=11) X = X_missing.copy() result = imputer.fit_transform(X) @@ -390,7 +417,9 @@ def test_parameters_after_imputation_varpem(p: int): def test_varpem_fit_transform(): imputer = em_sampler.VARpEM(method="mle", random_state=11) - X = np.array([[1, 1, 1, 1], [np.nan, np.nan, 3, 2], [1, 2, 2, 1], [2, 2, 2, 2]]) + X = np.array( + [[1, 1, 1, 1], [np.nan, np.nan, 3, 2], [1, 2, 2, 1], [2, 2, 2, 2]] + ) result = imputer.fit_transform(X) assert result.shape == X.shape np.testing.assert_allclose(result[~np.isnan(X)], X[~np.isnan(X)]) @@ -439,12 +468,6 @@ def test_pretreatment_temporal(em): np.testing.assert_allclose(mask_result, mask_expected) -# X_missing = np.array( -# [[1, np.nan, 1], [2, np.nan, 3], [1, 4, np.nan], [-1, 2, 1], [1, 1, np.nan]], -# dtype=float, -# ) - - @pytest.mark.parametrize( "em", [ diff --git a/tests/imputations/test_imputers.py b/tests/imputations/test_imputers.py index bea18d9a..5069f0bd 100644 --- a/tests/imputations/test_imputers.py +++ b/tests/imputations/test_imputers.py @@ -5,26 +5,33 @@ import pytest from sklearn.ensemble import ExtraTreesRegressor from sklearn.linear_model import LinearRegression -from sklearn.utils.estimator_checks import check_estimator, parametrize_with_checks -from qolmat.benchmark.hyperparameters import HyperValue +from sklearn.utils.estimator_checks import ( + parametrize_with_checks, +) +from qolmat.benchmark.hyperparameters import HyperValue from qolmat.imputations import imputers -df_complete = pd.DataFrame({"col1": [0, 1, 2, 3, 4], "col2": 
[-1, 0, 0.5, 1, 1.5]}) +df_complete = pd.DataFrame( + {"col1": [0, 1, 2, 3, 4], "col2": [-1, 0, 0.5, 1, 1.5]} +) df_incomplete = pd.DataFrame( {"col1": [0, np.nan, 2, 3, np.nan], "col2": [-1, np.nan, 0.5, np.nan, 1.5]} ) df_mixed = pd.DataFrame( - {"col1": [0, np.nan, 2, 3, np.nan], "col2": ["a", np.nan, "b", np.nan, "b"]} + { + "col1": [0, np.nan, 2, 3, np.nan], + "col2": ["a", np.nan, "b", np.nan, "b"], + } ) df_timeseries = pd.DataFrame( pd.DataFrame( { - "col1": [i for i in range(20)], - "col2": [0, np.nan, 2, np.nan, 2] + [i for i in range(5, 20)], + "col1": list(range(20)), + "col2": [0, np.nan, 2, np.nan, 2] + list(range(5, 20)), }, index=pd.date_range("2023-04-17", periods=20, freq="D"), ) @@ -80,14 +87,18 @@ def test_hyperparameters_get_hyperparameters() -> None: } -@pytest.mark.parametrize("col, expected", [("col1", expected1), ("col2", expected2)]) +@pytest.mark.parametrize( + "col, expected", [("col1", expected1), ("col2", expected2)] +) def test_hyperparameters_get_hyperparameters_modified( col: str, expected: Dict[str, HyperValue] ) -> None: imputer = imputers.ImputerRpcaNoisy() for key, val in hyperparams_global.items(): setattr(imputer, key, val) - imputer.imputer_params = tuple(set(imputer.imputer_params) | set(hyperparams_global.keys())) + imputer.imputer_params = tuple( + set(imputer.imputer_params) | set(hyperparams_global.keys()) + ) hyperparams = imputer.get_hyperparams(col) assert hyperparams == expected @@ -105,7 +116,9 @@ def test_hyperparameters_get_hyperparameters_modified( @pytest.mark.parametrize( "df", [pd.DataFrame({"col1": [np.nan, np.nan, np.nan], "col2": [1, 2, 3]})] ) -def test_Imputer_fit_transform_on_nan_column(df: pd.DataFrame, imputer: imputers._Imputer) -> None: +def test_Imputer_fit_transform_on_nan_column( + df: pd.DataFrame, imputer: imputers._Imputer +) -> None: np.testing.assert_raises(ValueError, imputer.fit_transform, df) @@ -130,7 +143,9 @@ def test_fit_transform_on_grouped(df: pd.DataFrame) -> None: @pytest.mark.parametrize("df", [df_incomplete]) @pytest.mark.parametrize("df_oracle", [df_complete]) -def test_ImputerOracle_fit_transform(df: pd.DataFrame, df_oracle: pd.DataFrame) -> None: +def test_ImputerOracle_fit_transform( + df: pd.DataFrame, df_oracle: pd.DataFrame +) -> None: imputer = imputers.ImputerOracle() imputer.set_solution(df_oracle) result = imputer.fit_transform(df) @@ -142,7 +157,9 @@ def test_ImputerOracle_fit_transform(df: pd.DataFrame, df_oracle: pd.DataFrame) def test_ImputerSimple_mean_fit_transform(df: pd.DataFrame) -> None: imputer = imputers.ImputerSimple(strategy="mean") result = imputer.fit_transform(df) - expected = pd.DataFrame({"col1": [0, 5 / 3, 2, 3, 5 / 3], "col2": ["a", "b", "b", "b", "b"]}) + expected = pd.DataFrame( + {"col1": [0, 5 / 3, 2, 3, 5 / 3], "col2": ["a", "b", "b", "b", "b"]} + ) pd.testing.assert_frame_equal(result, expected) @@ -150,7 +167,9 @@ def test_ImputerSimple_mean_fit_transform(df: pd.DataFrame) -> None: def test_ImputerSimple_median_fit_transform(df: pd.DataFrame) -> None: imputer = imputers.ImputerSimple() result = imputer.fit_transform(df) - expected = pd.DataFrame({"col1": [0.0, 2.0, 2.0, 3.0, 2.0], "col2": ["a", "b", "b", "b", "b"]}) + expected = pd.DataFrame( + {"col1": [0.0, 2.0, 2.0, 3.0, 2.0], "col2": ["a", "b", "b", "b", "b"]} + ) pd.testing.assert_frame_equal(result, expected) @@ -158,7 +177,9 @@ def test_ImputerSimple_median_fit_transform(df: pd.DataFrame) -> None: def test_ImputerSimple_mode_fit_transform(df: pd.DataFrame) -> None: imputer = 
imputers.ImputerSimple(strategy="most_frequent") result = imputer.fit_transform(df) - expected = pd.DataFrame({"col1": [0.0, 0.0, 2.0, 3.0, 0.0], "col2": ["a", "b", "b", "b", "b"]}) + expected = pd.DataFrame( + {"col1": [0.0, 0.0, 2.0, 3.0, 0.0], "col2": ["a", "b", "b", "b", "b"]} + ) pd.testing.assert_frame_equal(result, expected) @@ -174,7 +195,9 @@ def test_ImputerShuffle_fit_transform1(df: pd.DataFrame) -> None: def test_ImputerShuffle_fit_transform2(df: pd.DataFrame) -> None: imputer = imputers.ImputerShuffle(random_state=42) result = imputer.fit_transform(df) - expected = pd.DataFrame({"col1": [0, 3, 2, 3, 0], "col2": [-1, 1.5, 0.5, 1.5, 1.5]}) + expected = pd.DataFrame( + {"col1": [0, 3, 2, 3, 0], "col2": [-1, 1.5, 0.5, 1.5, 1.5]} + ) np.testing.assert_allclose(result, expected) @@ -182,7 +205,9 @@ def test_ImputerShuffle_fit_transform2(df: pd.DataFrame) -> None: def test_ImputerLOCF_fit_transform(df: pd.DataFrame) -> None: imputer = imputers.ImputerLOCF() result = imputer.fit_transform(df) - expected = pd.DataFrame({"col1": [0, 0, 2, 3, 3], "col2": [-1, -1, 0.5, 0.5, 1.5]}) + expected = pd.DataFrame( + {"col1": [0, 0, 2, 3, 3], "col2": [-1, -1, 0.5, 0.5, 1.5]} + ) np.testing.assert_allclose(result, expected) @@ -190,7 +215,9 @@ def test_ImputerLOCF_fit_transform(df: pd.DataFrame) -> None: def test_ImputerNOCB_fit_transform(df: pd.DataFrame) -> None: imputer = imputers.ImputerNOCB() result = imputer.fit_transform(df) - expected = pd.DataFrame({"col1": [0, 2, 2, 3, 3], "col2": [-1, 0.5, 0.5, 1.5, 1.5]}) + expected = pd.DataFrame( + {"col1": [0, 2, 2, 3, 3], "col2": [-1, 0.5, 0.5, 1.5, 1.5]} + ) np.testing.assert_allclose(result, expected) @@ -198,7 +225,9 @@ def test_ImputerNOCB_fit_transform(df: pd.DataFrame) -> None: def test_ImputerInterpolation_fit_transform(df: pd.DataFrame) -> None: imputer = imputers.ImputerInterpolation() result = imputer.fit_transform(df) - expected = pd.DataFrame({"col1": [0, 1, 2, 3, 3], "col2": [-1, -0.25, 0.5, 1, 1.5]}) + expected = pd.DataFrame( + {"col1": [0, 1, 2, 3, 3], "col2": [-1, -0.25, 0.5, 1, 1.5]} + ) np.testing.assert_allclose(result, expected) @@ -208,8 +237,8 @@ def test_ImputerResiduals_fit_transform(df: pd.DataFrame) -> None: result = imputer.fit_transform(df) expected = pd.DataFrame( { - "col1": [i for i in range(20)], - "col2": [0, 0.953, 2, 2.061, 2] + [i for i in range(5, 20)], + "col1": list(range(20)), + "col2": [0, 0.953, 2, 2.061, 2] + list(range(5, 20)), }, index=pd.date_range("2023-04-17", periods=20, freq="D"), ) @@ -262,14 +291,18 @@ def test_ImputerRegressor_fit_transform(df: pd.DataFrame) -> None: @pytest.mark.parametrize("df", [df_timeseries]) def test_ImputerRpcaNoisy_fit_transform(df: pd.DataFrame) -> None: - imputer = imputers.ImputerRpcaNoisy(columnwise=False, max_iterations=100, tau=1, lam=0.3) + imputer = imputers.ImputerRpcaNoisy( + columnwise=False, max_iterations=100, tau=1, lam=0.3 + ) df_omega = df.notna() df_result = imputer.fit_transform(df) np.testing.assert_allclose(df_result[df_omega], df[df_omega]) assert df_result.notna().all().all() -index_grouped = pd.MultiIndex.from_product([["a", "b"], range(4)], names=["group", "date"]) +index_grouped = pd.MultiIndex.from_product( + [["a", "b"], range(4)], names=["group", "date"] +) dict_values = { "col1": [0, np.nan, 0, np.nan, 1, 1, 1, 1], "col2": [1, 1, 1, 1, 2, 2, 2, 2], @@ -319,6 +352,8 @@ def test_models_fit_transform_grouped(imputer): imputers.ImputerEM(), ] ) -def test_sklearn_compatible_estimator(estimator: imputers._Imputer, check: Any) -> None: +def 
test_sklearn_compatible_estimator( + estimator: imputers._Imputer, check: Any +) -> None: """Check compatibility with sklearn, using sklearn estimator checks API.""" check(estimator) diff --git a/tests/imputations/test_imputers_diffusions.py b/tests/imputations/test_imputers_diffusions.py index 18363175..40215091 100644 --- a/tests/imputations/test_imputers_diffusions.py +++ b/tests/imputations/test_imputers_diffusions.py @@ -1,10 +1,11 @@ +from typing import Any + import numpy as np import pandas as pd import pytest - -from typing import Any - -from sklearn.utils.estimator_checks import check_estimator, parametrize_with_checks +from sklearn.utils.estimator_checks import ( + parametrize_with_checks, +) from qolmat.benchmark import metrics from qolmat.imputations import imputers, imputers_pytorch @@ -82,7 +83,9 @@ def test_TabDDPM_fit(df: pd.DataFrame) -> None: ) model = ddpms.TabDDPM(num_noise_steps=10, num_blocks=1, dim_embedding=64) - model = model.fit(df, batch_size=2, epochs=2, x_valid=df, print_valid=False) + model = model.fit( + df, batch_size=2, epochs=2, x_valid=df, print_valid=False + ) df_imputed = model.predict(df) @@ -94,7 +97,6 @@ def test_TabDDPM_fit(df: pd.DataFrame) -> None: @pytest.mark.parametrize("df", [df_incomplete]) def test_TabDDPM_process_data(df: pd.DataFrame) -> None: - model = ddpms.TabDDPM(num_noise_steps=10, num_blocks=1, dim_embedding=64) arr_processed, arr_mask, _ = model._process_data(df, is_training=True) @@ -104,11 +106,14 @@ def test_TabDDPM_process_data(df: pd.DataFrame) -> None: @pytest.mark.parametrize("df", [df_incomplete]) def test_TabDDPM_process_reversely_data(df: pd.DataFrame) -> None: - model = ddpms.TabDDPM(num_noise_steps=10, num_blocks=1, dim_embedding=64) - model = model.fit(df, batch_size=2, epochs=2, x_valid=df, print_valid=False) + model = model.fit( + df, batch_size=2, epochs=2, x_valid=df, print_valid=False + ) - arr_processed, arr_mask, list_indices = model._process_data(df, is_training=False) + arr_processed, arr_mask, list_indices = model._process_data( + df, is_training=False + ) df_imputed = model._process_reversely_data(arr_processed, df, list_indices) np.testing.assert_array_equal(df.shape, df_imputed.shape) @@ -118,11 +123,16 @@ def test_TabDDPM_process_reversely_data(df: pd.DataFrame) -> None: @pytest.mark.parametrize("df", [df_incomplete]) def test_TabDDPM_q_sample(df: pd.DataFrame) -> None: - model = ddpms.TabDDPM(num_noise_steps=10, num_blocks=1, dim_embedding=64) - model = model.fit(df, batch_size=2, epochs=2, x_valid=df, print_valid=False) + model = model.fit( + df, batch_size=2, epochs=2, x_valid=df, print_valid=False + ) - device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + device = ( + torch.device("cuda") + if torch.cuda.is_available() + else torch.device("cpu") + ) ts_data_noised, ts_noise = model._q_sample( x=torch.ones(2, 5, dtype=torch.float).to(device), @@ -135,7 +145,9 @@ def test_TabDDPM_q_sample(df: pd.DataFrame) -> None: @pytest.mark.parametrize("df", [df_incomplete]) def test_TabDDPM_eval(df: pd.DataFrame) -> None: - model = ddpms.TabDDPM(num_noise_steps=10, num_blocks=1, dim_embedding=64, is_clip=True) + model = ddpms.TabDDPM( + num_noise_steps=10, num_blocks=1, dim_embedding=64, is_clip=True + ) model = model.fit( df, batch_size=2, @@ -156,7 +168,9 @@ def test_TabDDPM_eval(df: pd.DataFrame) -> None: list(df.index), ) - np.testing.assert_array_equal(list(scores.keys()), ["mean_absolute_error", "dist_wasserstein"]) + np.testing.assert_array_equal( + list(scores.keys()), 
["mean_absolute_error", "dist_wasserstein"] + ) @pytest.mark.parametrize("df", [df_incomplete]) @@ -191,8 +205,12 @@ def test_TabDDPM_predict(df: pd.DataFrame) -> None: } ) - model = ddpms.TabDDPM(num_noise_steps=10, num_blocks=1, dim_embedding=64, is_clip=True) - model = model.fit(df, batch_size=2, epochs=2, x_valid=df, print_valid=False) + model = ddpms.TabDDPM( + num_noise_steps=10, num_blocks=1, dim_embedding=64, is_clip=True + ) + model = model.fit( + df, batch_size=2, epochs=2, x_valid=df, print_valid=False + ) df_imputed = model.predict(df) @@ -216,7 +234,12 @@ def test_TsDDPM_fit(df: pd.DataFrame) -> None: model = ddpms.TsDDPM(num_noise_steps=10, num_blocks=1, dim_embedding=64) model = model.fit( - df, batch_size=2, epochs=2, x_valid=df, print_valid=False, index_datetime="datetime" + df, + batch_size=2, + epochs=2, + x_valid=df, + print_valid=False, + index_datetime="datetime", ) df_imputed = model.predict(df) @@ -229,10 +252,16 @@ def test_TsDDPM_fit(df: pd.DataFrame) -> None: @pytest.mark.parametrize("df", [df_incomplete]) def test_TsDDPM_process_data(df: pd.DataFrame) -> None: - - model = ddpms.TsDDPM(num_noise_steps=10, num_blocks=1, dim_embedding=64, is_rolling=False) + model = ddpms.TsDDPM( + num_noise_steps=10, num_blocks=1, dim_embedding=64, is_rolling=False + ) model = model.fit( - df, batch_size=2, epochs=2, x_valid=df, print_valid=False, index_datetime="datetime" + df, + batch_size=2, + epochs=2, + x_valid=df, + print_valid=False, + index_datetime="datetime", ) arr_processed, arr_mask, _ = model._process_data(df, is_training=True) @@ -240,9 +269,16 @@ def test_TsDDPM_process_data(df: pd.DataFrame) -> None: np.testing.assert_array_equal(arr_processed.shape, [5, 1, 5]) np.testing.assert_array_equal(arr_mask.shape, [5, 1, 5]) - model = ddpms.TsDDPM(num_noise_steps=10, num_blocks=1, dim_embedding=64, is_rolling=True) + model = ddpms.TsDDPM( + num_noise_steps=10, num_blocks=1, dim_embedding=64, is_rolling=True + ) model = model.fit( - df, batch_size=2, epochs=2, x_valid=df, print_valid=False, index_datetime="datetime" + df, + batch_size=2, + epochs=2, + x_valid=df, + print_valid=False, + index_datetime="datetime", ) arr_processed, arr_mask, _ = model._process_data(df, is_training=True) @@ -253,25 +289,42 @@ def test_TsDDPM_process_data(df: pd.DataFrame) -> None: @pytest.mark.parametrize("df", [df_incomplete]) def test_TsDDPM_process_reversely_data(df: pd.DataFrame) -> None: - - model = ddpms.TsDDPM(num_noise_steps=10, num_blocks=1, dim_embedding=64, is_rolling=False) + model = ddpms.TsDDPM( + num_noise_steps=10, num_blocks=1, dim_embedding=64, is_rolling=False + ) model = model.fit( - df, batch_size=2, epochs=2, x_valid=df, print_valid=False, index_datetime="datetime" + df, + batch_size=2, + epochs=2, + x_valid=df, + print_valid=False, + index_datetime="datetime", ) - arr_processed, arr_mask, list_indices = model._process_data(df, is_training=False) + arr_processed, arr_mask, list_indices = model._process_data( + df, is_training=False + ) df_imputed = model._process_reversely_data(arr_processed, df, list_indices) np.testing.assert_array_equal(df.shape, df_imputed.shape) np.testing.assert_array_equal(df.index, df_imputed.index) np.testing.assert_array_equal(df.columns, df_imputed.columns) - model = ddpms.TsDDPM(num_noise_steps=10, num_blocks=1, dim_embedding=64, is_rolling=True) + model = ddpms.TsDDPM( + num_noise_steps=10, num_blocks=1, dim_embedding=64, is_rolling=True + ) model = model.fit( - df, batch_size=2, epochs=2, x_valid=df, print_valid=False, 
index_datetime="datetime" + df, + batch_size=2, + epochs=2, + x_valid=df, + print_valid=False, + index_datetime="datetime", ) - arr_processed, arr_mask, list_indices = model._process_data(df, is_training=False) + arr_processed, arr_mask, list_indices = model._process_data( + df, is_training=False + ) df_imputed = model._process_reversely_data(arr_processed, df, list_indices) np.testing.assert_array_equal(df.shape, df_imputed.shape) @@ -281,12 +334,20 @@ def test_TsDDPM_process_reversely_data(df: pd.DataFrame) -> None: @pytest.mark.parametrize("df", [df_incomplete]) def test_TsDDPM_q_sample(df: pd.DataFrame) -> None: - model = ddpms.TsDDPM(num_noise_steps=10, num_blocks=1, dim_embedding=64) model = model.fit( - df, batch_size=2, epochs=2, x_valid=df, print_valid=False, index_datetime="datetime" + df, + batch_size=2, + epochs=2, + x_valid=df, + print_valid=False, + index_datetime="datetime", + ) + device = ( + torch.device("cuda") + if torch.cuda.is_available() + else torch.device("cpu") ) - device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") ts_data_noised, ts_noise = model._q_sample( x=torch.ones(2, 1, 5, dtype=torch.float).to(device), @@ -299,9 +360,13 @@ def test_TsDDPM_q_sample(df: pd.DataFrame) -> None: @parametrize_with_checks( [ - imputers_pytorch.ImputerDiffusion(model=ddpms.TabDDPM(), batch_size=1, epochs=1), + imputers_pytorch.ImputerDiffusion( + model=ddpms.TabDDPM(), batch_size=1, epochs=1 + ), ] ) -def test_sklearn_compatible_estimator(estimator: imputers._Imputer, check: Any) -> None: +def test_sklearn_compatible_estimator( + estimator: imputers._Imputer, check: Any +) -> None: """Check compatibility with sklearn, using sklearn estimator checks API.""" check(estimator) diff --git a/tests/imputations/test_imputers_pytorch.py b/tests/imputations/test_imputers_pytorch.py index a6146291..0704114c 100644 --- a/tests/imputations/test_imputers_pytorch.py +++ b/tests/imputations/test_imputers_pytorch.py @@ -1,7 +1,6 @@ import numpy as np import pandas as pd import pytest -import torch from qolmat.imputations import imputers_pytorch from qolmat.utils.exceptions import PyTorchExtraNotInstalled @@ -29,7 +28,9 @@ def test_ImputerRegressorPyTorch_fit_transform(df: pd.DataFrame) -> None: nn.manual_seed(42) if nn.cuda.is_available(): nn.cuda.manual_seed(42) - estimator = imputers_pytorch.build_mlp(input_dim=2, list_num_neurons=[64, 32]) + estimator = imputers_pytorch.build_mlp( + input_dim=2, list_num_neurons=[64, 32] + ) imputer = imputers_pytorch.ImputerRegressorPyTorch( estimator=estimator, handler_nan="column", epochs=10 ) @@ -55,30 +56,3 @@ def test_ImputerRegressorPyTorch_fit_transform(df: pd.DataFrame) -> None: } ) np.testing.assert_allclose(result, expected, atol=1e-3) - - -# @pytest.mark.parametrize("df", [df_incomplete]) -# def test_imputers_pytorch_Autoencoder(df: pd.DataFrame) -> None: -# input = df.values.shape[1] -# latent = 4 -# encoder, decoder = imputers_pytorch.build_autoencoder_example( -# input_dim=input, -# latent_dim=latent, -# output_dim=input, -# list_num_neurons=[4 * latent, 2 * latent], -# ) -# autoencoder = imputers_pytorch.ImputerAutoencoder( -# encoder, decoder, epochs=10, lamb=0.01, max_iterations=5, random_state=42 -# ) -# result = autoencoder.fit_transform(df) -# print(result) -# expected = pd.DataFrame( -# { -# "col1": [22.315, 15, 22.496, 23, 33], -# "col2": [69, 76, 74, 80, 78], -# "col3": [174, 166, 182, 177, 174.218], -# "col4": [9, 12, 11, 12, 8], -# "col5": [93, 75, 62.308, 12, 62.449], -# } -# ) -# 
np.testing.assert_allclose(result, expected, atol=1e-3) diff --git a/tests/imputations/test_preprocessing.py b/tests/imputations/test_preprocessing.py index 30b55bd3..a05fffdb 100644 --- a/tests/imputations/test_preprocessing.py +++ b/tests/imputations/test_preprocessing.py @@ -1,15 +1,12 @@ import numpy as np import pandas as pd import pytest -from sklearn.compose import make_column_selector as selector - -from sklearn.pipeline import Pipeline from sklearn.base import BaseEstimator, TransformerMixin from sklearn.metrics import mean_squared_error -from sklearn.utils.estimator_checks import check_estimator -from sklearn.utils.validation import check_X_y, check_array from sklearn.model_selection import train_test_split -from sklearn.compose import ColumnTransformer +from sklearn.pipeline import Pipeline +from sklearn.utils.estimator_checks import check_estimator + from qolmat.imputations.preprocessing import ( BinTransformer, MixteHGBM, @@ -83,7 +80,9 @@ def test_fit_transform_BinTransformer(bin_transformer): def test_transform_BinTransformer(bin_transformer): bin_transformer.dict_df_bins_ = { - 0: pd.DataFrame({"value": [1, 2, 3, 4, 5], "min": [-np.inf, 1.5, 2.5, 3.5, 4.5]}) + 0: pd.DataFrame( + {"value": [1, 2, 3, 4, 5], "min": [-np.inf, 1.5, 2.5, 3.5, 4.5]} + ) } bin_transformer.feature_names_in_ = pd.Index([0]) bin_transformer.n_features_in_ = 1 @@ -100,7 +99,9 @@ def test_fit_transform_with_dataframes_BinTransformer(bin_transformer): def test_transform_with_dataframes_BinTransformer(bin_transformer): bin_transformer.dict_df_bins_ = { - 0: pd.DataFrame({"value": [1, 2, 3, 4, 5], "min": [0.5, 1.5, 2.5, 3.5, 4.5]}) + 0: pd.DataFrame( + {"value": [1, 2, 3, 4, 5], "min": [0.5, 1.5, 2.5, 3.5, 4.5]} + ) } bin_transformer.feature_names_in_ = pd.Index(["0"]) bin_transformer.n_features_in_ = 1 @@ -126,7 +127,9 @@ def test_inverse_transform_OneHotEncoderProjector(encoder): df_back = encoder.inverse_transform(df_dum) pd.testing.assert_frame_equal(df, df_back) - df_dum_perturbated = df_dum + np.random.uniform(-0.5, 0.5, size=df_dum.shape) + df_dum_perturbated = df_dum + np.random.uniform( + -0.5, 0.5, size=df_dum.shape + ) df_back = encoder.inverse_transform(df_dum_perturbated) pd.testing.assert_frame_equal(df, df_back) @@ -137,16 +140,22 @@ def test_inverse_transform_OneHotEncoderProjector(encoder): class DummyTransformer(TransformerMixin, BaseEstimator): + """Dummy transformer for testing.""" + def fit(self, X, y=None): + """Fit function.""" return self def transform(self, X): + """Transform function.""" return X def fit_transform(self, X, y=None): + """Fit and transform function.""" return self.fit(X, y).transform(X) def inverse_transform(self, X, y=None): + """Inverse transform function.""" return X diff --git a/tests/imputations/test_softimpute.py b/tests/imputations/test_softimpute.py index e8c3dff0..b85025da 100644 --- a/tests/imputations/test_softimpute.py +++ b/tests/imputations/test_softimpute.py @@ -1,4 +1,3 @@ -from typing import Any import numpy as np import pytest from numpy.typing import NDArray @@ -10,16 +9,16 @@ X_non_regression_test = np.array( [[1, 2, np.nan, 4], [1, 5, 3, np.nan], [4, 2, 3, 2], [1, 1, 5, 4]] ) -X_expected = np.array([[1, 2, 2.9066, 4], [1, 5, 3, 2.1478], [4, 2, 3, 2], [1, 1, 5, 4]]) +X_expected = np.array( + [[1, 2, 2.9066, 4], [1, 5, 3, 2.1478], [4, 2, 3, 2], [1, 1, 5, 4]] +) tau = 1 max_iterations = 30 random_state = 50 def test_initialized_default() -> None: - """Test that initialization does not crash and - has default parameters - """ + """Test that 
initialization does not crash and has default parameters.""" model = softimpute.SoftImpute() assert model.period == 1 assert model.rank is None @@ -27,9 +26,7 @@ def test_initialized_custom() -> None: - """Test that initialization does not crash and - has custom parameters - """ + """Test that initialization does not crash and has custom parameters.""" model = softimpute.SoftImpute(period=2, rank=10) assert model.period == 2 assert model.rank == 10 @@ -38,13 +35,17 @@ @pytest.mark.parametrize("X", [X]) def test_soft_impute_decompose(X: NDArray) -> None: - """Test fit instance and decomposition is computed""" + """Test fit instance and decomposition is computed.""" tau = 1 model = softimpute.SoftImpute(tau=tau) Omega = ~np.isnan(X) X_imputed = np.where(Omega, X, 0) - cost_all_in_M = model.cost_function(X, X_imputed, np.full_like(X, 0), Omega, tau) - cost_all_in_A = model.cost_function(X, np.full_like(X, 0), X_imputed, Omega, tau) + cost_all_in_M = model.cost_function( + X, X_imputed, np.full_like(X, 0), Omega, tau + ) + cost_all_in_A = model.cost_function( + X, np.full_like(X, 0), X_imputed, Omega, tau + ) M, A = model.decompose(X, Omega) cost_final = model.cost_function(X, M, A, Omega, tau) assert isinstance(model, softimpute.SoftImpute) @@ -56,12 +57,9 @@ def test_soft_impute_decompose(X: NDArray) -> None: assert cost_final < cost_all_in_A -# tests/imputations/test_imputers.py::test_sklearn_compatible_estimator - - @pytest.mark.parametrize("X", [X]) def test_soft_impute_convergence(X: NDArray) -> None: - """Test type of the check convergence""" + """Test type of the check convergence.""" model = softimpute.SoftImpute() M = model.random_state.uniform(size=(10, 20)) U, D, V = np.linalg.svd(M, full_matrices=False) @@ -70,31 +68,14 @@ def test_soft_impute_convergence(X: NDArray) -> None: def test_soft_impute_convergence_with_none() -> None: - """Test check type None and raise error""" + """Test that invalid convergence inputs raise an error.""" model = softimpute.SoftImpute() with pytest.raises(ValueError): _ = model._check_convergence( - None, + np.array([1]), np.array([1]), np.array([1]), np.array([1]), np.array([1]), np.array([1]), ) - - -# @pytest.mark.parametrize( -# "X, X_expected, tau, max_iterations, random_state", -# [(X_non_regression_test, X_expected, tau, max_iterations, random_state)], -# ) -# def test_soft_impute_non_regression( -# X: NDArray, X_expected: NDArray, tau: float, max_iterations: int, random_state: int -# ) -> None: -# """Non regression test""" -# model = softimpute.SoftImpute( -# tau=tau, max_iterations=max_iterations, random_state=random_state -# ) -# Omega = ~np.isnan(X) -# M, A = model.decompose(X, Omega) -# X_result = M + A -# np.testing.assert_allclose(X_result, X_expected, rtol=1e-3, atol=1e-3) diff --git a/tests/utils/test_algebra.py b/tests/utils/test_algebra.py index 45a508c8..ae6a6ae4 100644 --- a/tests/utils/test_algebra.py +++ b/tests/utils/test_algebra.py @@ -1,7 +1,8 @@ import numpy as np -from sympy import diag +import pytest from qolmat.utils import algebra +from qolmat.utils.algebra import svdtriplet def test_frechet_distance_exact(): @@ -12,7 +13,9 @@ means2 = np.array([0, -1, 1]) cov2 = np.eye(3, 3) - expected = np.sum((means2 - means1) ** 2) + np.sum((np.sqrt(stds) - 1) ** 2) + expected = np.sum((means2 - means1) ** 2) + np.sum( + (np.sqrt(stds) - 1) ** 2 + ) expected /= 3 result = algebra.frechet_distance_exact(means1, cov1, means2, cov2)
np.testing.assert_almost_equal(result, expected, decimal=3) @@ -26,6 +29,118 @@ def test_kl_divergence_gaussian_exact(): means2 = np.array([0, -1, 1]) cov2 = np.eye(3, 3) - expected = (np.sum(stds**2 - np.log(stds**2) - 1 + (means2 - means1) ** 2)) / 2 + expected = ( + np.sum(stds**2 - np.log(stds**2) - 1 + (means2 - means1) ** 2) + ) / 2 result = algebra.kl_divergence_gaussian_exact(means1, cov1, means2, cov2) np.testing.assert_almost_equal(result, expected, decimal=3) + +def test_svdtriplet_known_matrix(): + """Test svdtriplet on a known matrix without weights.""" + X = np.array([[3, 1], [1, 3]]) + expected_singular_values = np.array([4, 2]) + expected_U = np.array([[0.7071, -0.7071], + [0.7071, 0.7071]]) + expected_V = np.array([[0.7071, 0.7071], + [0.7071, -0.7071]]) + # Call svdtriplet without weights + s, U, V = svdtriplet(X, row_w=None, ncp=2) + # Compare singular values + np.testing.assert_almost_equal(s, expected_singular_values, decimal=3) + np.testing.assert_almost_equal(np.abs(U), np.abs(expected_U), decimal=3) + np.testing.assert_almost_equal(np.abs(V), np.abs(expected_V), decimal=3) + +def test_svdtriplet_with_row_weights(): + """Test svdtriplet with row weights.""" + X = np.array([[1, 2], [3, 4], [5, 6]]) + row_w = np.array([0.2, 0.5, 0.3]) + # Manually compute the weighted X + X_weighted = X * np.sqrt(row_w)[:, None] + U_expected, s_expected, Vt_expected = np.linalg.svd(X_weighted, + full_matrices=False) + V_expected = Vt_expected.T + # Call svdtriplet with weights + s, U, V = svdtriplet(X, row_w=row_w, ncp=2) + # Rescale U_expected by dividing by sqrt(row_w) + U_expected /= np.sqrt(row_w)[:, None] + # Compare singular values + np.testing.assert_allclose(s, s_expected[:2], atol=1e-6) + # Compare U and V (up to sign) + np.testing.assert_allclose(np.abs(U), np.abs(U_expected[:, :2]), atol=1e-6) + np.testing.assert_allclose(np.abs(V), np.abs(V_expected[:, :2]), atol=1e-6) + +def test_svdtriplet_ncp_limit(): + """Test svdtriplet with ncp less than the full rank.""" + X = np.random.rand(5, 3) + ncp = 2 + s, U, V = svdtriplet(X, ncp=ncp) + # Check the dimensions + assert s.shape == (ncp,) + assert U.shape == (X.shape[0], ncp) + assert V.shape == (X.shape[1], ncp) + # Reconstruct X approximation + X_approx = U @ np.diag(s) @ V.T + # Check that the approximation is close to X + # Note: With reduced ncp, approximation won't be exact + assert X_approx.shape == X.shape + s_full, U_full, V_full = svdtriplet(X) + X_full = U_full @ np.diag(s_full) @ V_full.T + error_ncp = np.linalg.norm(X - X_approx) + error_full = np.linalg.norm(X - X_full) + assert error_ncp >= error_full + +def test_svdtriplet_row_weights_none(): + """Test svdtriplet with default row weights.""" + X = np.random.rand(4, 4) + s_default, U_default, V_default = svdtriplet(X) + # Manually set uniform weights + row_w = np.ones(X.shape[0]) / X.shape[0] + s_manual, U_manual, V_manual = svdtriplet(X, row_w=row_w) + # Compare results + np.testing.assert_allclose(s_default, s_manual, atol=1e-6) + np.testing.assert_allclose(U_default, U_manual, atol=1e-6) + np.testing.assert_allclose(V_default, V_manual, atol=1e-6) + +def test_svdtriplet_zero_matrix(): + """Test svdtriplet on a zero matrix.""" + X = np.zeros((3, 3)) + s, U, V = svdtriplet(X) + # Singular values should be zero + expected_s = np.zeros(3) + np.testing.assert_array_equal(s, expected_s) + # U and V should be orthogonal matrices + np.testing.assert_allclose(U.T @ U, np.eye(3), atol=1e-6) + np.testing.assert_allclose(V.T @ V, np.eye(3), atol=1e-6) + +def
test_svdtriplet_non_square_matrix(): + """Test svdtriplet on a non-square matrix.""" + X = np.random.rand(6, 4) + s, U, V = svdtriplet(X) + # Check dimensions + assert U.shape == (6, 4) + assert s.shape == (4,) + assert V.shape == (4, 4) + # Reconstruct X + X_reconstructed = U @ np.diag(s) @ V.T + np.testing.assert_allclose(X, X_reconstructed, atol=1e-6) + +def test_svdtriplet_large_ncp(): + """Test svdtriplet with ncp larger than possible.""" + X = np.random.rand(5, 3) + ncp = 10 # Larger than min(n_samples - 1, n_features) + s, U, V = svdtriplet(X, ncp=ncp) + expected_ncp = min(5 - 1, 3) + assert s.shape == (expected_ncp,) + assert U.shape == (5, expected_ncp) + assert V.shape == (3, expected_ncp) + +def test_svdtriplet_negative_weights(): + """Test svdtriplet with negative row weights (should raise an error).""" + X = np.random.rand(4, 4) + row_w = np.array([0.25, -0.25, 0.5, 0.5]) # Negative weight + with pytest.raises(ValueError): + s, U, V = svdtriplet(X, row_w=row_w) + + + + diff --git a/tests/utils/test_data.py b/tests/utils/test_data.py index 40ee120a..713ff611 100644 --- a/tests/utils/test_data.py +++ b/tests/utils/test_data.py @@ -1,19 +1,40 @@ import datetime import os +from unittest.mock import MagicMock, patch import numpy as np import pandas as pd import pytest from pytest_mock.plugin import MockerFixture -from unittest.mock import MagicMock, patch + from qolmat.utils import data columns = ["station", "date", "year", "month", "day", "hour", "a", "b", "wd"] df_beijing_raw = pd.DataFrame( [ ["Beijing", datetime.datetime(2013, 3, 1), 2013, 3, 1, 0, 1, 2, "NW"], - ["Beijing", datetime.datetime(2013, 3, 1), 2014, 3, 1, 0, 3, np.nan, "NW"], - ["Beijing", datetime.datetime(2013, 3, 1), 2015, 3, 1, 0, np.nan, 6, "NW"], + [ + "Beijing", + datetime.datetime(2013, 3, 1), + 2014, + 3, + 1, + 0, + 3, + np.nan, + "NW", + ], + [ + "Beijing", + datetime.datetime(2013, 3, 1), + 2015, + 3, + 1, + 0, + np.nan, + 6, + "NW", + ], ], columns=columns, ) @@ -71,7 +92,13 @@ [2.0, 5.0, 4.0, 1.0, 4.0], [3.0, 6.0, 3.0, 4.0, 6.0], ], - columns=["T1 rain", "T2 preasure", "T3 temperature", "T4 humidity", "T5 sun"], + columns=[ + "T1 rain", + "T2 preasure", + "T3 temperature", + "T4 humidity", + "T5 sun", + ], index=pd.date_range(start="2010-01-01", periods=3, freq="1D"), ) @@ -222,7 +249,9 @@ def test_get_dataframes_in_folder(mock_convert_tsf, mock_read_csv, mock_walk): mock_walk.return_value = [("/fakepath", ("subfolder",), ("file.csv",))] result_csv = data.get_dataframes_in_folder("/fakepath", ".csv") assert len(result_csv) == 1 - mock_read_csv.assert_called_once_with(os.path.join("/fakepath", "file.csv")) + mock_read_csv.assert_called_once_with( + os.path.join("/fakepath", "file.csv") + ) pd.testing.assert_frame_equal(result_csv[0], df_conductor) mock_read_csv.reset_mock() @@ -230,7 +259,9 @@ def test_get_dataframes_in_folder(mock_convert_tsf, mock_read_csv, mock_walk): mock_walk.return_value = [("/fakepath", ("subfolder",), ("file.tsf",))] result_tsf = data.get_dataframes_in_folder("/fakepath", ".tsf") assert len(result_tsf) == 1 - mock_convert_tsf.assert_called_once_with(os.path.join("/fakepath", "file.tsf")) + mock_convert_tsf.assert_called_once_with( + os.path.join("/fakepath", "file.tsf") + ) pd.testing.assert_frame_equal(result_tsf[0], df_beijing) mock_read_csv.assert_called() @@ -238,14 +269,18 @@ def test_get_dataframes_in_folder(mock_convert_tsf, mock_read_csv, mock_walk): @patch("numpy.random.normal") @patch("numpy.random.choice") @patch("numpy.random.standard_exponential") -def 
-def test_generate_artificial_ts(mock_standard_exponential, mock_choice, mock_normal):
+def test_generate_artificial_ts(
+    mock_standard_exponential, mock_choice, mock_normal
+):
     n_samples = 100
     periods = [10, 20]
     amp_anomalies = 1.0
     ratio_anomalies = 0.1
     amp_noise = 0.1
-    mock_standard_exponential.return_value = np.ones(int(n_samples * ratio_anomalies))
+    mock_standard_exponential.return_value = np.ones(
+        int(n_samples * ratio_anomalies)
+    )
     mock_choice.return_value = np.arange(int(n_samples * ratio_anomalies))
     mock_normal.return_value = np.zeros(n_samples)
@@ -274,11 +309,20 @@ def test_generate_artificial_ts(mock_standard_exponential, mock_choice, mock_nor
         ("Bug", None),
     ],
 )
-def test_data_get_data(name_data: str, df: pd.DataFrame, mocker: MockerFixture) -> None:
-    mock_download = mocker.patch("qolmat.utils.data.download_data_from_zip", return_value=[df])
-    mock_read = mocker.patch("qolmat.utils.data.read_csv_local", return_value=df)
+def test_data_get_data(
+    name_data: str, df: pd.DataFrame, mocker: MockerFixture
+) -> None:
+    mock_download = mocker.patch(
+        "qolmat.utils.data.download_data_from_zip", return_value=[df]
+    )
+    mock_read = mocker.patch(
+        "qolmat.utils.data.read_csv_local", return_value=df
+    )
     mock_read_dl = mocker.patch("pandas.read_csv", return_value=df)
-    mocker.patch("qolmat.utils.data.preprocess_data_beijing", return_value=df_preprocess_beijing)
+    mocker.patch(
+        "qolmat.utils.data.preprocess_data_beijing",
+        return_value=df_preprocess_beijing,
+    )
     mocker.patch("pandas.read_parquet", return_value=df_sncf)
     try:
@@ -346,7 +390,9 @@ def test_preprocess_data_beijing(df: pd.DataFrame) -> None:
     assert result_df.index.names == ["station", "datetime"]
     assert all(result_df.index.get_level_values("station") == "Beijing")
     assert len(result_df) == 1
-    assert np.isclose(result_df.loc[(("Beijing"),), "pm2.5"], 176.66666666666666)
+    assert np.isclose(
+        result_df.loc[(("Beijing"),), "pm2.5"], 176.66666666666666
+    )
 @pytest.mark.parametrize("df", [df_preprocess_offline])
@@ -363,7 +409,9 @@ def test_data_add_holes(df: pd.DataFrame) -> None:
         ("Beijing", df_beijing),
     ],
 )
-def test_data_get_data_corrupted(name_data: str, df: pd.DataFrame, mocker: MockerFixture) -> None:
+def test_data_get_data_corrupted(
+    name_data: str, df: pd.DataFrame, mocker: MockerFixture
+) -> None:
     mock_get = mocker.patch("qolmat.utils.data.get_data", return_value=df)
     df_out = data.get_data_corrupted(name_data)
     assert mock_get.call_count == 1
@@ -395,5 +443,7 @@ def test_data_add_datetime_features(df: pd.DataFrame) -> None:
     result = data.add_datetime_features(df)
     pd.testing.assert_index_equal(result.index, df.index)
     assert result.columns.tolist() == columns_out
-    pd.testing.assert_frame_equal(result.drop(columns=["time_cos", "time_sin"]), df)
+    pd.testing.assert_frame_equal(
+        result.drop(columns=["time_cos", "time_sin"]), df
+    )
     assert (result["time_cos"] ** 2 + result["time_sin"] ** 2 == 1).all()
diff --git a/tests/utils/test_exceptions.py b/tests/utils/test_exceptions.py
index e9e10b7a..e0703c7f 100644
--- a/tests/utils/test_exceptions.py
+++ b/tests/utils/test_exceptions.py
@@ -1,4 +1,3 @@
-import pytest
 from qolmat.utils import exceptions
diff --git a/tests/utils/test_plot.py b/tests/utils/test_plot.py
index 5c45e72e..aadbaf7f 100644
--- a/tests/utils/test_plot.py
+++ b/tests/utils/test_plot.py
@@ -1,13 +1,14 @@
 from typing import Any, List, Tuple
-import matplotlib as mpl
+
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import pytest
 import scipy.sparse
-from qolmat.utils import plot
 from pytest_mock.plugin import MockerFixture
+from qolmat.utils import plot
+
 plt.switch_backend("Agg")
 np.random.seed(42)
@@ -30,12 +31,16 @@
 df1 = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
 df2 = pd.DataFrame({"x": [2, 3, 4], "y": [5, 6, 7]})
 dict_df_imputed = {
-    "Imputer1": pd.DataFrame({"A": [2, 3, np.nan], "B": [5, np.nan, 7], "C": [np.nan, 8, 9]})
+    "Imputer1": pd.DataFrame(
+        {"A": [2, 3, np.nan], "B": [5, np.nan, 7], "C": [np.nan, 8, 9]}
+    )
 }
 @pytest.mark.parametrize("list_matrices", [list_matrices])
-def test_utils_plot_plot_matrices(list_matrices: List[np.ndarray], mocker: MockerFixture) -> None:
+def test_utils_plot_plot_matrices(
+    list_matrices: List[np.ndarray], mocker: MockerFixture
+) -> None:
     mocker.patch("matplotlib.pyplot.savefig")
     mocker.patch("matplotlib.pyplot.show")
     plot.plot_matrices(list_matrices=list_matrices, title="title")
@@ -45,7 +50,9 @@ def test_utils_plot_plot_matrices(list_matrices: List[np.ndarray], mocker: Mocke
 @pytest.mark.parametrize("list_signals", [list_signals])
-def test_utils_plot_plot_signal(list_signals: List[List[Any]], mocker: MockerFixture) -> None:
+def test_utils_plot_plot_signal(
+    list_signals: List[List[Any]], mocker: MockerFixture
+) -> None:
     mocker.patch("matplotlib.pyplot.savefig")
     mocker.patch("matplotlib.pyplot.show")
     plot.plot_signal(list_signals=list_signals, ylabel="ylabel", title="title")
@@ -54,7 +61,9 @@ def test_utils_plot_plot_signal(list_signals: List[List[Any]], mocker: MockerFix
     plt.close("all")
-@pytest.mark.parametrize("M, A, E, index_array, dims", [(M, A, E, [0, 1, 2], (10, 10))])
+@pytest.mark.parametrize(
+    "M, A, E, index_array, dims", [(M, A, E, [0, 1, 2], (10, 10))]
+)
 def test__utils_plot_plot_images(
     M: np.ndarray,
     A: np.ndarray,
@@ -72,7 +81,9 @@ def test__utils_plot_plot_images(
 @pytest.mark.parametrize("X", [X])
-def test_utils_plot_make_ellipses_from_data(X: np.ndarray, mocker: MockerFixture):
+def test_utils_plot_make_ellipses_from_data(
+    X: np.ndarray, mocker: MockerFixture
+):
     mocker.patch("matplotlib.pyplot.show")
     ax = plt.gca()
     plot.make_ellipses_from_data(X[1], X[2], ax, color="blue")
@@ -93,7 +104,9 @@ def test_utils_plot_compare_covariances(
 @pytest.mark.parametrize("df", [df])
 @pytest.mark.parametrize("orientation", ["horizontal", "vertical"])
-def test_utils_plot_multibar(df: pd.DataFrame, orientation: str, mocker: MockerFixture):
+def test_utils_plot_multibar(
+    df: pd.DataFrame, orientation: str, mocker: MockerFixture
+):
     mocker.patch("matplotlib.pyplot.show")
     plot.multibar(df, orientation=orientation)
     assert len(plt.gcf().get_axes()) > 0
diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py
index 950d2bf0..4f048d10 100644
--- a/tests/utils/test_utils.py
+++ b/tests/utils/test_utils.py
@@ -1,20 +1,21 @@
 import sys
+from io import StringIO
+
 import numpy as np
-from numpy.typing import NDArray
 import pandas as pd
 import pytest
-from qolmat.utils import utils
-from pytest_mock.plugin import MockerFixture
-from io import StringIO
-
-from qolmat.utils.exceptions import NotDimension2, SignalTooShort
+from numpy.typing import NDArray
+from qolmat.utils import utils
+from qolmat.utils.exceptions import NotDimension2
 df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})
 @pytest.mark.parametrize("iteration, total", [(1, 1)])
-def test_utils_utils_display_progress_bar(iteration: int, total: int, capsys) -> None:
+def test_utils_utils_display_progress_bar(
+    iteration: int, total: int, capsys
+) -> None:
     captured_output = StringIO()
     sys.stdout = captured_output
     utils.progress_bar(
@@ -34,7 +35,9 @@ def test_utils_utils_display_progress_bar(iteration: int, total: int, capsys) ->
     assert output == output_expected
-@pytest.mark.parametrize("values, lag_max", [(pd.Series([1.0, 2.0, 3.0, 4.0, 5.0]), 3)])
+@pytest.mark.parametrize(
+    "values, lag_max", [(pd.Series([1.0, 2.0, 3.0, 4.0, 5.0]), 3)]
+)
 def test_utils_utils_acf(values, lag_max):
     result = utils.acf(values, lag_max)
     result_expected = pd.Series([1.0, 1.0, 1.0])
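Note on the expected values in the acf test above: the parametrized input [1.0, 2.0, 3.0, 4.0, 5.0] is a strictly linear series, so its Pearson autocorrelation is exactly 1.0 at every lag, which is why result_expected is a series of three ones for lag_max = 3. The sketch below reproduces that expectation with plain pandas; it assumes utils.acf returns Pearson autocorrelations over lag_max consecutive lags (an assumed convention, not qolmat's actual implementation), and for a linear series the result is the same whether the lags start at 0 or 1.

import pandas as pd

def acf_sketch(values: pd.Series, lag_max: int) -> pd.Series:
    # Pearson correlation of the series with a lag-shifted copy of itself,
    # computed on the overlapping part, for lags 1..lag_max.
    return pd.Series([values.autocorr(lag=lag) for lag in range(1, lag_max + 1)])

# A strictly linear series is perfectly correlated with every shift of itself:
print(acf_sketch(pd.Series([1.0, 2.0, 3.0, 4.0, 5.0]), 3))  # 1.0, 1.0, 1.0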