diff --git a/docs/analysis.rst b/docs/analysis.rst index 4dcbd2ec..3a8af731 100644 --- a/docs/analysis.rst +++ b/docs/analysis.rst @@ -11,7 +11,7 @@ The analysis module provides tools to characterize the type of holes. The MNAR case is the trickiest: the user must first consider whether their missing-data mechanism is MNAR. In the meantime, we assume that the missing-data mechanism is ignorable (i.e., it is not MNAR). If an MNAR mechanism is suspected, please see this article :ref:`An approach to test for MNAR [1]` for relevant actions. -Then Qolmat proposes a test to determine whether the missing data mechanism is MCAR or MAR. +Then Qolmat proposes two tests to determine whether the missing-data mechanism is MCAR or MAR. 2. How to use the results ------------------------- @@ -45,12 +45,16 @@ The MCAR missing-data mechanism means that there is independence between the pre a. Little's Test ^^^^^^^^^^^^^^^^ -The best-known MCAR test is the :ref:`Little [2]` test, and it has been implemented in :class:`LittleTest`. Keep in mind that the Little's test is designed to test the homogeneity of means across the missing patterns and won't be efficient to detect the heterogeneity of covariance accross missing patterns. +The best-known MCAR test is the :ref:`Little [1]` test, and it has been implemented in :class:`LittleTest`. Keep in mind that Little's test is designed to test the homogeneity of means across the missing patterns and won't be effective at detecting the heterogeneity of covariance across missing patterns. b. PKLM Test ^^^^^^^^^^^^ -The :ref:`PKLM [2]` (Projected Kullback-Leibler MCAR) test compares the distributions of different missing patterns on random projections in the variable space of the data. This recent test applies to mixed-type data. It is not implemented yet in Qolmat. +The :ref:`PKLM [2]` (Projected Kullback-Leibler MCAR) test compares the distributions of different missing patterns on random projections in the variable space of the data. This recent test applies to mixed-type data. The :class:`PKLMTest` is now implemented in Qolmat. +To carry out this test, we perform random projections in the variable space of the data. These random projections allow us to construct a fully observed sub-matrix and the associated missing patterns. +The idea is then to compare the distributions of these missing patterns through the Kullback-Leibler divergence. +To do this, the distribution of each pattern is estimated using Random Forests.
+ References ---------- diff --git a/docs/images/schema_qolmat.png b/docs/images/schema_qolmat.png index 6f96338c..d5702a6e 100644 Binary files a/docs/images/schema_qolmat.png and b/docs/images/schema_qolmat.png differ diff --git a/examples/tutorials/plot_tuto_benchmark_TS.py b/examples/tutorials/plot_tuto_benchmark_TS.py index 1fdbbddb..a92398cb 100644 --- a/examples/tutorials/plot_tuto_benchmark_TS.py +++ b/examples/tutorials/plot_tuto_benchmark_TS.py @@ -78,7 +78,9 @@ ratio_masked = 0.1 imputer_median = imputers.ImputerSimple(groups=("station",), strategy="median") -imputer_interpol = imputers.ImputerInterpolation(groups=("station",), method="linear") +imputer_interpol = imputers.ImputerInterpolation( + groups=("station",), method="linear" +) imputer_residuals = imputers.ImputerResiduals( groups=("station",), period=365, @@ -103,7 +105,10 @@ ) generator_holes = missing_patterns.EmpiricalHoleGenerator( - n_splits=4, groups=("station",), subset=cols_to_impute, ratio_masked=ratio_masked + n_splits=4, + groups=("station",), + subset=cols_to_impute, + ratio_masked=ratio_masked, ) dict_imputers = { @@ -142,11 +147,17 @@ # Aotizhongxin df_plot = df[cols_to_impute] -dfs_imputed = {name: imp.fit_transform(df_plot) for name, imp in dict_imputers.items()} +dfs_imputed = { + name: imp.fit_transform(df_plot) for name, imp in dict_imputers.items() +} station = "Aotizhongxin" df_station = df_plot.loc[station] -dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()} -fig, axs = plt.subplots(3, 1, sharex=True, figsize=(10, 3 * len(cols_to_impute))) +dfs_imputed_station = { + name: df_plot.loc[station] for name, df_plot in dfs_imputed.items() +} +fig, axs = plt.subplots( + 3, 1, sharex=True, figsize=(10, 3 * len(cols_to_impute)) +) for col, ax in zip(cols_to_impute, axs.flatten()): values_orig = df_station[col] ax.plot(values_orig, ".", color="black", label="original") @@ -174,7 +185,9 @@ fig = plt.figure(figsize=(10, 10)) i_plot = 1 for i, col in enumerate(cols_to_impute[:-1]): - for i_imputer, (name_imputer, df_imp) in enumerate(dfs_imputed_station.items()): + for i_imputer, (name_imputer, df_imp) in enumerate( + dfs_imputed_station.items() + ): ax = fig.add_subplot(n_columns, n_imputers, i_plot) plot.compare_covariances( df_station, diff --git a/examples/tutorials/plot_tuto_diffusion_models.py b/examples/tutorials/plot_tuto_diffusion_models.py index 0ff0d80a..8275f3bb 100644 --- a/examples/tutorials/plot_tuto_diffusion_models.py +++ b/examples/tutorials/plot_tuto_diffusion_models.py @@ -66,7 +66,11 @@ df_data_valid = df_data.iloc[:500] tabddpm = ImputerDiffusion( - model=TabDDPM(), epochs=10, batch_size=100, x_valid=df_data_valid, print_valid=True + model=TabDDPM(), + epochs=10, + batch_size=100, + x_valid=df_data_valid, + print_valid=True, ) tabddpm = tabddpm.fit(df_data) @@ -150,8 +154,12 @@ # reconstruction errors (mae) but increases distribution distance (KL_columnwise). dict_imputers = { - "num_sampling=5": ImputerDiffusion(model=TabDDPM(num_sampling=5), epochs=10, batch_size=100), - "num_sampling=10": ImputerDiffusion(model=TabDDPM(num_sampling=10), epochs=10, batch_size=100), + "num_sampling=5": ImputerDiffusion( + model=TabDDPM(num_sampling=5), epochs=10, batch_size=100 + ), + "num_sampling=10": ImputerDiffusion( + model=TabDDPM(num_sampling=10), epochs=10, batch_size=100 + ), } comparison = comparator.Comparator( @@ -196,7 +204,9 @@ # but requires a longer training/inference time. 
dict_imputers = { - "tabddpm": ImputerDiffusion(model=TabDDPM(num_sampling=5), epochs=10, batch_size=100), + "tabddpm": ImputerDiffusion( + model=TabDDPM(num_sampling=5), epochs=10, batch_size=100 + ), "tsddpm": ImputerDiffusion( model=TsDDPM(num_sampling=5, is_rolling=False), epochs=10, diff --git a/examples/tutorials/plot_tuto_hole_generator.py b/examples/tutorials/plot_tuto_hole_generator.py index 07ea6348..d8a7f4c2 100644 --- a/examples/tutorials/plot_tuto_hole_generator.py +++ b/examples/tutorials/plot_tuto_hole_generator.py @@ -14,6 +14,7 @@ It consists of hourly air pollutant data from 12 Chinese nationally-controlled air-quality monitoring sites. """ + from typing import List import matplotlib @@ -49,7 +50,9 @@ # Missing values are in white, while observed ones are in black. plt.figure(figsize=(15, 4)) -plt.imshow(df.notna().values.T, aspect="auto", cmap="binary", interpolation="none") +plt.imshow( + df.notna().values.T, aspect="auto", cmap="binary", interpolation="none" +) plt.yticks(range(len(df.columns)), df.columns) plt.xlabel("Samples", fontsize=12) plt.grid(False) @@ -96,7 +99,9 @@ def visualise_missing_values(df_init: pd.DataFrame, df_mask: pd.DataFrame): colorsList = [(0.9, 0, 0), (0, 0, 0), (0.8, 0.8, 0.8)] custom_cmap = matplotlib.colors.ListedColormap(colorsList) plt.figure(figsize=(15, 4)) - plt.imshow(df_tot.values.T, aspect="auto", cmap=custom_cmap, interpolation="none") + plt.imshow( + df_tot.values.T, aspect="auto", cmap=custom_cmap, interpolation="none" + ) plt.yticks(range(len(df_tot.columns)), df_tot.columns) plt.xlabel("Samples", fontsize=12) plt.grid(False) @@ -156,7 +161,9 @@ def plot_cdf( _, axs = plt.subplots(1, df.shape[1], sharey=True, figsize=(15, 3)) hole_sizes_original = get_holes_sizes_column_wise(df.to_numpy()) - for ind, (hole_original, col) in enumerate(zip(hole_sizes_original, df.columns)): + for ind, (hole_original, col) in enumerate( + zip(hole_sizes_original, df.columns) + ): sorted_data = np.sort(hole_original) cdf = np.arange(1, len(sorted_data) + 1) / len(sorted_data) axs[ind].plot(sorted_data, cdf, c="gray", lw=2, label="original") @@ -166,7 +173,9 @@ array_mask[array_mask == True] = np.nan hole_sizes_created = get_holes_sizes_column_wise(array_mask.to_numpy()) - for ind, (hole_created, col) in enumerate(zip(hole_sizes_created, df.columns)): + for ind, (hole_created, col) in enumerate( + zip(hole_sizes_created, df.columns) + ): sorted_data = np.sort(hole_created) cdf = np.arange(1, len(sorted_data) + 1) / len(sorted_data) axs[ind].plot(sorted_data, cdf, c=color, lw=2, label=label) @@ -309,7 +318,13 @@ def plot_cdf( plot_cdf( df, - [uniform_mask, geometric_mask, empirical_mask, multi_markov_mask, grouped_mask], + [ + uniform_mask, + geometric_mask, + empirical_mask, + multi_markov_mask, + grouped_mask, + ], ["uniform", "geometric", "empirical", "multi markov", "grouped"], ["tab:orange", "tab:blue", "tab:green", "tab:pink", "tab:olive"], ) diff --git a/examples/tutorials/plot_tuto_mcar.py b/examples/tutorials/plot_tuto_mcar.py index a9bddb7f..fbbd0587 100644 --- a/examples/tutorials/plot_tuto_mcar.py +++ b/examples/tutorials/plot_tuto_mcar.py @@ -1,18 +1,20 @@ -"""============================================ +""" +============================================ Tutorial for Testing the MCAR Case ============================================ -In this tutorial, we show how to test the MCAR case using the Little's test. +In this tutorial, we show how to test the MCAR case using Little's test and the PKLM test. """ # %% # First import some libraries +from matplotlib import pyplot as plt + import numpy as np import pandas as pd -from matplotlib import pyplot as plt from scipy.stats import norm -from qolmat.analysis.holes_characterization import LittleTest +from qolmat.analysis.holes_characterization import LittleTest, PKLMTest from qolmat.benchmark.missing_patterns import UniformHoleGenerator plt.rcParams.update({"font.size": 12}) @@ -29,22 +31,32 @@ q975 = norm.ppf(0.975) # %% +# 1. Testing the MCAR case with Little's test and the PKLM test +# ------------------------------------------------------------------ +# # The Little's test -# --------------------------------------------------------------- +# ================= +# # First, we need to introduce the concept of a missing pattern. A missing pattern, also called a # pattern, is the structure of observed and missing values in a dataset. For example, in a # dataset with two columns, the possible patterns are: (0, 0), (1, 0), (0, 1), (1, 1). The value 1 # (0) indicates that the column value is missing (observed). # # The null hypothesis, H0, is: "The means of observations within each pattern are similar." + +# %% +# The PKLM test +# ============= +# The test compares the distributions of the different missing patterns. # +# The null hypothesis, H0, is: "The distributions of the different missing patterns are similar." # We choose to use the classic threshold of 5%. If the test p-value is below this threshold, # we reject the null hypothesis. -# -# This notebook shows how the Little's test performs on a simplistic case and its limitations. We -# instanciate a test object with a random state for reproducibility. +# This notebook shows how the Little and PKLM tests perform on a simplistic case and their +# limitations. We instantiate the test objects with a random state for reproducibility. -test_mcar = LittleTest(random_state=rng) +little_test_mcar = LittleTest(random_state=rng) +pklm_test_mcar = PKLMTest(random_state=rng) # %% # Case 1: MCAR holes (True negative) # ================================== @@ -61,8 +73,16 @@ df_observed = df.loc[~has_nan] df_hidden = df.loc[has_nan] -plt.scatter(df_observed["Column 1"], df_observed[["Column 2"]], label="Fully observed values") -plt.scatter(df_hidden[["Column 1"]], df_hidden[["Column 2"]], label="Values with missing C2") +plt.scatter( + df_observed["Column 1"], + df_observed[["Column 2"]], + label="Fully observed values", +) +plt.scatter( + df_hidden[["Column 1"]], + df_hidden[["Column 2"]], + label="Values with missing C2", +) plt.legend( loc="lower left", @@ -75,17 +95,21 @@ plt.show() # %% -result = test_mcar.test(df_nan) -print(f"Test p-value: {result:.2%}") +little_result = little_test_mcar.test(df_nan) +pklm_result = pklm_test_mcar.test(df_nan) +print(f"The p-value of Little's test is: {little_result:.2%}") +print(f"The p-value of the PKLM test is: {pklm_result:.2%}") # %% -# The p-value is larger than 0.05, therefore we don't reject the HO MCAR assumption. In this case -# this is a true negative. +# The two p-values are larger than 0.05; therefore, we don't reject the H0 MCAR assumption. +# In this case this is a true negative.
# %% # Case 2: MAR holes with mean bias (True positive) # ================================================ -df_mask = pd.DataFrame({"Column 1": False, "Column 2": df["Column 1"] > q975}, index=df.index) +df_mask = pd.DataFrame( + {"Column 1": False, "Column 2": df["Column 1"] > q975}, index=df.index +) df_nan = df.where(~df_mask, np.nan) @@ -93,8 +117,16 @@ df_observed = df.loc[~has_nan] df_hidden = df.loc[has_nan] -plt.scatter(df_observed["Column 1"], df_observed[["Column 2"]], label="Fully observed values") -plt.scatter(df_hidden[["Column 1"]], df_hidden[["Column 2"]], label="Values with missing C2") +plt.scatter( + df_observed["Column 1"], + df_observed[["Column 2"]], + label="Fully observed values", +) +plt.scatter( + df_hidden[["Column 1"]], + df_hidden[["Column 2"]], + label="Values with missing C2", +) plt.legend( loc="lower left", @@ -108,11 +140,13 @@ # %% -result = test_mcar.test(df_nan) -print(f"Test p-value: {result:.2%}") +little_result = little_test_mcar.test(df_nan) +pklm_result = pklm_test_mcar.test(df_nan) +print(f"The p-value of Little's test is: {little_result:.2%}") +print(f"The p-value of the PKLM test is: {pklm_result:.2%}") # %% -# The p-value is smaller than 0.05, therefore we reject the HO MCAR assumption. In this case -# this is a true positive. +# The two p-values are smaller than 0.05; therefore, we reject the H0 MCAR assumption. +# In this case this is a true positive. # %% # Case 3: MAR holes without mean bias (False negative) # ==================================================== # # This case is designed to emphasize the limits of Little's test. Here, we generate holes when the # absolute value of the first feature is high. This missingness mechanism is clearly # MAR, but the means between missing patterns are not statistically different. df_mask = pd.DataFrame( - {"Column 1": False, "Column 2": df["Column 1"].abs() > q975}, index=df.index + {"Column 1": False, "Column 2": df["Column 1"].abs() > q975}, + index=df.index, ) df_nan = df.where(~df_mask, np.nan) @@ -132,8 +167,16 @@ df_observed = df.loc[~has_nan] df_hidden = df.loc[has_nan] -plt.scatter(df_observed["Column 1"], df_observed[["Column 2"]], label="Fully observed values") -plt.scatter(df_hidden[["Column 1"]], df_hidden[["Column 2"]], label="Values with missing C2") +plt.scatter( + df_observed["Column 1"], + df_observed[["Column 2"]], + label="Fully observed values", +) +plt.scatter( + df_hidden[["Column 1"]], + df_hidden[["Column 2"]], + label="Values with missing C2", +) plt.legend( loc="lower left", @@ -147,17 +190,207 @@ # %% -result = test_mcar.test(df_nan) -print(f"Test p-value: {result:.2%}") +little_result = little_test_mcar.test(df_nan) +pklm_result = pklm_test_mcar.test(df_nan) +print(f"The p-value of Little's test is: {little_result:.2%}") +print(f"The p-value of the PKLM test is: {pklm_result:.2%}") # %% -# The p-value is larger than 0.05, therefore we don't reject the HO MCAR assumption. In this case -# this is a false negative since the missingness mechanism is MAR. +# Little's p-value is larger than 0.05; therefore, using this test, we don't reject the H0 MCAR +# assumption. In this case this is a false negative, since the missingness mechanism is MAR. +# +# However, the PKLM test p-value is smaller than 0.05; therefore, we reject the H0 MCAR +# assumption. In this case this is a true positive. # %% -# Limitations -# ----------- +# Limitations and conclusion +# ========================== # In this tutorial, we can see that Little's test fails to detect covariance heterogeneity between # patterns. # # We also note that Little's test does not handle categorical data or temporally # correlated data; the hedged sketch below illustrates the categorical limitation.
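# %%
# A hedged illustration (our addition): we assume that passing non-numeric
# data to :class:`LittleTest` raises an error, since the test relies on a
# Gaussian EM imputer. The exact exception type is an assumption.

df_mixed = pd.DataFrame(
    {"num": [1.0, np.nan, 3.0, 4.0], "cat": ["a", "b", np.nan, "d"]}
)
try:
    LittleTest(random_state=rng).test(df_mixed)
except Exception as error:
    print(f"Little's test cannot handle mixed data: {error}")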
+# +# This is why we have implemented the PKLM test, which makes up for the shortcomings of Little's +# test. We present this test in more detail in the next section. + +# %% +# 2. The PKLM test +# ------------------------------------------------------------------ +# +# The PKLM test is very powerful for several reasons. Firstly, it addresses the main shortcoming of +# Little's test (covariance heterogeneity across missing patterns). Secondly, it is currently the +# only MCAR test applicable to mixed data. Finally, it proposes a concept of partial p-value which +# enables us to carry out a variable-by-variable diagnosis to identify the potential causes of a +# MAR mechanism. +# +# There is a parameter in the paper called size.resp.set. The authors of the paper recommend +# setting this parameter to 2. We have chosen to follow this advice and not expose this parameter: +# the results are satisfactory and the code is simpler. +# +# It does have one disadvantage, however: its calculation time. +# + +# %% + +""" +Calculation time +================ + ++------------+------------+----------------------+ +| **n_rows** | **n_cols** | **Calculation_time** | ++============+============+======================+ +| 200 | 2 | 2"12 | ++------------+------------+----------------------+ +| 500 | 2 | 2"24 | ++------------+------------+----------------------+ +| 500 | 4 | 2"18 | ++------------+------------+----------------------+ +| 1000 | 4 | 2"48 | ++------------+------------+----------------------+ +| 1000 | 6 | 2"42 | ++------------+------------+----------------------+ +| 10000 | 6 | 20"54 | ++------------+------------+----------------------+ +| 10000 | 10 | 14"48 | ++------------+------------+----------------------+ +| 100000 | 10 | 4'51" | ++------------+------------+----------------------+ +| 100000 | 15 | 3'06" | ++------------+------------+----------------------+ +""" + +# %% +# 2.1 Parameters and Hyperparameters +# ================================================ +# +# To use the PKLM test properly, it may be necessary to understand its hyperparameters; a hedged +# configuration sketch follows the list below. +# +# * ``nb_projections``: Number of projections on which the test statistic is calculated. This +# parameter has the greatest influence on test calculation time. Its default value is +# ``nb_projections=100``. +# +# * ``nb_permutation``: Number of permutations of the projected targets. The higher, the better. +# This parameter has little impact on calculation time. +# Its default value is ``nb_permutation=30``. +# +# * ``nb_trees_per_proj``: The number of trees in each fitted random forest. In order to +# estimate the Kullback-Leibler divergence, we need to obtain probabilities of belonging to +# certain missing patterns. Random Forests are used to estimate these probabilities. This +# hyperparameter has a significant impact on test calculation time. Its default +# value is ``nb_trees_per_proj=200``. +# +# * ``compute_partial_p_values``: Boolean that indicates if you want to compute the partial +# p-values. Those partial p-values could help the user to identify the variables responsible for +# the MAR missing-data mechanism. Please see section 2.3 for examples. Its default value is +# ``compute_partial_p_values=False``. +# +# * ``encoder``: Encoder used to encode non-numerical values. Its default value is +# ``encoder=None``, in which case a ``category_encoders`` ``OneHotEncoder`` is used. +# +# * ``random_state``: Controls the randomness. Pass an int for reproducible output across +# multiple function calls. Its default value is ``random_state=None``.
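# %%
# A hedged configuration sketch (our addition): a faster but less powerful
# test instance, using only the parameters documented above. The values are
# illustrative, not recommendations.

fast_pklm_test = PKLMTest(
    nb_projections=20,  # fewer projections: faster, noisier statistic
    nb_trees_per_proj=50,  # smaller forests: faster OOB estimates
    random_state=rng,
)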
+ +# %% +# 2.2 Application on mixed data types +# ================================================ +# +# As we have seen, Little's test only applies to quantitative data. In real life, however, it is +# common to have to deal with mixed data. Here's an example of how to use the PKLM test on a +# dataset with mixed data types. + +# %% +n_rows = 100 + +col1 = rng.rand(n_rows) * 100 +col2 = rng.randint(1, 100, n_rows) +col3 = rng.choice([True, False], n_rows) +modalities = ["A", "B", "C", "D"] +col4 = rng.choice(modalities, n_rows) + +df = pd.DataFrame( + {"Numeric1": col1, "Numeric2": col2, "Boolean": col3, "Object": col4} +) + +hole_gen = UniformHoleGenerator( + n_splits=1, + ratio_masked=0.2, + subset=["Numeric1", "Numeric2", "Boolean", "Object"], + random_state=rng, +) +df_mask = hole_gen.generate_mask(df) +df_nan = df.where(~df_mask, np.nan) +df_nan.dtypes + +# %% +pklm_result = pklm_test_mcar.test(df_nan) +print(f"The p-value of the PKLM test is: {pklm_result:.2%}") + +# %% +# To perform the PKLM test over mixed data types, non-numerical features need to be encoded. The +# default encoder in the :class:`~qolmat.analysis.holes_characterization.PKLMTest` class is the +# ``OneHotEncoder`` from the ``category_encoders`` package. If you wish to use an encoder adapted +# to your data, you can perform this encoding step beforehand, and then use the PKLM test. +# Currently, we do not support the following types: +# +# - datetimes +# +# - timedeltas +# +# - Pandas datetimetz + +# %% +# 2.3 Partial p-values +# ================================================ +# +# In addition, the PKLM test can be used to calculate partial p-values. We compute as many partial +# p-values as there are columns in the input dataframe. The “partial” p-value for variable k +# corresponds to the effect of removing the patterns induced by variable k. +# +# Let's take a look at an example of how to use this feature. + +# %% +data = rng.multivariate_normal( + mean=[0, 0, 0, 0], + cov=[[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]], + size=400, +) +df = pd.DataFrame( + data=data, columns=["Column 1", "Column 2", "Column 3", "Column 4"] +) + +df_mask = pd.DataFrame( + { + "Column 1": False, + "Column 2": df["Column 1"] > q975, + "Column 3": False, + "Column 4": False, + }, + index=df.index, +) +df_nan = df.where(~df_mask, np.nan) + +# %% +# The missing-data mechanism is clearly MAR. Intuitively, if we remove the second column from the +# matrix, the missing-data mechanism is MCAR. Let's see how the PKLM test can help us identify the +# variable responsible for the MAR mechanism. + +# %% +pklm_test = PKLMTest(random_state=rng, compute_partial_p_values=True) +p_value, partial_p_values = pklm_test.test(df_nan) +print(f"The p-value of the PKLM test is: {p_value:.2%}") + +# %% +# The test result confirms that we can reject the null hypothesis and therefore assume that the +# missing-data mechanism is MAR. +# Let's now take a look at what partial p-values can tell us. + +# %% +for col_index, partial_p_v in enumerate(partial_p_values): + print( + f"The partial p-value for the column index {col_index + 1} is: {partial_p_v:.2%}" + ) + +# %% +# As a result, by removing the missing patterns induced by variable 2, the p-value rises +# above the significance threshold set beforehand. Thus, in this sense, the test detects that the +# main culprit of the MAR mechanism lies in the second variable.
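# %%
# A minimal follow-up sketch (our addition), reusing ``partial_p_values``
# and the 5% threshold from above: a partial p-value back above the
# threshold points at the column whose induced patterns drive the
# rejection.

threshold = 0.05
suspect_columns = [
    col_index + 1  # 1-based, to match the printout above
    for col_index, partial_p_v in enumerate(partial_p_values)
    if partial_p_v > threshold
]
print(f"Columns suspected of causing the MAR mechanism: {suspect_columns}")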
diff --git a/pyproject.toml b/pyproject.toml index a0f87501..3c2bc7aa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,6 @@ classifiers = [ [tool.poetry.dependencies] python = ">=3.8.1,<3.12" bump2version = "1.0.1" -dcor = "0.6" jupyter = "1.0.0" jupyterlab = "1.2.6" jupytext = "1.14.4" @@ -53,7 +52,8 @@ twine = "3.7.1" wheel = "0.37.1" category-encoders = "^2.6.3" ipykernel = "^6.29.5" -torch = "^2.4.0" +torch = "*" +dcor = "0.6" [tool.poetry.dev-dependencies] matplotlib = "3.6.2" @@ -116,7 +116,7 @@ docstring-code-format = true [tool.ruff.lint] select = ["C", "D", "E", "F", "I", "Q", "W"] -ignore = ["C901", "D107"] +ignore = ["C901", "D107", "D203", "D213"] [tool.ruff.lint.isort] known-first-party = ["qolmat"] diff --git a/qolmat/analysis/holes_characterization.py b/qolmat/analysis/holes_characterization.py index aa511052..1ab7cb6f 100644 --- a/qolmat/analysis/holes_characterization.py +++ b/qolmat/analysis/holes_characterization.py @@ -1,22 +1,70 @@ """Script for characterising the holes.""" from abc import ABC, abstractmethod -from typing import Optional, Union +from itertools import combinations +from typing import List, Optional, Tuple, Union import numpy as np import pandas as pd +from category_encoders.one_hot import OneHotEncoder +from joblib import Parallel, delayed from scipy.stats import chi2 +from sklearn import utils as sku +from sklearn.ensemble import RandomForestClassifier from qolmat.imputations.imputers import ImputerEM +from qolmat.utils.input_check import check_pd_df_dtypes class McarTest(ABC): - """Astract class for MCAR tests.""" + """Abstract class for MCAR tests. + + Parameters + ---------- + random_state : int or np.random.RandomState, optional + Seed or random state for reproducibility. + + Methods + ------- + test + Abstract method to perform the MCAR test on the given DataFrame or + NumPy array. + + """ + + def __init__( + self, random_state: Union[None, int, np.random.RandomState] = None + ): + """Initialize the McarTest class with a random state. + + Parameters + ---------- + random_state : int or np.random.RandomState, optional + Seed or random state for reproducibility. + + """ + self.rng = sku.check_random_state(random_state) @abstractmethod - def test(self, df: pd.DataFrame) -> float: - """Test function.""" - pass + def test( + self, df: Union[pd.DataFrame, np.ndarray] + ) -> Union[float, Tuple[float, List[float]]]: + """Perform the MCAR test on the input data. + + Parameters + ---------- + df : pd.DataFrame or np.ndarray + Data to be tested for MCAR. Can be provided as a pandas DataFrame + or a NumPy array. + + Returns + ------- + float or tuple of float and list of float + The p-value of the test, or a tuple with the p-value and + additional details if applicable. + + """ + raise NotImplementedError class LittleTest(McarTest): @@ -103,3 +151,633 @@ def test(self, df: pd.DataFrame) -> float: degree_f += tup_pattern.count(True) return 1 - float(chi2.cdf(d0, degree_f)) + + +class PKLMTest(McarTest): + """PKLM Test class. + + This class implements the PKLM test, a fully non-parametric, easy-to-use + and powerful test for the missing completely at random (MCAR) assumption on + the missingness mechanism of a dataset. + The null hypothesis is "The missing data mechanism is MCAR". + + This test is applicable to mixed data types (quantitative and categorical). + + If you're familiar with the paper, this implementation of the PKLM test was + made for the parameter size.resp.set=2 only. + + References + ---------- + Spohn, M.
L., Näf, J., Michel, L., & Meinshausen, N. (2021). PKLM: A + flexible MCAR test using Classification. arXiv preprint arXiv:2109.10150. + + Parameters + ---------- + nb_projections : int + Number of projections. + nb_projections_threshold : int + If the maximum number of possible projections is less than this + threshold, then all projections are used. Otherwise, nb_projections + random projections are drawn. + nb_permutation : int + Number of permutations. + nb_trees_per_proj : int + Number of trees per projection. + compute_partial_p_values : bool + If True, compute the partial p-values. + encoder : OneHotEncoder or None, default=None + Encoder to convert non-numeric pandas dataframe values to numeric + values. + random_state : int, RandomState instance or None, default=None + Controls the randomness. + Pass an int for reproducible output across multiple function calls. + + """ + + def __init__( + self, + nb_projections: int = 100, + nb_projections_threshold: int = 200, + nb_permutation: int = 30, + nb_trees_per_proj: int = 200, + compute_partial_p_values: bool = False, + encoder: Union[None, OneHotEncoder] = None, + random_state: Union[None, int, np.random.RandomState] = None, + ): + super().__init__(random_state=random_state) + self.nb_projections = nb_projections + self.nb_projections_threshold = nb_projections_threshold + self.nb_permutation = nb_permutation + self.nb_trees_per_proj = nb_trees_per_proj + self.compute_partial_p_values = compute_partial_p_values + self.encoder = encoder + + def _encode_dataframe(self, df: pd.DataFrame) -> np.ndarray: + """Encode the DataFrame. + + Encode the DataFrame by converting numeric columns to a numpy array + and applying one-hot encoding to object and boolean columns. + + Parameters + ---------- + df : pd.DataFrame + The DataFrame to be encoded. + + Returns + ------- + np.ndarray + The encoded DataFrame as a numpy ndarray, with numeric data + concatenated with one-hot encoded categorical and boolean data. + + """ + if not df.select_dtypes(include=["object", "bool"]).columns.to_list(): + return df.to_numpy() + + if not self.encoder: + self.encoder = OneHotEncoder( + cols=df.select_dtypes(include=["object", "bool"]).columns, + return_df=False, + handle_missing="return_nan", + ) + + return self.encoder.fit_transform(df) + + def _pklm_preprocessing( + self, X: Union[pd.DataFrame, np.ndarray] + ) -> np.ndarray: + """Preprocess the input DataFrame or ndarray for further processing. + + Parameters + ---------- + X : Union[pd.DataFrame, np.ndarray] + The input data to be preprocessed. Can be a pandas DataFrame or a + numpy ndarray. + + Returns + ------- + np.ndarray + The preprocessed data as a numpy ndarray. + + Raises + ------ + TypeNotHandled + If the DataFrame contains columns with data types that are not + numeric, string, or boolean. + + """ + if isinstance(X, np.ndarray): + return X + + check_pd_df_dtypes( + X, + [ + pd.api.types.is_numeric_dtype, + pd.api.types.is_string_dtype, + pd.api.types.is_bool_dtype, + ], + ) + return self._encode_dataframe(X) + + @staticmethod + def _get_max_draw(p: int) -> int: + """Calculate the number of possible projections. + + Parameters + ---------- + p : int + The number of columns of the input matrix. + + Returns + ------- + int + The number of possible projections. + + """ + return p * (2 ** (p - 1) - 1) + + def _draw_features_and_target_indexes( + self, X: np.ndarray + ) -> Tuple[List[int], int]: + """Randomly select features and a target from the dataframe.
+ + This corresponds to the Ai and Bi projections of the paper. + + Parameters + ---------- + X : np.ndarray + The input dataframe. + + Returns + ------- + Tuple[List[int], int] + Indices of the selected features and index of the target. + + """ + _, p = X.shape + nb_features = self.rng.randint(1, p) + features_idx = self.rng.choice(p, size=nb_features, replace=False) + target_idx = self.rng.choice(np.setdiff1d(np.arange(p), features_idx)) + return features_idx.tolist(), target_idx + + @staticmethod + def _check_draw( + X: np.ndarray, features_idx: List[int], target_idx: int + ) -> bool: + """Check if the drawn features and target are valid. + + Here we check + that the number of induced classes is equal to 2. Using the notation + from the paper, we want |G(Ai, Bi)| = 2. + + Parameters + ---------- + X : np.ndarray + The input dataframe. + features_idx : List[int] + Indices of the selected features. + target_idx : int + Index of the target. + + Returns + ------- + bool + True if the draw is valid, False otherwise. + + """ + target_values = X[~np.isnan(X[:, features_idx]).any(axis=1)][ + :, target_idx + ] + is_nan = np.isnan(target_values).any() + is_distinct_values = (~np.isnan(target_values)).any() + return is_nan and is_distinct_values + + def _generate_label_feature_combinations( + self, X: np.ndarray + ) -> List[Tuple[List[int], int]]: + """Generate all valid combinations of features and labels. + + Parameters + ---------- + X : np.ndarray + The input data array. + + Returns + ------- + List[Tuple[List[int], int]] + A list of tuples where each tuple contains a list of selected + features and a label that can be used for projection. + + """ + _, p = X.shape + indices = list(range(p)) + result = [] + + for label in indices: + feature_candidates = [i for i in indices if i != label] + + for r in range(1, len(feature_candidates) + 1): + for feature_set in combinations(feature_candidates, r): + if self._check_draw(X, list(feature_set), label): + result.append((list(feature_set), label)) + + return result + + def _draw_projection(self, X: np.ndarray) -> Tuple[List[int], int]: + """Draw a valid projection of features and a target. + + Used when _get_max_draw(X.shape[1]) exceeds + nb_projections_threshold. + + Parameters + ---------- + X : np.ndarray + The input dataframe. + + Returns + ------- + Tuple[List[int], int] + Indices of the selected features and index of the target. + + """ + is_checked = False + while not is_checked: + features_idx, target_idx = self._draw_features_and_target_indexes( + X + ) + is_checked = self._check_draw(X, features_idx, target_idx) + return features_idx, target_idx + + @staticmethod + def _build_dataset( + X: np.ndarray, features_idx: np.ndarray, target_idx: int + ) -> Tuple[np.ndarray, np.ndarray]: + """Build a dataset for a classification task. + + Build a dataset by selecting specified features and target from a + NumPy array, excluding rows with NaN values in the feature columns. + For the label, we create a binary classification problem where yi = 1 + if target_idx_i is missing. + + Parameters + ---------- + X: np.ndarray + Input data array. + features_idx: np.ndarray + Indices of the feature columns. + target_idx: int + Index of the target column. + + Returns + ------- + Tuple[np.ndarray, np.ndarray]: A tuple containing: + - X (np.ndarray): Full observed array of selected features. + - y (np.ndarray): Binary array indicating presence of NaN (1) in + the target column. + + """ + X_features = X[~np.isnan(X[:, features_idx]).any(axis=1)][ + :, features_idx + ] + y = np.where( + np.isnan( + X[~np.isnan(X[:, features_idx]).any(axis=1)][:, target_idx] + ), + 1, + 0, + ) + return X_features, y + + @staticmethod + def _build_label( + X: np.ndarray, + perm: np.ndarray, + features_idx: np.ndarray, + target_idx: int, + ) -> np.ndarray: + """Build a label. + + Build a label array by selecting target values from a permutation + array, excluding rows with NaN values in the specified feature columns. + For the label, we create a binary classification problem where yi = 1 + if target_idx_i is missing. + + Parameters + ---------- + X: np.ndarray + Input data array. + perm: np.ndarray + Permutation array from which labels are selected. + features_idx: np.ndarray + Indices of the feature columns. + target_idx: int + Index of the target column in the permutation array. + + Returns + ------- + np.ndarray: Binary array indicating presence of NaN (1) in the + target column. + + """ + return perm[~np.isnan(X[:, features_idx]).any(axis=1), target_idx] + + def _get_oob_probabilities( + self, X: np.ndarray, y: np.ndarray + ) -> np.ndarray: + """Retrieve out-of-bag probabilities. + + Train a RandomForestClassifier and retrieve the out-of-bag (OOB) + probabilities. + + Parameters + ---------- + X: np.ndarray + Feature array for training. + y: np.ndarray + Target array for training. + + Returns + ------- + np.ndarray: Out-of-bag probabilities for each class. + + """ + clf = RandomForestClassifier( + n_estimators=self.nb_trees_per_proj, + min_samples_split=10, + bootstrap=True, + oob_score=True, + random_state=self.rng, + max_features=1.0, + ) + clf.fit(X, y) + return clf.oob_decision_function_ + + @staticmethod + def _U_hat(oob_probabilities: np.ndarray, labels: np.ndarray) -> float: + """Compute the U_hat statistic. + + U_hat is a measure of classifier performance, using out-of-bag + probabilities and true labels. + + Parameters + ---------- + oob_probabilities: np.ndarray + Out-of-bag probabilities for each class. + labels: np.ndarray + True labels for the data. + + Returns + ------- + float: The computed U_hat statistic. + + """ + if oob_probabilities.shape[1] == 1: + return 0.0 + + oob_probabilities = np.clip(oob_probabilities, 1e-9, 1 - 1e-9) + + unique_labels = np.unique(labels) + label_matrix = (labels[:, None] == unique_labels).astype(int) + p_true = oob_probabilities * label_matrix + p_false = oob_probabilities * (1 - label_matrix) + + p0_0 = p_true[:, 0][np.where(p_true[:, 0] != 0.0)] + p0_1 = p_false[:, 0][np.where(p_false[:, 0] != 0.0)] + p1_1 = p_true[:, 1][np.where(p_true[:, 1] != 0.0)] + p1_0 = p_false[:, 1][np.where(p_false[:, 1] != 0.0)] + + if unique_labels.shape[0] == 1: + if unique_labels[0] == 0: + n0 = labels.shape[0] + return ( + np.log(p0_0 / (1 - p0_0)).sum() / n0 + - np.log(p1_0 / (1 - p1_0)).sum() / n0 + ) + else: + n1 = labels.shape[0] + return ( + np.log(p1_1 / (1 - p1_1)).sum() / n1 + - np.log(p0_1 / (1 - p0_1)).sum() / n1 + ) + + n0, n1 = label_matrix.sum(axis=0) + u_0 = ( + np.log(p0_0 / (1 - p0_0)).sum() / n0 + - np.log(p0_1 / (1 - p0_1)).sum() / n1 + ) + u_1 = ( + np.log(p1_1 / (1 - p1_1)).sum() / n1 + - np.log(p1_0 / (1 - p1_0)).sum() / n0 + ) + + return u_0 + u_1 + + def _parallel_process_permutation( + self, + X: np.ndarray, + M_perm: np.ndarray, + features_idx: np.ndarray, + target_idx: int, + oob_probabilities: np.ndarray, + ) -> float: + """Process a permutation.
+ + Parameters + ---------- + X : np.ndarray + input array + M_perm : np.ndarray + permutation array + features_idx : np.ndarray + index of the features + target_idx : int + index of the target + oob_probabilities : np.ndarray + out of bag probabilities + + Returns + ------- + float + estimated statistic U_hat + + """ + y = self._build_label(X, M_perm, features_idx, target_idx) + return self._U_hat(oob_probabilities, y) + + def _parallel_process_projection( + self, + X: np.ndarray, + list_permutations: List[np.ndarray], + features_idx: np.ndarray, + target_idx: int, + ) -> Tuple[float, List[float]]: + """Compute statistics for a projection. + + Parameters + ---------- + X : np.ndarray + input array + list_permutations : List[np.ndarray] + list of permutations + features_idx : np.ndarray + index of the features + target_idx : int + index of the target + + Returns + ------- + Tuple[float, List[float]] + estimated statistic u_hat and list of u_hat for each permutation + + """ + X_features, y = self._build_dataset(X, features_idx, target_idx) + oob_probabilities = self._get_oob_probabilities(X_features, y) + u_hat = self._U_hat(oob_probabilities, y) + # We iterate over the permutations because, for a given projection, we + # fit only one classifier to get the oob probabilities and compute + # u_hat nb_permutation times. + result_u_permutations = Parallel(n_jobs=-1)( + delayed(self._parallel_process_permutation)( + X, M_perm, features_idx, target_idx, oob_probabilities + ) + for M_perm in list_permutations + ) + return u_hat, result_u_permutations + + @staticmethod + def _build_B(list_proj: List, n_cols: int) -> np.ndarray: + """Construct a binary matrix B based on the given projections. + + Parameters + ---------- + list_proj : List + A list of tuples where each tuple represents a projection, and the + second element of each tuple is an index used to build the target. + n_cols : int + The number of columns in the resulting matrix B. + + Returns + ------- + np.ndarray + A binary matrix of shape (n_cols, len(list_proj)) where each column + corresponds to a projection, and the entries are 0 or 1 based on + the projections. + + """ + list_bi = [projection[1] for projection in list_proj] + B = np.ones((len(list_proj), n_cols), dtype=int) + + for j in range(len(list_bi)): + B[j, list_bi[j]] = 0 + + return B.transpose() + + def _compute_partial_p_value( + self, B: np.ndarray, U: np.ndarray, U_sigma: np.ndarray, k: int + ) -> float: + """Compute a partial p-value. + + Compute the partial p-value for the column of index k, i.e. the + p-value obtained after removing the projections whose target is built + from column k. + + Parameters + ---------- + B : np.ndarray + Binary matrix indicating the column used to create the target in + each projection. + U : np.ndarray + A vector of shape (nb_projections,) representing the observed test + statistic of each projection. + U_sigma : np.ndarray + A matrix of shape (nb_projections, nb_permutations) where each row + represents the test statistics of a given projection for all the + permutations. + k : int + The index of the column on which to compute the partial p_value. + + Returns + ------- + float + The partial p-value. + + """ + U_k = B[k, :] @ U + p_v_k = 1.0 + + for u_sigma_k in (B[k, :] @ U_sigma).tolist(): + if u_sigma_k >= U_k: + p_v_k += 1 + + return p_v_k / (self.nb_permutation + 1) + + def test( + self, X: Union[pd.DataFrame, np.ndarray] + ) -> Union[float, Tuple[float, List[float]]]: + """Apply the PKLM test over a real dataset. + + Parameters + ---------- + X : Union[pd.DataFrame, np.ndarray] + The input dataset with missing values.
+ + Returns + ------- + float + The p-value of the test, if compute_partial_p_values=False. + Tuple[float, List[float]] + The p-value of the test and the list of all the partial p-values, + if compute_partial_p_values=True. + + """ + X = self._pklm_preprocessing(X) + _, n_cols = X.shape + + if self._get_max_draw(n_cols) <= self.nb_projections_threshold: + list_proj = self._generate_label_feature_combinations(X) + else: + list_proj = [ + self._draw_projection(X) for _ in range(self.nb_projections) + ] + + M = np.isnan(X).astype(int) + list_perm = [ + self.rng.permutation(M) for _ in range(self.nb_permutation) + ] + U = 0.0 + list_U_sigma = [0.0 for _ in range(self.nb_permutation)] + + parallel_results = Parallel(n_jobs=-1)( + delayed(self._parallel_process_projection)( + X, list_perm, features_idx, target_idx + ) + for features_idx, target_idx in list_proj + ) + + for U_projection, results in parallel_results: + U += U_projection + list_U_sigma = [x + y for x, y in zip(list_U_sigma, results)] + + U = U / self.nb_projections + list_U_sigma = [x / self.nb_permutation for x in list_U_sigma] + + p_value = 1.0 + for u_sigma in list_U_sigma: + if u_sigma >= U: + p_value += 1 + + p_value = p_value / (self.nb_permutation + 1) + + if not self.compute_partial_p_values: + return p_value + else: + B = self._build_B(list_proj, n_cols) + U_matrix = np.array( + [np.atleast_1d(item[0]) for item in parallel_results] + ) + U_sigma = np.array( + [np.atleast_1d(item[1]) for item in parallel_results] + ) + p_values = [ + self._compute_partial_p_value(B, U_matrix, U_sigma, k) + for k in range(n_cols) + ] + return p_value, p_values diff --git a/qolmat/utils/data.py b/qolmat/utils/data.py index 1ae46f34..eeef323a 100644 --- a/qolmat/utils/data.py +++ b/qolmat/utils/data.py @@ -482,9 +482,9 @@ def add_datetime_features( df = df.copy() time = df.index.get_level_values(col_time).to_series() days_in_year = time.dt.year.apply( - lambda x: 366 - if ((x % 4 == 0) and (x % 100 != 0)) or (x % 400 == 0) - else 365 + lambda x: ( + 366 if ((x % 4 == 0) and (x % 100 != 0)) or (x % 400 == 0) else 365 + ) ) ratio = time.dt.dayofyear.values / days_in_year.values df["time_cos"] = np.cos(2 * np.pi * ratio) diff --git a/qolmat/utils/input_check.py b/qolmat/utils/input_check.py new file mode 100644 index 00000000..3c13508e --- /dev/null +++ b/qolmat/utils/input_check.py @@ -0,0 +1,34 @@ +"""Util file for input checks.""" +import pandas as pd + +from qolmat.utils.exceptions import TypeNotHandled + + +def check_pd_df_dtypes(df: pd.DataFrame, allowed_types: list): + """Validate that the columns of the DataFrame have allowed data types. + + Parameters + ---------- + df : pd.DataFrame + DataFrame whose columns' data types are to be checked. + allowed_types : list + List of dtype-checking callables (e.g. + ``pd.api.types.is_numeric_dtype``) defining the allowed data types. + + Raises + ------ + TypeNotHandled + If any column has a data type that is not in the allowed types.
+ + """ + + def is_allowed_type(dtype): + return any(check(dtype) for check in allowed_types) + + invalid_columns = [ + (col, dtype) + for col, dtype in df.dtypes.items() + if not is_allowed_type(dtype) + ] + if invalid_columns: + for column_name, dtype in invalid_columns: + raise TypeNotHandled(col=str(column_name), type_col=dtype) diff --git a/tests/analysis/test_holes_characterization.py b/tests/analysis/test_holes_characterization.py index a77ecbb1..bcc51e4e 100644 --- a/tests/analysis/test_holes_characterization.py +++ b/tests/analysis/test_holes_characterization.py @@ -3,10 +3,12 @@ import pytest from scipy.stats import norm -from qolmat.analysis.holes_characterization import LittleTest +from qolmat.analysis.holes_characterization import LittleTest, PKLMTest from qolmat.benchmark.missing_patterns import UniformHoleGenerator from qolmat.imputations.imputers import ImputerEM +### Tests for the LittleTest class + @pytest.fixture def mcar_df() -> pd.DataFrame: @@ -67,3 +69,201 @@ def test_little_mcar_test(df_input: pd.DataFrame, expected: bool, request): def test_attribute_error(): with pytest.raises(AttributeError): LittleTest(random_state=42, imputer=ImputerEM(model="VAR")) + + +### Tests for the PKLMTest class + + +@pytest.fixture +def supported_multitypes_dataframe() -> pd.DataFrame: + return pd.DataFrame( + { + "int_col": [1, 2, 3], + "float_col": [1.1, 2.2, 3.3], + "str_col": ["a", "b", "c"], + "bool_col": [True, False, True], + } + ) + + +@pytest.fixture +def np_matrix_with_nan_mcar() -> np.ndarray: + rng = np.random.default_rng(42) + n_rows, n_cols = 10, 4 + matrix = rng.normal(size=(n_rows, n_cols)) + num_nan = int(n_rows * n_cols * 0.40) + nan_indices = rng.choice(n_rows * n_cols, num_nan, replace=False) + matrix.flat[nan_indices] = np.nan + return matrix + + +@pytest.fixture +def missingness_matrix_mcar(np_matrix_with_nan_mcar): + return np.isnan(np_matrix_with_nan_mcar).astype(int) + + +@pytest.fixture +def missingness_matrix_mcar_perm(missingness_matrix_mcar): + rng = np.random.default_rng(42) + return rng.permutation(missingness_matrix_mcar) + + +@pytest.fixture +def oob_probabilities() -> np.ndarray: + return np.array([[0.5, 0.5], [0, 1], [1, 0], [1, 0]]) + + +def test__encode_dataframe(supported_multitypes_dataframe): + mcar_test_pklm = PKLMTest(random_state=42) + np_dataframe = mcar_test_pklm._encode_dataframe( + supported_multitypes_dataframe + ) + n_rows, n_cols = np_dataframe.shape + assert n_rows == 3 + assert n_cols == 7 + + +def test__draw_features_and_target_indexes(np_matrix_with_nan_mcar): + mcar_test_pklm = PKLMTest(random_state=42) + _, p = np_matrix_with_nan_mcar.shape + features_idx, target_idx = ( + mcar_test_pklm._draw_features_and_target_indexes( + np_matrix_with_nan_mcar + ) + ) + assert isinstance(target_idx, np.integer) + assert isinstance(features_idx, list) + assert target_idx not in features_idx + assert 0 <= target_idx <= (p - 1) + for feature_index in features_idx: + assert 0 <= feature_index <= (p - 1) + + +@pytest.mark.parametrize( + "dataframe_fixture, features_idx, target_idx, expected", + [ + ("np_matrix_with_nan_mcar", np.array([1, 0]), 2, True), + ("np_matrix_with_nan_mcar", np.array([1, 0, 2]), 3, False), + ], +) +def test__check_draw( + request, dataframe_fixture, features_idx, target_idx, expected +): + dataframe = request.getfixturevalue(dataframe_fixture) + mcar_test_pklm = PKLMTest() + result = mcar_test_pklm._check_draw(dataframe, features_idx, target_idx) + assert result == expected + + +@pytest.mark.parametrize("matrix_fixture", 
[("np_matrix_with_nan_mcar")]) +def test__generate_label_feature_combinations(request, matrix_fixture): + X = request.getfixturevalue(matrix_fixture) + _, n_cols = X.shape + mcar_test_pklm = PKLMTest() + result = mcar_test_pklm._generate_label_feature_combinations(X) + # Check that number of projections is smaller than possible + assert len(result) <= mcar_test_pklm._get_max_draw(n_cols) + # Check there are no duplicates + assert len({x for x in result if result.count(x) > 1}) == 0 + for features, label in result: + assert isinstance(label, int) + assert isinstance(features, list) + assert label not in features + for feature_index in features: + assert 0 <= feature_index <= (n_cols - 1) + + +@pytest.mark.parametrize( + "dataframe_fixture, features_idx, target_idx", + [ + ("np_matrix_with_nan_mcar", np.array([1, 0]), 2), + ], +) +def test__build_dataset(request, dataframe_fixture, features_idx, target_idx): + dataframe = request.getfixturevalue(dataframe_fixture) + mcar_test_pklm = PKLMTest() + X, y = mcar_test_pklm._build_dataset(dataframe, features_idx, target_idx) + assert X.shape[0] == len(y) + assert not np.any(np.isnan(X)) + assert not np.any(np.isnan(y)) + assert np.all(np.unique(y) == [0, 1]) + assert X.shape[1] == len(features_idx) + assert len(y.shape) == 1 + + +@pytest.mark.parametrize( + "dataframe_fixture, permutation_fixture, features_idx, target_idx", + [ + ( + "np_matrix_with_nan_mcar", + "missingness_matrix_mcar_perm", + np.array([1, 0]), + 2, + ), + ], +) +def test__build_label( + request, dataframe_fixture, permutation_fixture, features_idx, target_idx +): + dataframe = request.getfixturevalue(dataframe_fixture) + m_perm = request.getfixturevalue(permutation_fixture) + mcar_test_pklm = PKLMTest() + label = mcar_test_pklm._build_label( + dataframe, m_perm, features_idx, target_idx + ) + assert not np.any(np.isnan(label)) + assert len(label.shape) == 1 + assert np.isin(label, [0, 1]).all() + + +@pytest.mark.parametrize( + "oob_fixture, label", + [ + ("oob_probabilities", np.array([1, 1, 1, 1])), + ("oob_probabilities", np.array([0, 0, 0, 0])), + ], +) +def test__U_hat_unique_label(request, oob_fixture, label): + oob_prob = request.getfixturevalue(oob_fixture) + mcar_test_pklm = PKLMTest() + mcar_test_pklm._U_hat(oob_prob, label) + + +@pytest.mark.parametrize( + "oob_fixture, label, expected", + [ + ( + "oob_probabilities", + np.array([1, 0, 0, 0]), + 2 / 3 * (np.log(1 - 1e-9) - np.log(1e-9)), + ), + ], +) +def test__U_hat_computation(request, oob_fixture, label, expected): + oob_prob = request.getfixturevalue(oob_fixture) + mcar_test_pklm = PKLMTest() + u_hat = mcar_test_pklm._U_hat(oob_prob, label) + assert round(u_hat, 2) == round(expected, 2) + + +@pytest.mark.parametrize( + "list_proj, n_cols", + [ + ( + [ + (np.array([3, 1]), 0), + (np.array([0]), 1), + (np.array([3]), 0), + (np.array([1, 2]), 3), + (np.array([3, 0]), 2), + (np.array([0, 1]), 2), + ], + 4, + ) + ], +) +def test__build_B(list_proj, n_cols): + mcar_test_pklm = PKLMTest() + B = mcar_test_pklm._build_B(list_proj, n_cols) + column_sums = np.sum(B, axis=0) + assert np.all(column_sums == 3) diff --git a/tests/utils/test_input_check.py b/tests/utils/test_input_check.py new file mode 100644 index 00000000..073e6c0d --- /dev/null +++ b/tests/utils/test_input_check.py @@ -0,0 +1,55 @@ +import pandas as pd +import pytest + +from qolmat.utils.exceptions import TypeNotHandled +from qolmat.utils.input_check import check_pd_df_dtypes + + +@pytest.fixture +def multitypes_dataframe() -> pd.DataFrame: + return 
pd.DataFrame( + { + "int_col": [1, 2, 3], + "float_col": [1.1, 2.2, 3.3], + "str_col": ["a", "b", "c"], + "bool_col": [True, False, True], + "datetime_col": pd.to_datetime( + ["2021-01-01", "2021-01-02", "2021-01-03"] + ), + } + ) + + +@pytest.fixture +def supported_multitypes_dataframe() -> pd.DataFrame: + return pd.DataFrame( + { + "int_col": [1, 2, 3], + "float_col": [1.1, 2.2, 3.3], + "str_col": ["a", "b", "c"], + "bool_col": [True, False, True], + } + ) + + +def test__check_pd_df_dtypes_raise_error(multitypes_dataframe): + with pytest.raises(TypeNotHandled): + check_pd_df_dtypes( + multitypes_dataframe, + [ + pd.api.types.is_numeric_dtype, + pd.api.types.is_string_dtype, + pd.api.types.is_bool_dtype, + ], + ) + + +def test__check_pd_df_dtypes(supported_multitypes_dataframe): + check_pd_df_dtypes( + supported_multitypes_dataframe, + [ + pd.api.types.is_numeric_dtype, + pd.api.types.is_string_dtype, + pd.api.types.is_bool_dtype, + ], + )
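# A hedged extra case (our addition, not part of the original PR): with an
# empty list of allowed types, every column should be rejected.
def test__check_pd_df_dtypes_empty_allowed_types(
    supported_multitypes_dataframe,
):
    with pytest.raises(TypeNotHandled):
        check_pd_df_dtypes(supported_multitypes_dataframe, [])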