1- """============================================
1+ """
2+ ============================================
23Tutorial for Testing the MCAR Case
34============================================
45
5- In this tutorial, we show how to test the MCAR case using the Little's test .
6+ In this tutorial, we show how to test the MCAR case using the Little and the PKLM tests .
67"""

# %%
# First import some libraries
from matplotlib import pyplot as plt

import numpy as np
import pandas as pd
from scipy.stats import norm

from qolmat.analysis.holes_characterization import LittleTest, PKLMTest
from qolmat.benchmark.missing_patterns import UniformHoleGenerator

plt.rcParams.update({"font.size": 12})
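
# %%
# A seeded random generator is used by every example below (the exact seed value is illustrative).
rng = np.random.RandomState(42)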
q975 = norm.ppf(0.975)

# %%
# 1. Testing the MCAR case with Little's test and the PKLM test
# ---------------------------------------------------------------
#
# Little's test
# =============
#
# First, we need to introduce the concept of a missing pattern. A missing pattern, also called a
# pattern, is the structure of observed and missing values in a dataset. For example, in a
# dataset with two columns, the possible patterns are: (0, 0), (1, 0), (0, 1), (1, 1). The value 1
# (0) indicates that the column value is missing (observed).
#
# The null hypothesis, H0, is: "The means of observations within each pattern are similar."
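
# %%
# As a quick illustration, the missing patterns present in a dataset can be enumerated with
# pandas (the toy dataframe below is used only for this illustration):

df_toy = pd.DataFrame({"col1": [1.0, np.nan, 3.0, np.nan], "col2": [np.nan, 2.0, 3.0, 4.0]})
# Each row of `patterns` is one missing pattern: 1 means missing, 0 means observed.
patterns = df_toy.isna().astype(int).drop_duplicates()
print(patterns)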

# %%
# The PKLM test
# =============
# The PKLM test compares the distributions of observations across the different missing patterns.
#
# The null hypothesis, H0, is: "Distributions within each pattern are similar."
#
# We choose to use the classic threshold of 5%. If the test p-value is below this threshold,
# we reject the null hypothesis.
#
# This notebook shows how the Little and PKLM tests perform on a simplistic case and their
# limitations. We instantiate the test objects with a random state for reproducibility.

little_test_mcar = LittleTest(random_state=rng)
pklm_test_mcar = PKLMTest(random_state=rng)

# %%
# Case 1: MCAR holes (True negative)
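
# The data-generation code for this case is elided in this excerpt. A minimal sketch of such a
# setup (the exact data and mask of the original case may differ): draw bivariate normal data,
# then punch MCAR holes with a UniformHoleGenerator.
data = rng.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200)
df = pd.DataFrame(data=data, columns=["Column 1", "Column 2"])

hole_gen = UniformHoleGenerator(
    n_splits=1, ratio_masked=0.2, subset=["Column 2"], random_state=rng
)
df_mask = hole_gen.generate_mask(df)
df_nan = df.where(~df_mask, np.nan)

plt.scatter(df_nan["Column 1"], df_nan["Column 2"])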
plt.show()

# %%
little_result = little_test_mcar.test(df_nan)
pklm_result = pklm_test_mcar.test(df_nan)
print(f"The p-value of Little's test is: {little_result:.2%}")
print(f"The p-value of the PKLM test is: {pklm_result:.2%}")
# %%
# The two p-values are larger than 0.05, therefore we don't reject the H0 MCAR assumption.
# In this case this is a true negative.

# %%
# Case 2: MAR holes with mean bias (True positive)
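
# The generation code is again elided. A sketch under the same assumptions as in Case 1, except
# that the mask now depends on the observed values of "Column 1", which shifts the mean of
# "Column 1" between patterns (MAR):
data = rng.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200)
df = pd.DataFrame(data=data, columns=["Column 1", "Column 2"])
df_mask = pd.DataFrame(
    {"Column 1": False, "Column 2": df["Column 1"] > q975}, index=df.index
)
df_nan = df.where(~df_mask, np.nan)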

# %%

little_result = little_test_mcar.test(df_nan)
pklm_result = pklm_test_mcar.test(df_nan)
print(f"The p-value of Little's test is: {little_result:.2%}")
print(f"The p-value of the PKLM test is: {pklm_result:.2%}")
# %%
# The two p-values are smaller than 0.05, therefore we reject the H0 MCAR assumption.
# In this case this is a true positive.

# %%
# Case 3: MAR holes without any mean bias (False negative for Little's test)
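
# Elided generation code; a sketch under the same assumptions: "Column 2" is missing whenever
# "Column 1" falls in either tail, so the pattern means stay equal while the covariances differ
# (MAR without mean bias):
data = rng.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200)
df = pd.DataFrame(data=data, columns=["Column 1", "Column 2"])
df_mask = pd.DataFrame(
    {"Column 1": False, "Column 2": df["Column 1"].abs() > q975}, index=df.index
)
df_nan = df.where(~df_mask, np.nan)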

# %%

little_result = little_test_mcar.test(df_nan)
pklm_result = pklm_test_mcar.test(df_nan)
print(f"The p-value of Little's test is: {little_result:.2%}")
print(f"The p-value of the PKLM test is: {pklm_result:.2%}")
# %%
# The p-value of Little's test is larger than 0.05; therefore, using this test, we don't reject
# the H0 MCAR assumption. In this case this is a false negative since the missingness mechanism
# is MAR.
#
# However, the PKLM test p-value is smaller than 0.05, therefore we reject the H0 MCAR
# assumption. In this case this is a true positive.

# %%
# Limitations and conclusion
# ==========================
# In this tutorial, we can see that Little's test fails to detect covariance heterogeneity
# between patterns.
#
# We also note that Little's test does not handle categorical data or temporally
# correlated data.
#
# This is why we have implemented the PKLM test, which makes up for the shortcomings of
# Little's test. We present this test in more detail in the next section.

# %%
# 2. The PKLM test
# ------------------------------------------------------------------
#
# The PKLM test is very powerful for several reasons. Firstly, it covers the shortcoming of
# Little's test illustrated above (covariance heterogeneity). Secondly, it is currently the only
# MCAR test applicable to mixed data. Finally, it offers a concept of partial p-value which
# enables us to carry out a variable-by-variable diagnosis to identify the potential causes of a
# MAR mechanism.
#
# The paper introduces a parameter called size.res.set. Its authors recommend setting this
# parameter to 2, and we have chosen to follow this advice without exposing it as a parameter:
# the results are satisfactory and the code is simpler.
#
# The test does have one disadvantage, however: its calculation time.
#

# %%

207+ """
208+ Calculation time
209+ ================
210+
211+ +------------+------------+----------------------+
212+ | **n_rows** | **n_cols** | **Calculation_time** |
213+ +============+============+======================+
214+ | 200 | 2 | 2"12 |
215+ +------------+------------+----------------------+
216+ | 500 | 2 | 2"24 |
217+ +------------+------------+----------------------+
218+ | 500 | 4 | 2"18 |
219+ +------------+------------+----------------------+
220+ | 1000 | 4 | 2"48 |
221+ +------------+------------+----------------------+
222+ | 1000 | 6 | 2"42 |
223+ +------------+------------+----------------------+
224+ | 10000 | 6 | 20"54 |
225+ +------------+------------+----------------------+
226+ | 10000 | 10 | 14"48 |
227+ +------------+------------+----------------------+
228+ | 100000 | 10 | 4'51" |
229+ +------------+------------+----------------------+
230+ | 100000 | 15 | 3'06" |
231+ +------------+------------+----------------------+
232+ """

# %%
# 2.1 Parameters and Hyperparameters
# ================================================
#
# To use the PKLM test properly, it may be necessary to understand its hyperparameters.
#
# * ``nb_projections``: Number of projections on which the test statistic is computed. This
#   parameter has the greatest influence on the test's calculation time. Its default value is
#   ``nb_projections=100``.
#
# * ``nb_permutation``: Number of permutations of the projected targets. The higher, the better.
#   This parameter has little impact on the calculation time. Its default value is
#   ``nb_permutation=30``.
#
# * ``nb_trees_per_proj``: Number of subtrees in each fitted random forest. In order to estimate
#   the Kullback-Leibler divergence, we need to obtain the probabilities of belonging to certain
#   missing patterns, and random forests are used to estimate these probabilities. This
#   hyperparameter has a significant impact on the test's calculation time. Its default value is
#   ``nb_trees_per_proj=200``.
#
# * ``compute_partial_p_values``: Boolean indicating whether to compute the partial p-values.
#   These partial p-values can help the user identify the variables responsible for a MAR
#   missing-data mechanism (see section 2.3 for an example). Its default value is
#   ``compute_partial_p_values=False``.
#
# * ``encoder``: Scikit-learn encoder used to encode non-numerical values. Its default value is
#   ``encoder=sklearn.preprocessing.OneHotEncoder()``.
#
# * ``random_state``: Controls the randomness. Pass an int for reproducible output across
#   multiple function calls. Its default value is ``random_state=None``.
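
# %%
# For instance, one can trade statistical precision for speed (the values below are purely
# illustrative, not recommendations):

pklm_test_fast = PKLMTest(
    nb_projections=50,  # fewer projections: faster, but a noisier test statistic
    nb_permutation=30,  # number of permutations of the projected targets
    nb_trees_per_proj=100,  # smaller forests: faster probability estimates
    random_state=rng,
)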

# %%
# 2.2 Application on mixed data types
# ================================================
#
# As we have seen, Little's test only applies to quantitative data. In real life, however, it is
# common to have to deal with mixed data. Here's an example of how to use the PKLM test on a
# dataset with mixed data types.

# %%
n_rows = 100

col1 = rng.rand(n_rows) * 100
col2 = rng.randint(1, 100, n_rows)
col3 = rng.choice([True, False], n_rows)
modalities = ["A", "B", "C", "D"]
col4 = rng.choice(modalities, n_rows)

df = pd.DataFrame({
    "Numeric1": col1,
    "Numeric2": col2,
    "Boolean": col3,
    "Object": col4,
})

hole_gen = UniformHoleGenerator(
    n_splits=1,
    ratio_masked=0.2,
    subset=["Numeric1", "Numeric2", "Boolean", "Object"],
    random_state=rng,
)
df_mask = hole_gen.generate_mask(df)
df_nan = df.where(~df_mask, np.nan)
df_nan.dtypes

# %%
pklm_result = pklm_test_mcar.test(df_nan)
print(f"The p-value of the PKLM test is: {pklm_result:.2%}")

# %%
# To perform the PKLM test on mixed data types, non-numerical features need to be encoded. The
# default encoder in the :class:`~qolmat.analysis.holes_characterization.PKLMTest` class is the
# OneHotEncoder from scikit-learn. If you wish to use an encoder better adapted to your data,
# you can either pass it to the class or perform the encoding step beforehand, and then use the
# PKLM test (see the sketch below).
# Currently, we do not support the following types:
#
# - datetimes
#
# - timedeltas
#
# - Pandas datetimetz
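
# %%
# A minimal sketch of swapping in another scikit-learn encoder (OrdinalEncoder is chosen here
# purely for illustration; whether it suits your data is up to you):

from sklearn.preprocessing import OrdinalEncoder

pklm_test_ordinal = PKLMTest(encoder=OrdinalEncoder(), random_state=rng)
# The instance is then used exactly as before, e.g. pklm_test_ordinal.test(df_nan).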

# %%
# 2.3 Partial p-values
# ================================================
#
# In addition, the PKLM test can be used to compute partial p-values. There are as many partial
# p-values as there are columns in the input dataframe. The "partial" p-value for column k
# corresponds to the effect of removing the patterns induced by variable k.
#
# Let's take a look at an example of how to use this feature.

# %%
data = rng.multivariate_normal(
    mean=[0, 0, 0, 0],
    cov=[[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]],
    size=400,
)
df = pd.DataFrame(data=data, columns=["Column 1", "Column 2", "Column 3", "Column 4"])

df_mask = pd.DataFrame(
    {
        "Column 1": False,
        "Column 2": df["Column 1"] > q975,
        "Column 3": False,
        "Column 4": False,
    },
    index=df.index,
)
df_nan = df.where(~df_mask, np.nan)

# %%
# The missing-data mechanism is clearly MAR. Intuitively, if we remove the second column from the
# matrix, the missing-data mechanism is MCAR. Let's see how the PKLM test can help us identify the
# variable responsible for the MAR mechanism.

# %%
pklm_test = PKLMTest(random_state=rng, compute_partial_p_values=True)
p_value, partial_p_values = pklm_test.test(df_nan)
print(f"The p-value of the PKLM test is: {p_value:.2%}")

# %%
# The test result confirms that we can reject the null hypothesis and therefore assume that the
# missing-data mechanism is MAR.
# Let's now take a look at what the partial p-values can tell us.

# %%
for col_index, partial_p_v in enumerate(partial_p_values):
    print(f"The partial p-value for column {col_index + 1} is: {partial_p_v:.2%}")

# %%
# As a result, by removing the missing patterns induced by variable 2, the p-value rises above
# the significance threshold set beforehand. In this sense, the test detects that the main
# culprit of the MAR mechanism lies in the second variable.