11import warnings
2- import pandas as pd
3- import scipy .stats as stats
4- import numpy as np
52from typing import List
3+
64import matplotlib as matplotlib
5+ import numpy as np
6+ import pandas as pd
7+ import scipy .stats as stats
78import seaborn as sns
89
10+
def fillna(object):
    """
    Replace missing entries with 0.

    Parameters:
    -----------
    object : pandas Series or iterable
             For a pandas Series the replacement is delegated to
             ``Series.fillna(0)``. Any other iterable is walked element by
             element and only values that are ``None`` are replaced.

    Returns:
    --------
    pandas Series when a Series was passed in, otherwise a numpy array.
    """
    # NOTE: the parameter name shadows the builtin `object`; it is kept
    # because callers may pass it by keyword.
    if not isinstance(object, pd.Series):
        return np.array([0 if value is None else value for value in object])
    return object.fillna(0)
1416
15- def corr (x ,
16- y ,
17- bias_correction = True ,
18- Tschuprow = False ):
17+
18+ def corr (x , y , bias_correction = True , Tschuprow = False ):
1919 """
2020 Calculates correlation statistic for categorical-categorical association.
2121 The two measures supported are:
2222 1. Cramer'V ( default )
2323 2. Tschuprow'T
2424
2525 Bias correction and formula's taken from : https://www.researchgate.net/publication/270277061_A_bias-correction_for_Cramer's_V_and_Tschuprow's_T
26-
26+
2727 Wikipedia for Cramer's V: https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V
2828 Wikipedia for Tschuprow' T: https://en.wikipedia.org/wiki/Tschuprow%27s_T
2929 Parameters:
@@ -44,34 +44,48 @@ def corr(x,
4444 x , y = fillna (x ), fillna (y )
4545 crosstab_matrix = pd .crosstab (x , y )
4646 n_observations = crosstab_matrix .sum ().sum ()
47- chi2 , p , dof , expected = stats .chi2_contingency (crosstab_matrix )
47+
48+ yates_correct = True
49+ if bias_correction :
50+ if crosstab_matrix .shape == (2 , 2 ):
51+ yates_correct = False
52+
53+ chi2 , _ , _ , _ = stats .chi2_contingency (
54+ crosstab_matrix , correction = yates_correct
55+ )
4856 phi2 = chi2 / n_observations
4957
5058 # r and c are number of categories of x and y
5159 r , c = crosstab_matrix .shape
5260 if bias_correction :
5361 phi2_corrected = max (0 , phi2 - ((r - 1 ) * (c - 1 )) / (n_observations - 1 ))
54- r_corrected = r - ((r - 1 )** 2 ) / (n_observations - 1 )
55- c_corrected = c - ((c - 1 )** 2 ) / (n_observations - 1 )
62+ r_corrected = r - ((r - 1 ) ** 2 ) / (n_observations - 1 )
63+ c_corrected = c - ((c - 1 ) ** 2 ) / (n_observations - 1 )
5664 if Tschuprow :
57- corr_coeff = np .sqrt (phi2_corrected / np .sqrt ((r_corrected - 1 )* (c_corrected - 1 )))
65+ corr_coeff = np .sqrt (
66+ phi2_corrected / np .sqrt ((r_corrected - 1 ) * (c_corrected - 1 ))
67+ )
5868 return corr_coeff
59- corr_coeff = np .sqrt (phi2_corrected / min ((r_corrected - 1 ), (c_corrected - 1 )))
69+ corr_coeff = np .sqrt (
70+ phi2_corrected / min ((r_corrected - 1 ), (c_corrected - 1 ))
71+ )
6072 return corr_coeff
6173 if Tschuprow :
62- corr_coeff = np .sqrt (phi2 / np .sqrt ((r - 1 )* (c - 1 )))
63- return corr_coeff
74+ corr_coeff = np .sqrt (phi2 / np .sqrt ((r - 1 ) * (c - 1 )))
75+ return corr_coeff
6476 corr_coeff = np .sqrt (phi2 / min ((r - 1 ), (c - 1 )))
6577 return corr_coeff
66- except :
67- warnings .warn ("Error calculating Cramer's V" ,RuntimeWarning )
78+ except Exception :
79+ warnings .warn ("Error calculating Cramer's V" , RuntimeWarning )
6880 return corr_coeff
6981
7082
def corr_matrix(
    data: pd.DataFrame,
    columns: List,
    bias_correction: bool = True,
    Tschuprow: bool = False,
) -> pd.DataFrame:
    """
    Calculates correlation for all the columns provided and returns pandas like correlation matrix.
    Every pair of selected columns is scored with ``corr``; the measure is
    chosen by the ``Tschuprow`` flag that is forwarded to it.

    Parameters:
    -----------
    data : pandas DataFrame
           A pandas DataFrame containing the categorical columns
    columns : list
              A list of categorical columns
    bias_correction : Boolean, default = True
                      Forwarded unchanged to ``corr``.
    Tschuprow : Boolean, default = False
                Forwarded unchanged to ``corr``.

    Returns:
    --------
    pandas dataframe object similar to pandas.DataFrame.corr()

    Raises:
    -------
    ValueError
        If ``columns`` is empty or none of its entries is a column of ``data``.
    """
    # Validate the column list. The previous check built the exception without
    # raising it and tested for a non-empty intersection instead of an empty one.
    if len(columns) == 0 or not set(data.columns.values).intersection(columns):
        raise ValueError("Check the columns list provided")

    # `filter` keeps only the requested columns that exist, in DataFrame order.
    target_data = data.filter(columns)
    cols = target_data.columns.values
    size = len(cols)

    matrix = np.zeros((size, size))
    for row, col_i in enumerate(cols):
        for col, col_j in enumerate(cols):
            matrix[row, col] = corr(
                target_data[col_i],
                target_data[col_j],
                bias_correction=bias_correction,
                Tschuprow=Tschuprow,
            )

    return pd.DataFrame(data=matrix, index=cols, columns=cols)
113136
def plot_corr(
    data: pd.DataFrame,
    columns: List,
    diagonal: bool = False,
    bias_correction: bool = True,
    Tschuprow: bool = False,
) -> matplotlib.axes.Axes:
    """
    Plots correlation matrix for all the columns provided and returns Matplotlib axes.
    The matrix is computed with ``corr_matrix`` and drawn as an annotated
    seaborn heatmap.

    Parameters:
    -----------
    data : pandas DataFrame
           A pandas DataFrame containing the categorical columns
    columns : list
              A list of categorical columns
    diagonal : Boolean, default = False
               When true gives a masked version of heatmap: the upper
               triangle, including the main diagonal, is hidden.
    bias_correction : Boolean, default = True
                      Forwarded unchanged to ``corr_matrix``.
    Tschuprow : Boolean, default = False
                Forwarded unchanged to ``corr_matrix``.

    Returns:
    --------
    ax : matplotlib Axes
         Axes object with the heatmap.
    """
    matrix = corr_matrix(
        data, columns, bias_correction=bias_correction, Tschuprow=Tschuprow
    )
    if not diagonal:
        return sns.heatmap(matrix, annot=True)

    # Build the mask from cell positions, not cell values. Using
    # np.triu(matrix) directly left zero-valued upper-triangle cells unmasked.
    mask = np.triu(np.ones(matrix.shape, dtype=bool))
    return sns.heatmap(matrix, annot=True, mask=mask)
0 commit comments