@@ -18,20 +18,17 @@ def load_data_csv(path):
def summary_stats(df):
    """Print a structural overview and descriptive statistics of ``df``.

    Output, in order: the ``DataFrame.info()`` report, ``describe()`` of the
    frame, and ``describe()`` restricted to object-dtype columns.

    Args:
        df: pandas DataFrame to summarize.

    Returns:
        None. Everything is written to standard output.
    """
    # ``DataFrame.info()`` writes its report itself and returns None, so
    # wrapping it in ``print`` would emit a stray "None" line.
    df.info()
    print("\n --- Describe Numerical ---\n ", df.describe())
    print("\n --- Describe Categorical ---\n ")

    categorical = df.select_dtypes(include="object")
    if categorical.shape[1] == 0:
        # ``describe()`` raises ValueError on a frame without columns.
        print("No object-dtype columns to describe.")
        return
    print(categorical.describe())
2724
2825
2926def logarithmic_numerical_distribution (
3027 df ,
3128 columns = None ,
3229):
3330 if columns is None :
34- columns = [' Amount' , ' Value' , ' FraudResult' ]
31+ columns = [" Amount" , " Value" , " FraudResult" ]
3532
3633 """
3734 Plot log-scale histograms for positive and negative values of specified columns.
@@ -44,49 +41,54 @@ def logarithmic_numerical_distribution(
4441 pos_vals = df [df [column ] > 0 ][column ]
4542 if not pos_vals .empty :
4643 plt .hist (pos_vals , bins = 50 , log = True )
47- plt .title (f' { column } (positive values, log scale)' )
44+ plt .title (f" { column } (positive values, log scale)" )
4845 plt .xlabel (column )
49- plt .ylabel (' Frequency (log scale)' )
46+ plt .ylabel (" Frequency (log scale)" )
5047 plt .show ()
5148
5249 neg_vals = np .abs (df [df [column ] < 0 ][column ])
5350 if not neg_vals .empty :
5451 plt .hist (neg_vals , bins = 50 , log = True )
55- plt .title (f' { column } (negative abs values, log scale)' )
56- plt .xlabel (f' Absolute { column } ' )
57- plt .ylabel (' Frequency (log scale)' )
52+ plt .title (f" { column } (negative abs values, log scale)" )
53+ plt .xlabel (f" Absolute { column } " )
54+ plt .ylabel (" Frequency (log scale)" )
5855 plt .show ()
5956
6057
# Default columns plotted by plot_numeric_distributions. Labels carry no
# surrounding whitespace so they match the spelling used in cat_cols.
num_cols = ["CurrencyCode", "CountryCode", "PricingStrategy"]
6259
6360
def plot_numeric_distributions(df, num_cols=num_cols):
    """Show one histogram per requested column.

    Args:
        df: DataFrame holding the columns to plot.
        num_cols: Column labels to plot. Defaults to the module-level
            ``num_cols`` list; pass ``None`` to plot every int64/float64
            column of ``df``.

    Returns:
        None. Each histogram is displayed with ``plt.show()``.
    """
    if num_cols is None:
        # dtype names must be exact: stray whitespace is not a valid dtype.
        num_cols = df.select_dtypes(include=["int64", "float64"]).columns

    for col in num_cols:
        plt.figure(figsize=(8, 4))
        # Missing values are dropped before binning.
        plt.hist(df[col].dropna(), bins=30, edgecolor="black")
        plt.title(f"Distribution of {col} ")
        plt.xlabel(col)
        plt.ylabel("Frequency")
        plt.grid(True, linestyle="--", alpha=0.5)
        plt.tight_layout()
        plt.show()
7875
7976
# Default columns plotted by plot_categorical_distributions.
cat_cols = [
    "CurrencyCode",
    "CountryCode",
    "ProviderId",
    "ProductCategory",
    "ChannelId",
    "PricingStrategy",
    "FraudResult",
]
8486
8587
8688def plot_categorical_distributions (df , cat_cols = cat_cols , top_k = 10 ):
8789 """Plot bar plots for categorical features"""
8890 if cat_cols is None :
89- cat_cols = df .select_dtypes (include = ' object' ).columns
91+ cat_cols = df .select_dtypes (include = " object" ).columns
9092
9193 for col in cat_cols :
9294 plt .figure (figsize = (8 , 4 ))
@@ -101,18 +103,17 @@ def check_missing_values(df):
101103 """Display missing value counts and percentages"""
102104 missing = df .isnull ().sum ()
103105 missing_percent = (missing / len (df )) * 100
104- missing_df = pd .DataFrame ({
105- 'Missing Values' : missing ,
106- 'Percent' : missing_percent
107- })
108- print (missing_df [missing_df ['Missing Values' ] > 0 ])
106+ missing_df = pd .DataFrame (
107+ {"Missing Values" : missing , "Percent" : missing_percent }
108+ )
109+ print (missing_df [missing_df ["Missing Values" ] > 0 ])
109110
110111
def plot_correlations(df):
    """Show an annotated heatmap of correlations between numeric columns.

    Only int64 and float64 columns of ``df`` enter the correlation matrix.

    Args:
        df: DataFrame whose numeric columns are correlated.

    Returns:
        None. The heatmap is displayed with ``plt.show()``.
    """
    # dtype and colormap names must be exact; stray whitespace makes them
    # unrecognised.
    corr = df.select_dtypes(include=["int64", "float64"]).corr()
    plt.figure(figsize=(12, 8))
    sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", square=True)
    plt.title("Correlation Matrix")
    plt.show()
118119
@@ -123,9 +124,9 @@ def cramers_v(confusion_matrix):
123124 n = confusion_matrix .sum ().sum ()
124125 phi2 = chi2 / n
125126 r , k = confusion_matrix .shape
126- phi2_corr = max (0 , phi2 - ((k - 1 )* (r - 1 )) / (n - 1 ))
127- r_corr = r - ((r - 1 )** 2 ) / (n - 1 )
128- k_corr = k - ((k - 1 )** 2 ) / (n - 1 )
127+ phi2_corr = max (0 , phi2 - ((k - 1 ) * (r - 1 )) / (n - 1 ))
128+ r_corr = r - ((r - 1 ) ** 2 ) / (n - 1 )
129+ k_corr = k - ((k - 1 ) ** 2 ) / (n - 1 )
129130 return np .sqrt (phi2_corr / min ((k_corr - 1 ), (r_corr - 1 )))
130131
131132
@@ -134,7 +135,7 @@ def cramers_v_matrix(df, cat_cols):
134135 matrix = pd .DataFrame (
135136 np .zeros ((len (cat_cols ), len (cat_cols ))),
136137 index = cat_cols ,
137- columns = cat_cols
138+ columns = cat_cols ,
138139 )
139140 for col1 in cat_cols :
140141 for col2 in cat_cols :
@@ -147,22 +148,22 @@ def cramers_v_matrix(df, cat_cols):
147148
148149
def plot_cramers_v_heatmap(
    df, categorical_features, figsize=(6, 4), cmap="YlOrBr"
):
    """Plot Cramér's V heatmap for categorical columns.

    Args:
        df: DataFrame containing the categorical columns.
        categorical_features: Column labels passed to ``cramers_v_matrix``.
        figsize: Figure size forwarded to ``plt.figure``.
        cmap: Colormap name forwarded to ``sns.heatmap``.

    Returns:
        None. The heatmap is displayed with ``plt.show()``.
    """
    cramers_matrix = cramers_v_matrix(df, categorical_features)
    plt.figure(figsize=figsize)
    # Same two-decimal annotation format as the numeric correlation heatmap.
    sns.heatmap(cramers_matrix, annot=True, cmap=cmap, fmt=".2f")
    plt.title("Cramér's V Correlation Between Categorical Features")
    plt.show()
160161
161162
162163def detect_outliers (df , num_cols = None ):
163164 """Boxplot for numeric outlier detection"""
164165 if num_cols is None :
165- num_cols = df .select_dtypes (include = [' int64' , ' float64' ]).columns
166+ num_cols = df .select_dtypes (include = [" int64" , " float64" ]).columns
166167
167168 for col in num_cols :
168169 plt .figure (figsize = (8 , 4 ))
0 commit comments