@@ -23,16 +23,14 @@ def summary_stats(df):
2323 print (df .select_dtypes (include = "object" ).describe ())
2424
2525
26- def logarithmic_numerical_distribution (
27- df ,
28- columns = None ,
29- ):
26+ def logarithmic_numerical_distribution (df , columns = None ):
27+ """
28+ Plot log-scale histograms for positive and negative values
29+ of specified columns.
30+ """
3031 if columns is None :
3132 columns = ["Amount" , "Value" , "FraudResult" ]
3233
33- """
34- Plot log-scale histograms for positive and negative values of specified columns.
35- """
3634 for column in columns :
3735 if column not in df .columns :
3836 print (f"Column '{ column } ' not found in DataFrame." )
@@ -55,13 +53,12 @@ def logarithmic_numerical_distribution(
5553 plt .show ()
5654
5755
58- num_cols = ["CurrencyCode" , "CountryCode" , "PricingStrategy" ]
59-
60-
61- def plot_numeric_distributions (df , num_cols = num_cols ):
56+ def plot_numeric_distributions (df , num_cols = None ):
6257 """Plot histograms for numeric columns"""
6358 if num_cols is None :
64- num_cols = df .select_dtypes (include = ["int64" , "float64" ]).columns
59+ num_cols = df .select_dtypes (
60+ include = ["int64" , "float64" ]
61+ ).columns
6562
6663 for col in num_cols :
6764 plt .figure (figsize = (8 , 4 ))
@@ -74,25 +71,16 @@ def plot_numeric_distributions(df, num_cols=num_cols):
7471 plt .show ()
7572
7673
77- cat_cols = [
78- "CurrencyCode" ,
79- "CountryCode" ,
80- "ProviderId" ,
81- "ProductCategory" ,
82- "ChannelId" ,
83- "PricingStrategy" ,
84- "FraudResult" ,
85- ]
86-
87-
88- def plot_categorical_distributions (df , cat_cols = cat_cols , top_k = 10 ):
74+ def plot_categorical_distributions (df , cat_cols = None , top_k = 10 ):
8975 """Plot bar plots for categorical features"""
9076 if cat_cols is None :
9177 cat_cols = df .select_dtypes (include = "object" ).columns
9278
9379 for col in cat_cols :
9480 plt .figure (figsize = (8 , 4 ))
95- sns .countplot (data = df , x = col , order = df [col ].value_counts ().index )
81+ sns .countplot (
82+ data = df , x = col , order = df [col ].value_counts ().index [:top_k ]
83+ )
9684 plt .title (f"Distribution of { col } " )
9785 plt .xticks (rotation = 45 )
9886 plt .tight_layout ()
@@ -104,7 +92,10 @@ def check_missing_values(df):
10492 missing = df .isnull ().sum ()
10593 missing_percent = (missing / len (df )) * 100
10694 missing_df = pd .DataFrame (
107- {"Missing Values" : missing , "Percent" : missing_percent }
95+ {
96+ "Missing Values" : missing ,
97+ "Percent" : missing_percent
98+ }
10899 )
109100 print (missing_df [missing_df ["Missing Values" ] > 0 ])
110101
@@ -113,7 +104,9 @@ def plot_correlations(df):
113104 """Plot correlation heatmap for numeric features"""
114105 corr = df .select_dtypes (include = ["int64" , "float64" ]).corr ()
115106 plt .figure (figsize = (12 , 8 ))
116- sns .heatmap (corr , annot = True , cmap = "coolwarm" , fmt = ".2f" , square = True )
107+ sns .heatmap (
108+ corr , annot = True , cmap = "coolwarm" , fmt = ".2f" , square = True
109+ )
117110 plt .title ("Correlation Matrix" )
118111 plt .show ()
119112
@@ -163,7 +156,9 @@ def plot_cramers_v_heatmap(
163156def detect_outliers (df , num_cols = None ):
164157 """Boxplot for numeric outlier detection"""
165158 if num_cols is None :
166- num_cols = df .select_dtypes (include = ["int64" , "float64" ]).columns
159+ num_cols = df .select_dtypes (
160+ include = ["int64" , "float64" ]
161+ ).columns
167162
168163 for col in num_cols :
169164 plt .figure (figsize = (8 , 4 ))
0 commit comments