Skip to content

Commit 91ac128

Browse files
committed
final change of formatting
1 parent ee63eff commit 91ac128

File tree

3 files changed

+34
-37
lines changed

3 files changed

+34
-37
lines changed

.gitignore

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,10 @@
11

22
#
33
# DVC local storage
4-
.data/dvc_file/
54

65

7-
data\dvc_file\
8-
96
# Ignore actual data files and cache
107
*.csv
11-
data/dvc_file/
128

139
# Byte-compiled / optimized / DLL files
1410
__pycache__/

src/load.py

Lines changed: 33 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -18,20 +18,17 @@ def load_data_csv(path):
1818
def summary_stats(df):
1919
"""Print basic info and stats"""
2020
print(df.info())
21-
print(
22-
"\n--- Describe Numerical ---\n",
23-
df.describe()
24-
)
21+
print("\n--- Describe Numerical ---\n", df.describe())
2522
print("\n--- Describe Categorical ---\n")
26-
print(df.select_dtypes(include='object').describe())
23+
print(df.select_dtypes(include="object").describe())
2724

2825

2926
def logarithmic_numerical_distribution(
3027
df,
3128
columns=None,
3229
):
3330
if columns is None:
34-
columns = ['Amount', 'Value', 'FraudResult']
31+
columns = ["Amount", "Value", "FraudResult"]
3532

3633
"""
3734
Plot log-scale histograms for positive and negative values of specified columns.
@@ -44,49 +41,54 @@ def logarithmic_numerical_distribution(
4441
pos_vals = df[df[column] > 0][column]
4542
if not pos_vals.empty:
4643
plt.hist(pos_vals, bins=50, log=True)
47-
plt.title(f'{column} (positive values, log scale)')
44+
plt.title(f"{column} (positive values, log scale)")
4845
plt.xlabel(column)
49-
plt.ylabel('Frequency (log scale)')
46+
plt.ylabel("Frequency (log scale)")
5047
plt.show()
5148

5249
neg_vals = np.abs(df[df[column] < 0][column])
5350
if not neg_vals.empty:
5451
plt.hist(neg_vals, bins=50, log=True)
55-
plt.title(f'{column} (negative abs values, log scale)')
56-
plt.xlabel(f'Absolute {column}')
57-
plt.ylabel('Frequency (log scale)')
52+
plt.title(f"{column} (negative abs values, log scale)")
53+
plt.xlabel(f"Absolute {column}")
54+
plt.ylabel("Frequency (log scale)")
5855
plt.show()
5956

6057

61-
num_cols = ['CurrencyCode', 'CountryCode', 'PricingStrategy']
58+
num_cols = ["CurrencyCode", "CountryCode", "PricingStrategy"]
6259

6360

6461
def plot_numeric_distributions(df, num_cols=num_cols):
6562
"""Plot histograms for numeric columns"""
6663
if num_cols is None:
67-
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
64+
num_cols = df.select_dtypes(include=["int64", "float64"]).columns
6865

6966
for col in num_cols:
7067
plt.figure(figsize=(8, 4))
71-
plt.hist(df[col].dropna(), bins=30, edgecolor='black')
68+
plt.hist(df[col].dropna(), bins=30, edgecolor="black")
7269
plt.title(f"Distribution of {col}")
7370
plt.xlabel(col)
7471
plt.ylabel("Frequency")
75-
plt.grid(True, linestyle='--', alpha=0.5)
72+
plt.grid(True, linestyle="--", alpha=0.5)
7673
plt.tight_layout()
7774
plt.show()
7875

7976

8077
cat_cols = [
81-
'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductCategory',
82-
'ChannelId', 'PricingStrategy', 'FraudResult'
78+
"CurrencyCode",
79+
"CountryCode",
80+
"ProviderId",
81+
"ProductCategory",
82+
"ChannelId",
83+
"PricingStrategy",
84+
"FraudResult",
8385
]
8486

8587

8688
def plot_categorical_distributions(df, cat_cols=cat_cols, top_k=10):
8789
"""Plot bar plots for categorical features"""
8890
if cat_cols is None:
89-
cat_cols = df.select_dtypes(include='object').columns
91+
cat_cols = df.select_dtypes(include="object").columns
9092

9193
for col in cat_cols:
9294
plt.figure(figsize=(8, 4))
@@ -101,18 +103,17 @@ def check_missing_values(df):
101103
"""Display missing value counts and percentages"""
102104
missing = df.isnull().sum()
103105
missing_percent = (missing / len(df)) * 100
104-
missing_df = pd.DataFrame({
105-
'Missing Values': missing,
106-
'Percent': missing_percent
107-
})
108-
print(missing_df[missing_df['Missing Values'] > 0])
106+
missing_df = pd.DataFrame(
107+
{"Missing Values": missing, "Percent": missing_percent}
108+
)
109+
print(missing_df[missing_df["Missing Values"] > 0])
109110

110111

111112
def plot_correlations(df):
112113
"""Plot correlation heatmap for numeric features"""
113-
corr = df.select_dtypes(include=['int64', 'float64']).corr()
114+
corr = df.select_dtypes(include=["int64", "float64"]).corr()
114115
plt.figure(figsize=(12, 8))
115-
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", square=True)
116+
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", square=True)
116117
plt.title("Correlation Matrix")
117118
plt.show()
118119

@@ -123,9 +124,9 @@ def cramers_v(confusion_matrix):
123124
n = confusion_matrix.sum().sum()
124125
phi2 = chi2 / n
125126
r, k = confusion_matrix.shape
126-
phi2_corr = max(0, phi2 - ((k - 1)*(r - 1)) / (n - 1))
127-
r_corr = r - ((r - 1)**2) / (n - 1)
128-
k_corr = k - ((k - 1)**2) / (n - 1)
127+
phi2_corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
128+
r_corr = r - ((r - 1) ** 2) / (n - 1)
129+
k_corr = k - ((k - 1) ** 2) / (n - 1)
129130
return np.sqrt(phi2_corr / min((k_corr - 1), (r_corr - 1)))
130131

131132

@@ -134,7 +135,7 @@ def cramers_v_matrix(df, cat_cols):
134135
matrix = pd.DataFrame(
135136
np.zeros((len(cat_cols), len(cat_cols))),
136137
index=cat_cols,
137-
columns=cat_cols
138+
columns=cat_cols,
138139
)
139140
for col1 in cat_cols:
140141
for col2 in cat_cols:
@@ -147,22 +148,22 @@ def cramers_v_matrix(df, cat_cols):
147148

148149

149150
def plot_cramers_v_heatmap(
150-
df, categorical_features, figsize=(6, 4), cmap='YlOrBr'
151+
df, categorical_features, figsize=(6, 4), cmap="YlOrBr"
151152
):
152153
"""
153154
Plot Cramér's V heatmap for categorical columns.
154155
"""
155156
cramers_matrix = cramers_v_matrix(df, categorical_features)
156157
plt.figure(figsize=figsize)
157-
sns.heatmap(cramers_matrix, annot=True, cmap=cmap, fmt='.2f')
158+
sns.heatmap(cramers_matrix, annot=True, cmap=cmap, fmt=".2f")
158159
plt.title("Cramér's V Correlation Between Categorical Features")
159160
plt.show()
160161

161162

162163
def detect_outliers(df, num_cols=None):
163164
"""Boxplot for numeric outlier detection"""
164165
if num_cols is None:
165-
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
166+
num_cols = df.select_dtypes(include=["int64", "float64"]).columns
166167

167168
for col in num_cols:
168169
plt.figure(figsize=(8, 4))

src/saveFile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ def convert_tz_aware_to_naive(df):
1919
"""
2020
for col in df.columns:
2121
if pd.api.types.is_datetime64_any_dtype(df[col]):
22-
if hasattr(df[col].dt, 'tz') and df[col].dt.tz is not None:
22+
if hasattr(df[col].dt, "tz") and df[col].dt.tz is not None:
2323
df[col] = df[col].dt.tz_localize(None)
2424
return df
2525

0 commit comments

Comments
 (0)