Skip to content

Commit e584b5b

Browse files
committed
final edit of formatting
1 parent 91ac128 commit e584b5b

File tree

2 files changed

+32
-34
lines changed

2 files changed

+32
-34
lines changed

src/load.py

Lines changed: 23 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -23,16 +23,14 @@ def summary_stats(df):
2323
print(df.select_dtypes(include="object").describe())
2424

2525

26-
def logarithmic_numerical_distribution(
27-
df,
28-
columns=None,
29-
):
26+
def logarithmic_numerical_distribution(df, columns=None):
27+
"""
28+
Plot log-scale histograms for positive and negative values
29+
of specified columns.
30+
"""
3031
if columns is None:
3132
columns = ["Amount", "Value", "FraudResult"]
3233

33-
"""
34-
Plot log-scale histograms for positive and negative values of specified columns.
35-
"""
3634
for column in columns:
3735
if column not in df.columns:
3836
print(f"Column '{column}' not found in DataFrame.")
@@ -55,13 +53,12 @@ def logarithmic_numerical_distribution(
5553
plt.show()
5654

5755

58-
num_cols = ["CurrencyCode", "CountryCode", "PricingStrategy"]
59-
60-
61-
def plot_numeric_distributions(df, num_cols=num_cols):
56+
def plot_numeric_distributions(df, num_cols=None):
6257
"""Plot histograms for numeric columns"""
6358
if num_cols is None:
64-
num_cols = df.select_dtypes(include=["int64", "float64"]).columns
59+
num_cols = df.select_dtypes(
60+
include=["int64", "float64"]
61+
).columns
6562

6663
for col in num_cols:
6764
plt.figure(figsize=(8, 4))
@@ -74,25 +71,16 @@ def plot_numeric_distributions(df, num_cols=num_cols):
7471
plt.show()
7572

7673

77-
cat_cols = [
78-
"CurrencyCode",
79-
"CountryCode",
80-
"ProviderId",
81-
"ProductCategory",
82-
"ChannelId",
83-
"PricingStrategy",
84-
"FraudResult",
85-
]
86-
87-
88-
def plot_categorical_distributions(df, cat_cols=cat_cols, top_k=10):
74+
def plot_categorical_distributions(df, cat_cols=None, top_k=10):
8975
"""Plot bar plots for categorical features"""
9076
if cat_cols is None:
9177
cat_cols = df.select_dtypes(include="object").columns
9278

9379
for col in cat_cols:
9480
plt.figure(figsize=(8, 4))
95-
sns.countplot(data=df, x=col, order=df[col].value_counts().index)
81+
sns.countplot(
82+
data=df, x=col, order=df[col].value_counts().index[:top_k]
83+
)
9684
plt.title(f"Distribution of {col}")
9785
plt.xticks(rotation=45)
9886
plt.tight_layout()
@@ -104,7 +92,10 @@ def check_missing_values(df):
10492
missing = df.isnull().sum()
10593
missing_percent = (missing / len(df)) * 100
10694
missing_df = pd.DataFrame(
107-
{"Missing Values": missing, "Percent": missing_percent}
95+
{
96+
"Missing Values": missing,
97+
"Percent": missing_percent
98+
}
10899
)
109100
print(missing_df[missing_df["Missing Values"] > 0])
110101

@@ -113,7 +104,9 @@ def plot_correlations(df):
113104
"""Plot correlation heatmap for numeric features"""
114105
corr = df.select_dtypes(include=["int64", "float64"]).corr()
115106
plt.figure(figsize=(12, 8))
116-
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", square=True)
107+
sns.heatmap(
108+
corr, annot=True, cmap="coolwarm", fmt=".2f", square=True
109+
)
117110
plt.title("Correlation Matrix")
118111
plt.show()
119112

@@ -163,7 +156,9 @@ def plot_cramers_v_heatmap(
163156
def detect_outliers(df, num_cols=None):
164157
"""Boxplot for numeric outlier detection"""
165158
if num_cols is None:
166-
num_cols = df.select_dtypes(include=["int64", "float64"]).columns
159+
num_cols = df.select_dtypes(
160+
include=["int64", "float64"]
161+
).columns
167162

168163
for col in num_cols:
169164
plt.figure(figsize=(8, 4))

src/saveFile.py

Lines changed: 9 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,15 +1,18 @@
1+
import os
12
import pandas as pd
23

3-
# Define your output path
4-
output_path = (
5-
r"C:\Users\ABC\Desktop\10Acadamy\Week 5\Credit-Risk-Probability-Model"
6-
r"\data\processed"
4+
# Define your output path using os.path.join
5+
output_path = os.path.join(
6+
"C:/Users/ABC/Desktop/10Acadamy/Week 5",
7+
"Credit-Risk-Probability-Model",
8+
"data",
9+
"processed",
710
)
811

912

1013
def convert_tz_aware_to_naive(df):
1114
"""
12-
Convert timezone-aware datetime columns in the DataFrame to timezone-unaware.
15+
Convert timezone-aware datetime to timezone-unaware.
1316
1417
Parameters:
1518
df (pd.DataFrame): The DataFrame to process.
@@ -33,6 +36,6 @@ def save_dataframe_to_csv(df, filename):
3336
filename (str): The name of the CSV file (without extension).
3437
"""
3538
df = convert_tz_aware_to_naive(df)
36-
full_path = f"{output_path}/{filename}.csv"
39+
full_path = os.path.join(output_path, f"{filename}.csv")
3740
df.to_csv(full_path, index=False)
3841
print(f"Data saved to {full_path}")

0 commit comments

Comments
 (0)