
Commit 17e9ca0

Add FastAPI app with Docker, CI/CD, and linter
1 parent a229e9c commit 17e9ca0

6 files changed, +223 -134 lines changed
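
Only three of the six changed files are rendered below; the FastAPI app, Dockerfile, and CI workflow named in the commit message are not shown. As a rough, hypothetical sketch of how such an app might serve the model registered in register.py (the file name, route, and input schema are assumptions, not taken from this commit):

# app.py: hypothetical sketch, not a file from this commit.
# Assumes the model registered by register.py is reachable as
# models:/best_model/Staging on the configured tracking URI.
import mlflow.pyfunc
import pandas as pd
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI(title="Credit Risk Probability Model")

# Load the Staging version of the registered model once at startup.
model = mlflow.pyfunc.load_model("models:/best_model/Staging")


class Transaction(BaseModel):
    # Assumed input fields; the real schema would mirror the features
    # produced by src/PreProcessing.py.
    Amount: float
    Value: float


@app.post("/predict")
def predict(tx: Transaction):
    # pyfunc models expect a DataFrame, so wrap the single record.
    features = pd.DataFrame([tx.model_dump()])
    prediction = model.predict(features)
    return {"risk_probability": float(prediction[0])}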

register.py

Lines changed: 6 additions & 3 deletions

@@ -3,8 +3,12 @@
 from pathlib import Path

 # Set tracking URI
-mlruns_path = Path(r"C:\Users\ABC\Desktop\10Acadamy\Week 5\Credit-Risk-Probability-Model\mlruns").absolute()
+mlruns_path = Path(
+    r"C:\Users\ABC\Desktop\10Acadamy\Week 5\Credit-Risk-"
+    r"Probability-Model\mlruns"
+).absolute()
 mlflow.set_tracking_uri(f"file:///{mlruns_path.as_posix()}")
+
 # Define model variables
 model_name = "best_model"
 run_id = "1bed56713a694528a9571bb00576059c"
@@ -16,7 +20,7 @@
 # Register the model (will raise exception if already exists)
 try:
     client.create_registered_model(model_name)
-except:
+except Exception:
     pass  # model already exists

 # Create new version
@@ -34,4 +38,3 @@
 )

 print(f"✅ Re-registered as models:/{model_name}/Staging")
-
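
The "# Create new version" block sits between the hunks above and is not shown in the diff. A hedged sketch of what it presumably does, using only documented MlflowClient calls (the artifact source path is an assumption; model_name and run_id are the variables defined earlier in the script):

from mlflow.tracking import MlflowClient

client = MlflowClient()

# Create a new version of the registered model from the run's artifacts.
# The "runs:/.../model" source path is an assumed artifact layout.
version = client.create_model_version(
    name=model_name,
    source=f"runs:/{run_id}/model",
    run_id=run_id,
)

# Move the new version into the Staging stage referenced by the print above.
client.transition_model_version_stage(
    name=model_name,
    version=version.version,
    stage="Staging",
)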

src/PreProcessing.py

Lines changed: 62 additions & 23 deletions

@@ -1,25 +1,37 @@
 import pandas as pd
 from sklearn.pipeline import Pipeline
-from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder
+from sklearn.preprocessing import (
+    StandardScaler,
+    RobustScaler,
+    OneHotEncoder
+)
 from sklearn.compose import ColumnTransformer
 from sklearn.impute import SimpleImputer

+
 def process_data(file_path):
     # Load data
     data = pd.read_excel(file_path)
-    # Remove rows with missing values
+
+    # Remove rows with missing values
     data.dropna(inplace=True)
+
     # Convert TransactionStartTime to datetime
-    data['TransactionStartTime'] = pd.to_datetime(data['TransactionStartTime'])
-    # Define feature types
-    numerical_features = ['Amount','Value']
-    categorical_features = ['ProviderId', 'ProductId', 'ProductCategory',
-                            'ChannelId', 'PricingStrategy', 'FraudResult']
+    data['TransactionStartTime'] = pd.to_datetime(
+        data['TransactionStartTime']
+    )
+
+    # Define feature types
+    numerical_features = ['Amount', 'Value']
+    categorical_features = [
+        'ProviderId', 'ProductId', 'ProductCategory',
+        'ChannelId', 'PricingStrategy', 'FraudResult'
+    ]

     # Define the numerical pipeline
     numerical_pipeline = Pipeline(steps=[
         ('imputer', SimpleImputer(strategy='mean')),
-        ('scaler', RobustScaler())
+        ('scaler', RobustScaler())
     ])

     # Define the categorical pipeline
@@ -33,31 +45,58 @@ def process_data(file_path):
         transformers=[
             ('num', numerical_pipeline, numerical_features),
             ('cat', categorical_pipeline, categorical_features)
-        ])
+        ]
+    )

-    # Fit and transform the data
-    X_processed = preprocessor.fit_transform(data)
+    # Fit and transform the data (not returned in this function)
+    _ = preprocessor.fit_transform(data)

     # Create aggregate features
-    # For net transaction amount
-    data['Net_Total_Transaction_Amount'] = data.groupby('CustomerId')['Amount'].transform('sum')
-    # For gross transaction amount
-    data['Gross_Transaction_Amount'] = data.groupby('CustomerId')['Value'].transform('sum')
-    data['Average_Transaction_Amount'] = data.groupby('CustomerId')['Amount'].transform('mean')
-    data['Transaction_Count'] = data.groupby('CustomerId')['TransactionId'].transform('count')
-    data['Std_Transaction_Amount'] = data.groupby('CustomerId')['Amount'].transform('std')
-    data['Last_Transaction_Date'] = data.groupby('CustomerId')['TransactionStartTime'].transform('max')
-    data['Recency_in_person'] = (data['TransactionStartTime'].max() - data['Last_Transaction_Date']).dt.days
+    data['Net_Total_Transaction_Amount'] = data.groupby(
+        'CustomerId'
+    )['Amount'].transform('sum')
+
+    data['Gross_Transaction_Amount'] = data.groupby(
+        'CustomerId'
+    )['Value'].transform('sum')
+
+    data['Average_Transaction_Amount'] = data.groupby(
+        'CustomerId'
+    )['Amount'].transform('mean')
+
+    data['Transaction_Count'] = data.groupby(
+        'CustomerId'
+    )['TransactionId'].transform('count')
+
+    data['Std_Transaction_Amount'] = data.groupby(
+        'CustomerId'
+    )['Amount'].transform('std')
+
+    data['Last_Transaction_Date'] = data.groupby(
+        'CustomerId'
+    )['TransactionStartTime'].transform('max')
+
+    data['Recency_in_person'] = (
+        data['TransactionStartTime'].max() - data['Last_Transaction_Date']
+    ).dt.days

     # Extract date features
-    data['TransactionStartTime'] = pd.to_datetime(data['TransactionStartTime'])
     data['Transaction_Hour'] = data['TransactionStartTime'].dt.hour
     data['Transaction_Day'] = data['TransactionStartTime'].dt.day
     data['Transaction_Month'] = data['TransactionStartTime'].dt.month
     data['Transaction_Year'] = data['TransactionStartTime'].dt.year

     return data

+
 if __name__ == "__main__":
-    processed_data = process_data(r'C:\Users\ABC\Desktop\10Acadamy\Week 5\Credit-Risk-Probability-Model\data\processed')
-    processed_data.to_excel(r'C:\Users\ABC\Desktop\10Acadamy\Week 5\Credit-Risk-Probability-Model\data\processed\processed_data.xlsx', index=False)
+    processed_data = process_data(
+        r'C:\Users\ABC\Desktop\10Acadamy\Week 5\Credit-Risk-'
+        r'Probability-Model\data\processed'
+    )
+
+    processed_data.to_excel(
+        r'C:\Users\ABC\Desktop\10Acadamy\Week 5\Credit-Risk-'
+        r'Probability-Model\data\processed\processed_data.xlsx',
+        index=False
+    )
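
The aggregate features above all rely on groupby().transform(), which broadcasts a per-customer statistic back onto every transaction row. A minimal, self-contained illustration with made-up values:

# Toy demonstration of the groupby().transform() pattern used above;
# the values are invented for illustration.
import pandas as pd

df = pd.DataFrame({
    'CustomerId': ['C1', 'C1', 'C2'],
    'Amount': [100.0, 50.0, 80.0],
})

# transform('sum') returns a Series aligned with the original frame
# (unlike agg(), which collapses to one row per group).
df['Net_Total_Transaction_Amount'] = (
    df.groupby('CustomerId')['Amount'].transform('sum')
)
print(df)
#   CustomerId  Amount  Net_Total_Transaction_Amount
# 0         C1   100.0                         150.0
# 1         C1    50.0                         150.0
# 2         C2    80.0                          80.0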

src/RFMmetrics.py

Lines changed: 26 additions & 16 deletions

@@ -2,8 +2,6 @@
 from sklearn.preprocessing import StandardScaler
 from sklearn.cluster import KMeans

-import pandas as pd
-import pandas as pd

 def load_data(filepath):
     """
@@ -29,12 +27,15 @@ def load_data(filepath):
         print(f"❌ An error occurred while loading the data: {e}")
         return None

-def get_snapshot_date(data, time_col='TransactionStartTime', last_col='Last_Transaction_Date'):
+
+def get_snapshot_date(
+    data, time_col='TransactionStartTime', last_col='Last_Transaction_Date'
+):
     """
-    Converts date columns to datetime and returns the snapshot date (latest transaction time).
+    Converts date columns to datetime and returns the snapshot date.

     Parameters:
-        data (pd.DataFrame): The DataFrame containing transaction data.
+        data (pd.DataFrame): DataFrame containing transaction data.
         time_col (str): Name of the main transaction time column.
         last_col (str): Name of the last transaction date column.
@@ -47,45 +48,54 @@ def get_snapshot_date(data, time_col='TransactionStartTime', last_col='Last_Tran
     print("Snapshot Date:", snapshot_date)
     return snapshot_date

+
 def calculate_rfm(data, snapshot_date):
     rfm_df = data.groupby('CustomerId').agg({
         'TransactionStartTime': lambda x: (snapshot_date - x.max()).days,
         'TransactionId': 'count',
-        'Value': 'sum'
+        'Value': 'sum'
     }).rename(columns={
         'TransactionStartTime': 'Recency',
         'TransactionId': 'Frequency',
         'Value': 'Monetary'
     }).reset_index()
     return rfm_df

-# Step 2: Pre-process RFM Data
+
 def scale_rfm(rfm_df):
     scaler = StandardScaler()
-    rfm_scaled = scaler.fit_transform(rfm_df[['Recency', 'Frequency', 'Monetary']])
+    rfm_scaled = scaler.fit_transform(
+        rfm_df[['Recency', 'Frequency', 'Monetary']]
+    )
     return rfm_scaled

-# Step 3: Cluster Customers
+
 def cluster_customers(rfm_scaled, n_clusters=3, random_state=42):
     kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
     clusters = kmeans.fit_predict(rfm_scaled)
     return clusters

-# Step 4: Define and Assign the "High-Risk" Label
+
 def assign_high_risk_label(rfm_df, high_risk_cluster_number):
-    rfm_df['is_high_risk'] = (rfm_df['Cluster'] == high_risk_cluster_number).astype(int)
+    rfm_df['is_high_risk'] = (
+        rfm_df['Cluster'] == high_risk_cluster_number
+    ).astype(int)
     return rfm_df

-# Step 5: Integrate the Target Variable
+
 def integrate_target_variable(main_data, rfm_df):
-    merged_data = main_data.merge(rfm_df, on='CustomerId', how='left')  # Include all RFM columns
+    merged_data = main_data.merge(
+        rfm_df, on='CustomerId', how='left'
+    )
     return merged_data

-# Main Function to Execute Task 4
-def main_task_4(data, snapshot_date, n_clusters=3, high_risk_cluster_number=2):
+
+def main_task_4(
+    data, snapshot_date, n_clusters=3, high_risk_cluster_number=2
+):
     rfm_df = calculate_rfm(data, snapshot_date)
     rfm_scaled = scale_rfm(rfm_df)
     rfm_df['Cluster'] = cluster_customers(rfm_scaled, n_clusters)
     rfm_df = assign_high_risk_label(rfm_df, high_risk_cluster_number)
     final_data = integrate_target_variable(data, rfm_df)
-    return final_data
+    return final_data
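
A quick smoke test of main_task_4 on synthetic data (the import path and all values are assumptions for illustration; which cluster counts as high-risk depends on the fitted KMeans model, not necessarily cluster 2):

# Hypothetical usage sketch; assumes src/ is importable as a package.
import pandas as pd
from src.RFMmetrics import main_task_4

# Tiny synthetic transaction log: four customers, six transactions.
data = pd.DataFrame({
    'CustomerId': ['C1', 'C1', 'C2', 'C3', 'C4', 'C4'],
    'TransactionId': ['T1', 'T2', 'T3', 'T4', 'T5', 'T6'],
    'TransactionStartTime': pd.to_datetime([
        '2025-01-01', '2025-01-15', '2025-02-01',
        '2025-02-10', '2025-03-01', '2025-03-02',
    ]),
    'Value': [100.0, 250.0, 80.0, 40.0, 500.0, 20.0],
})

# Snapshot date = latest transaction time, mirroring get_snapshot_date().
snapshot_date = data['TransactionStartTime'].max()

labeled = main_task_4(
    data, snapshot_date, n_clusters=3, high_risk_cluster_number=2
)

# Which cluster is "high risk" depends on the fit; inspect per-cluster
# RFM means (high Recency, low Frequency and Monetary) before choosing.
print(labeled.groupby('Cluster')[['Recency', 'Frequency', 'Monetary']].mean())
print(labeled[['CustomerId', 'Cluster', 'is_high_risk']].drop_duplicates())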
