import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder,
    RobustScaler,
    StandardScaler,
)

def process_data(file_path):
    """Load raw transaction data, clean it, and add engineered features.

    Parameters
    ----------
    file_path : str
        Path to the Excel workbook to load (passed to ``pd.read_excel``).

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with per-customer aggregate columns and
        transaction date-part columns appended. The fitted
        ``ColumnTransformer`` output is NOT returned (see NOTE below).
    """
    # Load data
    data = pd.read_excel(file_path)

    # Remove rows with missing values.
    # NOTE(review): dropping NaNs here makes the SimpleImputer steps in the
    # pipelines below no-ops on this data — confirm which cleaning strategy
    # is actually intended.
    data.dropna(inplace=True)

    # Convert TransactionStartTime to datetime
    data['TransactionStartTime'] = pd.to_datetime(data['TransactionStartTime'])

    # Define feature types
    numerical_features = ['Amount', 'Value']
    # NOTE(review): 'FraudResult' looks like a label; one-hot encoding it as
    # a feature would leak the target — confirm before training on this.
    categorical_features = [
        'ProviderId', 'ProductId', 'ProductCategory',
        'ChannelId', 'PricingStrategy', 'FraudResult'
    ]

    # Numerical pipeline: mean-impute, then scale robustly to outliers.
    numerical_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', RobustScaler())
    ])

    # Categorical pipeline: impute, then one-hot encode.
    # NOTE(review): these two steps fell inside an omitted hunk of the
    # reviewed diff and were reconstructed from the surrounding code —
    # confirm the imputer strategy and encoder options against the original.
    categorical_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine both pipelines into a single column transformer.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_pipeline, numerical_features),
            ('cat', categorical_pipeline, categorical_features)
        ]
    )

    # Fit and transform the data.
    # NOTE(review): the transformed matrix is discarded and the raw frame is
    # returned below — either return this result as well or drop this call;
    # as written it is wasted work kept only to preserve existing behavior.
    _ = preprocessor.fit_transform(data)

    # Create per-customer aggregate features. The GroupBy is built once and
    # reused; appending columns afterwards does not change the row grouping.
    by_customer = data.groupby('CustomerId')
    data['Net_Total_Transaction_Amount'] = by_customer['Amount'].transform('sum')
    data['Gross_Transaction_Amount'] = by_customer['Value'].transform('sum')
    data['Average_Transaction_Amount'] = by_customer['Amount'].transform('mean')
    data['Transaction_Count'] = by_customer['TransactionId'].transform('count')
    data['Std_Transaction_Amount'] = by_customer['Amount'].transform('std')
    data['Last_Transaction_Date'] = (
        by_customer['TransactionStartTime'].transform('max')
    )

    # Days since each customer's last transaction, measured against the most
    # recent transaction anywhere in the dataset.
    data['Recency_in_person'] = (
        data['TransactionStartTime'].max() - data['Last_Transaction_Date']
    ).dt.days

    # Extract date-part features from the transaction timestamp.
    data['Transaction_Hour'] = data['TransactionStartTime'].dt.hour
    data['Transaction_Day'] = data['TransactionStartTime'].dt.day
    data['Transaction_Month'] = data['TransactionStartTime'].dt.month
    data['Transaction_Year'] = data['TransactionStartTime'].dt.year

    return data

if __name__ == "__main__":
    # NOTE(review): this input path points at the 'processed' *directory*,
    # not an .xlsx workbook — pd.read_excel will fail on it. Append the
    # actual raw-data filename here (TODO: confirm with the data layout).
    input_path = (
        r'C:\Users\ABC\Desktop\10Acadamy\Week 5\Credit-Risk-'
        r'Probability-Model\data\processed'
    )
    # Destination for the feature-engineered frame.
    output_path = (
        r'C:\Users\ABC\Desktop\10Acadamy\Week 5\Credit-Risk-'
        r'Probability-Model\data\processed\processed_data.xlsx'
    )

    processed_data = process_data(input_path)
    processed_data.to_excel(output_path, index=False)