import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder,
    RobustScaler,
    StandardScaler,
)

def process_data(file_path):
    """Load raw transaction data, clean it, and add engineered features.

    Parameters
    ----------
    file_path : str
        Path to the Excel workbook to load (passed to ``pd.read_excel``).

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with per-customer aggregate columns and
        transaction date-part columns appended. The fitted
        ``ColumnTransformer`` output is NOT returned (see NOTE below).
    """
    # Load data
    data = pd.read_excel(file_path)

    # Remove rows with missing values.
    # NOTE(review): dropping NaNs here makes the SimpleImputer steps in the
    # pipelines below no-ops on this data — confirm which cleaning strategy
    # is actually intended.
    data.dropna(inplace=True)

    # Convert TransactionStartTime to datetime
    data['TransactionStartTime'] = pd.to_datetime(data['TransactionStartTime'])

    # Define feature types
    numerical_features = ['Amount', 'Value']
    # NOTE(review): 'FraudResult' looks like a label; one-hot encoding it as
    # a feature would leak the target — confirm before training on this.
    categorical_features = [
        'ProviderId', 'ProductId', 'ProductCategory',
        'ChannelId', 'PricingStrategy', 'FraudResult'
    ]

    # Numerical pipeline: mean-impute, then scale robustly to outliers.
    numerical_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', RobustScaler())
    ])

    # Categorical pipeline: impute, then one-hot encode.
    # NOTE(review): these two steps fell inside an omitted hunk of the
    # reviewed diff and were reconstructed from the surrounding code —
    # confirm the imputer strategy and encoder options against the original.
    categorical_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine both pipelines into a single column transformer.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_pipeline, numerical_features),
            ('cat', categorical_pipeline, categorical_features)
        ]
    )

    # Fit and transform the data.
    # NOTE(review): the transformed matrix is discarded and the raw frame is
    # returned below — either return this result as well or drop this call;
    # as written it is wasted work kept only to preserve existing behavior.
    _ = preprocessor.fit_transform(data)

    # Create per-customer aggregate features. The GroupBy is built once and
    # reused; appending columns afterwards does not change the row grouping.
    by_customer = data.groupby('CustomerId')
    data['Net_Total_Transaction_Amount'] = by_customer['Amount'].transform('sum')
    data['Gross_Transaction_Amount'] = by_customer['Value'].transform('sum')
    data['Average_Transaction_Amount'] = by_customer['Amount'].transform('mean')
    data['Transaction_Count'] = by_customer['TransactionId'].transform('count')
    data['Std_Transaction_Amount'] = by_customer['Amount'].transform('std')
    data['Last_Transaction_Date'] = (
        by_customer['TransactionStartTime'].transform('max')
    )

    # Days since each customer's last transaction, measured against the most
    # recent transaction anywhere in the dataset.
    data['Recency_in_person'] = (
        data['TransactionStartTime'].max() - data['Last_Transaction_Date']
    ).dt.days

    # Extract date-part features from the transaction timestamp.
    data['Transaction_Hour'] = data['TransactionStartTime'].dt.hour
    data['Transaction_Day'] = data['TransactionStartTime'].dt.day
    data['Transaction_Month'] = data['TransactionStartTime'].dt.month
    data['Transaction_Year'] = data['TransactionStartTime'].dt.year

    return data

if __name__ == "__main__":
    # NOTE(review): this input path points at the 'processed' *directory*,
    # not an .xlsx workbook — pd.read_excel will fail on it. Append the
    # actual raw-data filename here (TODO: confirm with the data layout).
    input_path = (
        r'C:\Users\ABC\Desktop\10Acadamy\Week 5\Credit-Risk-'
        r'Probability-Model\data\processed'
    )
    # Destination for the feature-engineered frame.
    output_path = (
        r'C:\Users\ABC\Desktop\10Acadamy\Week 5\Credit-Risk-'
        r'Probability-Model\data\processed\processed_data.xlsx'
    )

    processed_data = process_data(input_path)
    processed_data.to_excel(output_path, index=False)