import sys
import os
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from networksecurity.constants.Training_pipeline import TARGET_COLUMN
from networksecurity.entity.artifact_entity import (
    DataTransformationArtifact,
    DataValidationArtifact,
)

from networksecurity.entity.config_entity import DataTransformationConfig
from networksecurity.exception.exception import NetworkSecurityException
from networksecurity.logging.logger import logging
from networksecurity.utils.main_utils import save_numpy_array_data, save_object

class DataTransformation:
    def __init__(self, data_validation_artifact: DataValidationArtifact,
                 data_transformation_config: DataTransformationConfig):
        try:
            self.data_validation_artifact = data_validation_artifact
            self.data_transformation_config = data_transformation_config
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    @staticmethod
    def read_data(file_path: str) -> pd.DataFrame:
        try:
            return pd.read_csv(file_path)
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def get_data_transformer_object(self) -> ColumnTransformer:
        """
        Creates a preprocessing pipeline that combines numerical and
        categorical transformations.

        Returns:
            ColumnTransformer combining the numeric and categorical pipelines
        """
        logging.info(
            "Entered get_data_transformer_object method of DataTransformation class"
        )
        try:
            # Read sample data to determine column types
            sample_df = self.read_data(self.data_validation_artifact.valid_train_file_path)
            sample_df = sample_df.drop(columns=[TARGET_COLUMN])

            # Identify numeric and categorical columns
            numeric_features = sample_df.select_dtypes(include=['int64', 'float64']).columns
            categorical_features = sample_df.select_dtypes(include=['object']).columns

            logging.info(f"Numerical columns: {list(numeric_features)}")
            logging.info(f"Categorical columns: {list(categorical_features)}")

            # Numeric pipeline: mean imputation followed by standard scaling
            numeric_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='mean')),
                ('scaler', StandardScaler())
            ])

            # Categorical pipeline: constant-fill imputation, then one-hot
            # encoding (the sparse_output parameter requires scikit-learn >= 1.2)
            categorical_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
            ])

            # Combine transformers; any remaining columns pass through unchanged
            preprocessor = ColumnTransformer(
                transformers=[
                    ('num', numeric_transformer, numeric_features),
                    ('cat', categorical_transformer, categorical_features)
                ],
                remainder='passthrough'
            )

            logging.info("Created preprocessing pipeline")
            return preprocessor

        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def initiate_data_transformation(self) -> DataTransformationArtifact:
        """
        Reads the validated train/test data, applies the preprocessing
        pipeline, and saves the transformed arrays and fitted preprocessor.
        """
        logging.info("Starting data transformation")
        try:
            train_df = self.read_data(self.data_validation_artifact.valid_train_file_path)
            test_df = self.read_data(self.data_validation_artifact.valid_test_file_path)

            # Training dataframe: split features from target, map -1 labels to 0
            input_feature_train_df = train_df.drop(columns=[TARGET_COLUMN])
            target_feature_train_df = train_df[TARGET_COLUMN].replace(-1, 0)

            # Testing dataframe: same split and label mapping
            input_feature_test_df = test_df.drop(columns=[TARGET_COLUMN])
            target_feature_test_df = test_df[TARGET_COLUMN].replace(-1, 0)

            preprocessor = self.get_data_transformer_object()
            logging.info("Got preprocessor object")

            logging.info("Fitting preprocessor on training data")
            transformed_input_train_feature = preprocessor.fit_transform(input_feature_train_df)

            logging.info("Transforming test data")
            transformed_input_test_feature = preprocessor.transform(input_feature_test_df)

            logging.info("Creating final numpy arrays")
            train_arr = np.c_[transformed_input_train_feature, np.array(target_feature_train_df)]
            test_arr = np.c_[transformed_input_test_feature, np.array(target_feature_test_df)]

            logging.info("Saving transformed data and preprocessor object")
            save_numpy_array_data(self.data_transformation_config.transformed_train_file_path, array=train_arr)
            save_numpy_array_data(self.data_transformation_config.transformed_test_file_path, array=test_arr)
            save_object(self.data_transformation_config.transformed_object_file_path, preprocessor)

            # Preparing artifact
            data_transformation_artifact = DataTransformationArtifact(
                transformed_object_file_path=self.data_transformation_config.transformed_object_file_path,
                transformed_train_file_path=self.data_transformation_config.transformed_train_file_path,
                transformed_test_file_path=self.data_transformation_config.transformed_test_file_path,
            )

            logging.info("Data transformation completed successfully")
            return data_transformation_artifact

        except Exception as e:
            logging.error(f"Error in data transformation: {e}")
            raise NetworkSecurityException(e, sys) from e
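

# A minimal usage sketch, assuming DataValidationArtifact exposes the
# valid_train_file_path / valid_test_file_path fields used above and that
# DataTransformationConfig has a no-argument constructor. The exact
# constructor signatures and file paths below are assumptions for
# illustration, not part of this module.
if __name__ == "__main__":
    validation_artifact = DataValidationArtifact(
        valid_train_file_path="Artifacts/data_validation/validated/train.csv",  # hypothetical path
        valid_test_file_path="Artifacts/data_validation/validated/test.csv",    # hypothetical path
    )
    transformation_config = DataTransformationConfig()  # assumed default constructor

    data_transformation = DataTransformation(
        data_validation_artifact=validation_artifact,
        data_transformation_config=transformation_config,
    )
    artifact = data_transformation.initiate_data_transformation()
    print(artifact)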