Commit 754f5f9

data transformation completed
1 parent 745b91b commit 754f5f9

File tree

16 files changed: +225 -8 lines changed


.gitignore

Lines changed: 3 additions & 1 deletion
@@ -1,3 +1,5 @@
 netsec_env
 .env
-.DS_Store
+.DS_Store
+.artifact
+.log

main.py

Lines changed: 8 additions & 1 deletion
@@ -1,8 +1,9 @@
 from networksecurity.components.data_ingestion import DataIngestion
 from networksecurity.components.data_validation import DataValidation
+from networksecurity.components.data_transformation import DataTransformation
 from networksecurity.exception.exception import NetworkSecurityException
 from networksecurity.logging.logger import logging
-from networksecurity.entity.config_entity import DataIngestionConfig, DataValidationConfig
+from networksecurity.entity.config_entity import DataIngestionConfig, DataValidationConfig, DataTransformationConfig
 from networksecurity.entity.config_entity import Training_pipeline_config
 import sys
 
@@ -21,6 +22,12 @@
         data_validation_artifact = data_validation.initiate_data_validation()
         logging.info("data validation completed")
         print(data_validation_artifact)
+        data_transformation_config = DataTransformationConfig(training_pipeline_config)
+        data_transformation = DataTransformation(data_validation_artifact, data_transformation_config)
+        logging.info("starting data transformation")
+        data_transformation_artifact = data_transformation.initiate_data_transformation()
+        logging.info("data transformation completed")
+        print(data_transformation_artifact)
 
 
     except Exception as e:
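
The data_transformation_artifact printed above is defined in networksecurity/entity/artifact_entity.py, which this commit does not touch. Judging by the keyword arguments the new component passes when constructing it (see the file below), a minimal sketch of that entity might look like this hypothetical dataclass:

# Hypothetical sketch only: the real DataTransformationArtifact lives in
# networksecurity/entity/artifact_entity.py and is not shown in this diff.
from dataclasses import dataclass

@dataclass
class DataTransformationArtifact:
    transformed_object_file_path: str
    transformed_train_file_path: str
    transformed_test_file_path: str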

networksecurity/components/data_transformation.py

Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,133 @@
import sys
import os
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from networksecurity.constants.Training_pipeline import TARGET_COLUMN
from networksecurity.entity.artifact_entity import (
    DataTransformationArtifact,
    DataValidationArtifact
)

from networksecurity.entity.config_entity import DataTransformationConfig
from networksecurity.exception.exception import NetworkSecurityException
from networksecurity.logging.logger import logging
from networksecurity.utils.main_utils import save_numpy_array_data, save_object


class DataTransformation:
    def __init__(self, data_validation_artifact: DataValidationArtifact,
                 data_transformation_config: DataTransformationConfig):
        try:
            self.data_validation_artifact = data_validation_artifact
            self.data_transformation_config = data_transformation_config
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    @staticmethod
    def read_data(file_path) -> pd.DataFrame:
        try:
            return pd.read_csv(file_path)
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def get_data_transformer_object(self) -> Pipeline:
        """
        Creates a preprocessing pipeline that combines numerical and categorical transformations.

        Returns:
            Pipeline object
        """
        logging.info(
            "Entered get_data_transformer_object method of DataTransformation class"
        )
        try:
            # Read sample data to determine column types
            sample_df = self.read_data(self.data_validation_artifact.valid_train_file_path)
            sample_df = sample_df.drop(columns=[TARGET_COLUMN], axis=1)

            # Identify numeric and categorical columns
            numeric_features = sample_df.select_dtypes(include=['int64', 'float64']).columns
            categorical_features = sample_df.select_dtypes(include=['object']).columns

            logging.info(f"Numerical columns: {numeric_features}")
            logging.info(f"Categorical columns: {categorical_features}")

            # Numeric pipeline: mean imputation followed by standard scaling
            numeric_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='mean')),
                ('scaler', StandardScaler())
            ])

            # Categorical pipeline: constant imputation followed by one-hot encoding
            categorical_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
            ])

            # Combine transformers
            preprocessor = ColumnTransformer(
                transformers=[
                    ('num', numeric_transformer, numeric_features),
                    ('cat', categorical_transformer, categorical_features)
                ],
                remainder='passthrough'
            )

            logging.info("Created preprocessing pipeline")
            return preprocessor

        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def initiate_data_transformation(self) -> DataTransformationArtifact:
        logging.info("Initiating data transformation")
        try:
            train_df = pd.read_csv(self.data_validation_artifact.valid_train_file_path)
            test_df = pd.read_csv(self.data_validation_artifact.valid_test_file_path)

            # Training dataframe: split features and target, map -1 labels to 0
            input_feature_train_df = train_df.drop(columns=[TARGET_COLUMN], axis=1)
            target_feature_train_df = train_df[TARGET_COLUMN]
            target_feature_train_df = target_feature_train_df.replace(-1, 0)

            # Testing dataframe: split features and target, map -1 labels to 0
            input_feature_test_df = test_df.drop(columns=[TARGET_COLUMN], axis=1)
            target_feature_test_df = test_df[TARGET_COLUMN]
            target_feature_test_df = target_feature_test_df.replace(-1, 0)

            logging.info("Getting preprocessor object")
            preprocessor = self.get_data_transformer_object()

            logging.info("Fitting preprocessor on training data")
            transformed_input_train_feature = preprocessor.fit_transform(input_feature_train_df)

            logging.info("Transforming test data")
            transformed_input_test_feature = preprocessor.transform(input_feature_test_df)

            logging.info("Creating final numpy arrays")
            train_arr = np.c_[transformed_input_train_feature, np.array(target_feature_train_df)]
            test_arr = np.c_[transformed_input_test_feature, np.array(target_feature_test_df)]

            logging.info("Saving transformed data and preprocessor object")
            save_numpy_array_data(self.data_transformation_config.transformed_train_file_path, array=train_arr)
            save_numpy_array_data(self.data_transformation_config.transformed_test_file_path, array=test_arr)
            save_object(self.data_transformation_config.transformed_object_file_path, preprocessor)

            # Preparing artifact
            data_transformation_artifact = DataTransformationArtifact(
                transformed_object_file_path=self.data_transformation_config.transformed_object_file_path,
                transformed_train_file_path=self.data_transformation_config.transformed_train_file_path,
                transformed_test_file_path=self.data_transformation_config.transformed_test_file_path,
            )

            logging.info("Data transformation completed successfully")
            return data_transformation_artifact

        except Exception as e:
            logging.error("Error in data transformation")
            raise NetworkSecurityException(e, sys)
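
The component writes three artifacts: two numpy arrays with the target appended as the last column, and the fitted preprocessor. A minimal sketch of how a later model-trainer stage might load them, assuming save_numpy_array_data wraps np.save and save_object wraps pickle.dump (the actual helpers live in networksecurity.utils.main_utils and are not part of this commit):

# Sketch under the stated assumptions; paths come from the artifact above.
import pickle
import numpy as np

def load_transformed_data(artifact):
    # Each saved array holds features plus the target as its last column (np.c_ above)
    train_arr = np.load(artifact.transformed_train_file_path, allow_pickle=True)
    test_arr = np.load(artifact.transformed_test_file_path, allow_pickle=True)

    x_train, y_train = train_arr[:, :-1], train_arr[:, -1]
    x_test, y_test = test_arr[:, :-1], test_arr[:, -1]

    # The fitted ColumnTransformer can be reused on raw data at inference time
    with open(artifact.transformed_object_file_path, "rb") as f:
        preprocessor = pickle.load(f)

    return x_train, y_train, x_test, y_test, preprocessor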

networksecurity/constants/Training_pipeline/__init__.py

Lines changed: 12 additions & 1 deletion
@@ -43,4 +43,15 @@
 
 """
 Data Transformation related constant start with DATA_TRANSFORMATION VAR NAME
-"""
+"""
+DATA_TRANSFORMATION_DIR_NAME: str = "data_transformation"
+DATA_TRANSFORMATION_TRANSFORMED_DATA_DIR: str = "transformed_data"
+DATA_TRANSFORMATION_TRANSFORMED_OBJECT_DIR_NAME: str = "transformed_object"
+PREPROCSSING_OBJECT_DIR_NAME: str = "preprocessing_object"  # Added constant
+PREPROCESSING_TRANSFORMED_OBJECT_FILE_NAME: str = "transformed_object.pkl"  # Added constant
+
+# KNN imputer params to replace NaN values
+DATA_TRANSFORMATION_IMPUTER_PARAMS: dict = {
+    "n_neighbors": 3,
+    "weights": "uniform"
+}
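
DATA_TRANSFORMATION_IMPUTER_PARAMS describes a KNN-based imputer, while the component committed above currently uses SimpleImputer. A minimal sketch of how these parameters could be unpacked into scikit-learn's KNNImputer if the numeric pipeline is switched over (an illustration, not code from this commit):

# Illustration only: swaps SimpleImputer for KNNImputer driven by the constant above.
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from networksecurity.constants.Training_pipeline import DATA_TRANSFORMATION_IMPUTER_PARAMS

numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(**DATA_TRANSFORMATION_IMPUTER_PARAMS)),  # n_neighbors=3, weights='uniform'
    ('scaler', StandardScaler())
])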

networksecurity/constants/__init__.py

Lines changed: 9 additions & 0 deletions
@@ -23,6 +23,11 @@
     DATA_VALIDATION_DRIFT_REPORT_FILE_NAME,
     DATA_VALIDATION_TRAIN_FILE_NAME,
     DATA_VALIDATION_TEST_FILE_NAME,
+    # Data Transformation Constants
+    DATA_TRANSFORMATION_DIR_NAME,
+    DATA_TRANSFORMATION_TRANSFORMED_DATA_DIR,
+    DATA_TRANSFORMATION_TRANSFORMED_OBJECT_DIR_NAME,
+    DATA_TRANSFORMATION_IMPUTER_PARAMS,
 )
 
 __all__ = [
@@ -46,4 +51,8 @@
     "DATA_VALIDATION_DRIFT_REPORT_FILE_NAME",
     "DATA_VALIDATION_TRAIN_FILE_NAME",
     "DATA_VALIDATION_TEST_FILE_NAME",
+    "DATA_TRANSFORMATION_DIR_NAME",
+    "DATA_TRANSFORMATION_TRANSFORMED_DATA_DIR",
+    "DATA_TRANSFORMATION_TRANSFORMED_OBJECT_DIR_NAME",
+    "DATA_TRANSFORMATION_IMPUTER_PARAMS",
 ]
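
With the names re-exported here, callers can import the new constants from the package root instead of reaching into the Training_pipeline submodule, for example:

# Example import path enabled by this re-export
from networksecurity.constants import (
    DATA_TRANSFORMATION_DIR_NAME,
    DATA_TRANSFORMATION_IMPUTER_PARAMS,
)

print(DATA_TRANSFORMATION_DIR_NAME)        # "data_transformation"
print(DATA_TRANSFORMATION_IMPUTER_PARAMS)  # {"n_neighbors": 3, "weights": "uniform"}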
