Commit 0f54a49

finished data transformation
1 parent 5ce75e5 commit 0f54a49

File tree: 6 files changed (+176, -5 lines)

main.py

Lines changed: 21 additions & 2 deletions
@@ -1,8 +1,14 @@
 from network_security.components.data_ingestion import DataIngestion
 from network_security.components.data_validation import DataValidation
+from network_security.components.data_transformation import DataTransformation
 from network_security.exceptions.exception import NetworkSecurityException
 from network_security.logging.logger import logging
-from network_security.entity.config_entity import DataIngestionConfig, DataValidationConfig, TrainingPipelineConfig
+from network_security.entity.config_entity import (
+    TrainingPipelineConfig,
+    DataIngestionConfig,
+    DataValidationConfig,
+    DataTransformationConfig,
+)
 import sys
 
 
@@ -32,8 +38,21 @@
 
         # initiating data validation
         logging.info("Initiating data validation")
-
         data_validation_artifact = data_validation.initiate_data_validation()
         print(f"Data Validation Artifact: \n{data_validation_artifact} \n")
+
+        # data transformation configuration
+        data_transformation_config = DataTransformationConfig(training_pipeline_config=training_pipeline_config)
+        data_transformation = DataTransformation(
+            data_validation_artifact=data_validation_artifact,
+            data_transformation_config=data_transformation_config
+        )
+
+        # initiating data transformation
+        logging.info("Initiating data transformation")
+        data_transformation_artifact = data_transformation.initiate_data_transformation()
+        print(f"Data Transformation Artifact: \n{data_transformation_artifact} \n")
+
+
     except Exception as e:
         raise NetworkSecurityException(e, sys)
network_security/components/data_transformation.py

Lines changed: 89 additions & 0 deletions

@@ -0,0 +1,89 @@
+from network_security.exceptions.exception import NetworkSecurityException
+from network_security.logging.logger import logging
+from network_security.utils.main_utils.utils import save_numpy_array, save_object
+from network_security.constants.training_pipeline import TARGET_COLUMN, DATA_TRANSFORMATION_IMPUTER_PARAMS
+from network_security.entity.config_entity import DataTransformationConfig
+from network_security.entity.artifact_entity import (
+    DataTransformationArtifact,
+    DataValidationArtifact
+)
+
+from sklearn.impute import KNNImputer
+from sklearn.pipeline import Pipeline
+
+import pandas as pd
+import numpy as np
+import os, sys
+
+
+class DataTransformation:
+    def __init__(self,
+                 data_validation_artifact: DataValidationArtifact,
+                 data_transformation_config: DataTransformationConfig):
+        try:
+            self.data_validation_artifact = data_validation_artifact
+            self.data_transformation_config = data_transformation_config
+        except Exception as e:
+            raise NetworkSecurityException(e, sys)
+
+    @staticmethod
+    def read_data(file_path: str) -> pd.DataFrame:
+        try:
+            return pd.read_csv(file_path)
+        except Exception as e:
+            raise NetworkSecurityException(e, sys)
+
+    def get_knn_transformation_object(self) -> Pipeline:
+        """
+        Initialize the KNN imputer object with the parameters defined in the constants file
+        """
+        logging.info("Inside get_knn_transformation_object method of DataTransformation class")
+        try:
+            knn_imputer = KNNImputer(**DATA_TRANSFORMATION_IMPUTER_PARAMS)
+            processor: Pipeline = Pipeline([("imputer", knn_imputer)])
+            return processor
+        except Exception as e:
+            raise NetworkSecurityException(e, sys)
+
+    def initiate_data_transformation(self) -> DataTransformationArtifact:
+        logging.info("Starting data transformation")
+        try:
+            # reading train and test data
+            train_df = DataTransformation.read_data(self.data_validation_artifact.valid_train_file_path)
+            test_df = DataTransformation.read_data(self.data_validation_artifact.valid_test_file_path)
+
+            # removing the target variable
+            input_feature_train_df = train_df.drop(columns=[TARGET_COLUMN], axis=1)
+            target_feature_train_df = train_df[TARGET_COLUMN]
+            input_feature_test_df = test_df.drop(columns=[TARGET_COLUMN], axis=1)
+            target_feature_test_df = test_df[TARGET_COLUMN]
+
+            # replacing -1 in the target feature with 0 so the labels become 0/1
+            target_feature_train_df.replace(-1, 0, inplace=True)
+            target_feature_test_df.replace(-1, 0, inplace=True)
+
+            # implementing the KNN imputer
+            knn_preprocessor = self.get_knn_transformation_object()
+            knn_processor_obj = knn_preprocessor.fit(input_feature_train_df)
+            transformed_input_feature_train_df = knn_processor_obj.transform(input_feature_train_df)
+            transformed_input_feature_test_df = knn_processor_obj.transform(input_feature_test_df)
+
+            # combining input and target features for both train and test datasets
+            train_nparray = np.c_[transformed_input_feature_train_df, np.array(target_feature_train_df)]
+            test_nparray = np.c_[transformed_input_feature_test_df, np.array(target_feature_test_df)]
+
+            # saving the numpy arrays and the preprocessor object into their respective paths
+            save_numpy_array(self.data_transformation_config.transformed_train_file_path, array=train_nparray)
+            save_numpy_array(self.data_transformation_config.transformed_test_file_path, array=test_nparray)
+            save_object(self.data_transformation_config.transformed_object_file_path, obj=knn_processor_obj)
+
+            # preparing artifacts
+            data_transformation_artifact = DataTransformationArtifact(
+                transformation_object_path=self.data_transformation_config.transformed_object_file_path,
+                transformed_train_file_path=self.data_transformation_config.transformed_train_file_path,
+                transformed_test_file_path=self.data_transformation_config.transformed_test_file_path
+            )
+            logging.info("Data transformation completed")
+            return data_transformation_artifact
+        except Exception as e:
+            raise NetworkSecurityException(e, sys)
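For context on what the new component does to the feature matrix, here is a minimal, self-contained sketch of the same KNN-imputation pipeline on a toy DataFrame (the column names and values are made up for illustration; only scikit-learn and pandas are assumed):

import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline

# Toy feature frame with missing values (hypothetical columns).
features = pd.DataFrame({
    "feature_a": [1.0, 2.0, np.nan, 4.0],
    "feature_b": [0.0, np.nan, 0.0, 1.0],
})

# Same shape of pipeline as get_knn_transformation_object(): a single KNNImputer
# step, with the values from DATA_TRANSFORMATION_IMPUTER_PARAMS written out inline.
processor = Pipeline([("imputer", KNNImputer(missing_values=np.nan, n_neighbors=3, weights="uniform"))])

# fit() learns the neighbour structure from the training features; transform()
# returns a NumPy array in which each NaN is filled from the nearest rows.
filled = processor.fit(features).transform(features)
print(filled)

This mirrors the fit-on-train, transform-on-train-and-test pattern used in initiate_data_transformation, which keeps test-set statistics out of the fitted imputer.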

network_security/constants/training_pipeline/__init__.py

Lines changed: 17 additions & 2 deletions

@@ -1,4 +1,5 @@
-import os, sys
+import numpy as np
+import os
 
 """
 Training Pipeline Constants
@@ -40,4 +41,18 @@
 DATA_VALIDATION_VALID_DIR: str = "validated"
 DATA_VALIDATION_INVALID_DIR: str = "invalid"
 DATA_VALIDATION_DRIFT_REPORT_DIR: str = "drift_report"
-DATA_VALIDATION_DRIFT_REPORT_FILE_NAME: str = "report.yaml"
+DATA_VALIDATION_DRIFT_REPORT_FILE_NAME: str = "report.yaml"
+
+
+"""
+Defining constants for data transformation
+"""
+DATA_TRANSFORMATION_DIR_NAME: str = "data_transformation"
+DATA_TRANSFORMATION_TRANSFORMED_DATA_DIR: str = "transformed"
+DATA_TRANSFORMATION_TRANSFORMED_OBJECT_FILE_NAME: str = "knn_preprocessor"
+DATA_TRANSFORMATION_TRANSFORMED_OBJECT_DIR: str = "preprocessor"
+DATA_TRANSFORMATION_IMPUTER_PARAMS: dict = {  # KNN imputer parameters for replacing NaN values
+    "missing_values": np.nan,
+    "n_neighbors": 3,
+    "weights": "uniform",
+}
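Since the imputer parameters live in a plain dict, the construction in DataTransformation works by keyword unpacking; the call there is equivalent to the following, shown only as a reminder of how the constants are consumed:

import numpy as np
from sklearn.impute import KNNImputer

# KNNImputer(**DATA_TRANSFORMATION_IMPUTER_PARAMS) expands to:
knn_imputer = KNNImputer(missing_values=np.nan, n_neighbors=3, weights="uniform")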

network_security/entity/artifact_entity.py

Lines changed: 7 additions & 1 deletion

@@ -14,4 +14,10 @@ class DataValidationArtifact:
     valid_test_file_path: str
     invalid_train_file_path: str
     invalid_test_file_path: str
-    drift_report_file_path: str
+    drift_report_file_path: str
+
+
+@dataclass
+class DataTransformationArtifact:
+    transformed_train_file_path: str
+    transformed_test_file_path: str
+    transformation_object_path: str
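A small hedged illustration of the new artifact in use (the paths are placeholders, and @dataclass is assumed to already be imported at the top of this module, as it is for the existing artifacts):

artifact = DataTransformationArtifact(
    transformed_train_file_path="path/to/transformed/train.npy",         # placeholder
    transformed_test_file_path="path/to/transformed/test.npy",           # placeholder
    transformation_object_path="path/to/preprocessor/knn_preprocessor",  # placeholder
)
print(artifact.transformed_train_file_path)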

network_security/entity/config_entity.py

Lines changed: 23 additions & 0 deletions

@@ -73,4 +73,27 @@ def __init__(self, training_pipeline_config:TrainingPipelineConfig):
             self.data_validation_dir,
             training_pipeline.DATA_VALIDATION_DRIFT_REPORT_DIR,
             training_pipeline.DATA_VALIDATION_DRIFT_REPORT_FILE_NAME,
+        )
+
+
+class DataTransformationConfig:
+    def __init__(self, training_pipeline_config: TrainingPipelineConfig):
+        self.data_transformation_dir: str = os.path.join(
+            training_pipeline_config.artifact_dir,
+            training_pipeline.DATA_TRANSFORMATION_DIR_NAME,
+        )
+        self.transformed_train_file_path: str = os.path.join(
+            self.data_transformation_dir,
+            training_pipeline.DATA_TRANSFORMATION_TRANSFORMED_DATA_DIR,
+            training_pipeline.TRAIN_FILE_NAME.replace("csv", "npy"),
+        )
+        self.transformed_test_file_path: str = os.path.join(
+            self.data_transformation_dir,
+            training_pipeline.DATA_TRANSFORMATION_TRANSFORMED_DATA_DIR,
+            training_pipeline.TEST_FILE_NAME.replace("csv", "npy"),
+        )
+        self.transformed_object_file_path: str = os.path.join(
+            self.data_transformation_dir,
+            training_pipeline.DATA_TRANSFORMATION_TRANSFORMED_OBJECT_DIR,
+            training_pipeline.DATA_TRANSFORMATION_TRANSFORMED_OBJECT_FILE_NAME,
         )
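A hedged sketch of how these paths resolve at runtime. It assumes TrainingPipelineConfig can be built without arguments and that TRAIN_FILE_NAME / TEST_FILE_NAME end in ".csv" (so the .replace("csv", "npy") swap yields .npy files); the directory names come from the constants added above:

from network_security.entity.config_entity import TrainingPipelineConfig, DataTransformationConfig

training_pipeline_config = TrainingPipelineConfig()  # assumption: no-arg construction
data_transformation_config = DataTransformationConfig(training_pipeline_config=training_pipeline_config)

# Expected shape of the resolved paths (the prefix depends on artifact_dir):
#   <artifact_dir>/data_transformation/transformed/<train file>.npy
#   <artifact_dir>/data_transformation/transformed/<test file>.npy
#   <artifact_dir>/data_transformation/preprocessor/knn_preprocessor
print(data_transformation_config.transformed_train_file_path)
print(data_transformation_config.transformed_test_file_path)
print(data_transformation_config.transformed_object_file_path)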

network_security/utils/main_utils/utils.py

Lines changed: 19 additions & 0 deletions

@@ -29,3 +29,22 @@ def write_yaml_file(file_path: str,
             yaml.dump(content, file)
     except Exception as e:
         raise NetworkSecurityException(e, sys)
+
+
+def save_numpy_array(file_path: str, array: np.ndarray) -> None:
+    try:
+        dir_path = os.path.dirname(file_path)
+        os.makedirs(dir_path, exist_ok=True)
+        with open(file_path, "wb") as file:
+            np.save(file, array)
+    except Exception as e:
+        raise NetworkSecurityException(e, sys)
+
+
+def save_object(file_path: str, obj: object) -> None:
+    os.makedirs(os.path.dirname(file_path), exist_ok=True)
+    try:
+        with open(file_path, "wb") as file:
+            pickle.dump(obj, file)
+    except Exception as e:
+        raise NetworkSecurityException(e, sys)
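This commit only adds the save helpers. For completeness, a hedged sketch of the matching read-back using plain numpy and pickle (the load_* names below are hypothetical; the repository may define its own loaders elsewhere):

import numpy as np
import pickle


def load_numpy_array(file_path: str) -> np.ndarray:
    # Counterpart to save_numpy_array: np.save wrote the array, np.load reads it back.
    with open(file_path, "rb") as file:
        return np.load(file)


def load_object(file_path: str) -> object:
    # Counterpart to save_object: unpickle the saved (fitted) preprocessor pipeline.
    with open(file_path, "rb") as file:
        return pickle.load(file)


# e.g. reload the transformed training array and the fitted KNN pipeline:
# train_array = load_numpy_array(data_transformation_config.transformed_train_file_path)
# knn_pipeline = load_object(data_transformation_config.transformed_object_file_path)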
