|
| 1 | +from network_security.exceptions.exception import NetworkSecurityException |
| 2 | +from network_security.logging.logger import logging |
| 3 | +from network_security.utils.main_utils.utils import save_numpy_array, save_object |
| 4 | +from network_security.constants.training_pipeline import TARGET_COLUMN, DATA_TRANSFORMATION_IMPUTER_PARAMS |
| 5 | +from network_security.entity.config_entity import DataTransformationConfig |
| 6 | +from network_security.entity.artifact_entity import ( |
| 7 | + DataTransformationArtifact, |
| 8 | + DataValidationArtifact |
| 9 | +) |
| 10 | + |
| 11 | +from sklearn.impute import KNNImputer |
| 12 | +from sklearn.pipeline import Pipeline |
| 13 | + |
| 14 | +import pandas as pd |
| 15 | +import numpy as np |
| 16 | +import os, sys |
| 17 | + |
| 18 | + |
class DataTransformation:
    """Impute missing feature values and persist the resulting arrays.

    The stage reads the train/test CSV files referenced by the data
    validation artifact, fits a KNN imputer pipeline on the training
    features, applies it to both splits and hands the transformed arrays
    plus the fitted pipeline to the project's save helpers.
    """

    def __init__(
        self,
        data_validation_artifact: DataValidationArtifact,
        data_transformation_config: DataTransformationConfig,
    ):
        """Keep references to the upstream artifact and this stage's config.

        Args:
            data_validation_artifact: Supplies ``valid_train_file_path`` and
                ``valid_test_file_path``.
            data_transformation_config: Supplies the three output paths
                (``transformed_train_file_path``,
                ``transformed_test_file_path`` and
                ``transformed_object_file_path``).
        """
        # Plain attribute assignment; nothing here needs exception wrapping.
        self.data_validation_artifact = data_validation_artifact
        self.data_transformation_config = data_transformation_config

    @staticmethod
    def read_data(file_path: str) -> pd.DataFrame:
        """Load a CSV file into a DataFrame.

        Args:
            file_path: Location of the CSV file.

        Returns:
            The parsed DataFrame.

        Raises:
            NetworkSecurityException: If ``pd.read_csv`` fails for any reason.
        """
        try:
            return pd.read_csv(file_path)
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    @staticmethod
    def _split_features_and_target(df: pd.DataFrame, target_column: str):
        """Return ``(features, target)`` with -1 target labels mapped to 0."""
        features = df.drop(columns=[target_column])
        # Non-mutating replace: never writes back into ``df`` and does not
        # depend on pandas' view/copy semantics for column selection.
        target = df[target_column].replace(-1, 0)
        return features, target

    @classmethod
    def get_knn_transformation_object(cls) -> Pipeline:
        """Build the unfitted imputation pipeline.

        The pipeline has a single ``"imputer"`` step: a ``KNNImputer``
        configured from ``DATA_TRANSFORMATION_IMPUTER_PARAMS``.

        Declared as a classmethod because it uses no instance state; calling
        it through an instance (``self.get_knn_transformation_object()``)
        keeps working.

        Returns:
            A new, unfitted ``Pipeline``.

        Raises:
            NetworkSecurityException: If the imputer or pipeline cannot be
                constructed.
        """
        logging.info("Inside get_knn_transformation_object method of DataTransformation class")
        try:
            knn_imputer = KNNImputer(**DATA_TRANSFORMATION_IMPUTER_PARAMS)
            return Pipeline([("imputer", knn_imputer)])
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def initiate_data_transformation(self) -> DataTransformationArtifact:
        """Run the transformation stage end to end.

        Returns:
            A ``DataTransformationArtifact`` carrying the paths of the saved
            pipeline object and of the transformed train/test arrays.

        Raises:
            NetworkSecurityException: If any step (reading, fitting,
                transforming or saving) fails.
        """
        logging.info("Starting data transformation")
        try:
            config = self.data_transformation_config

            # Read the validated train and test splits.
            train_df = DataTransformation.read_data(self.data_validation_artifact.valid_train_file_path)
            test_df = DataTransformation.read_data(self.data_validation_artifact.valid_test_file_path)

            # Separate features from the target; -1 labels become 0 for
            # better classification.
            input_feature_train_df, target_feature_train_df = self._split_features_and_target(
                train_df, TARGET_COLUMN
            )
            input_feature_test_df, target_feature_test_df = self._split_features_and_target(
                test_df, TARGET_COLUMN
            )

            # Fit the KNN imputer on the training features only, then apply
            # the same fitted pipeline to both splits.
            knn_processor_obj = self.get_knn_transformation_object().fit(input_feature_train_df)
            transformed_input_feature_train_df = knn_processor_obj.transform(input_feature_train_df)
            transformed_input_feature_test_df = knn_processor_obj.transform(input_feature_test_df)

            # Append the target as the last column of each transformed array.
            train_nparray = np.c_[transformed_input_feature_train_df, np.array(target_feature_train_df)]
            test_nparray = np.c_[transformed_input_feature_test_df, np.array(target_feature_test_df)]

            # Save the numpy arrays and the fitted pipeline to their configured paths.
            save_numpy_array(config.transformed_train_file_path, array=train_nparray)
            save_numpy_array(config.transformed_test_file_path, array=test_nparray)
            save_object(config.transformed_object_file_path, obj=knn_processor_obj)

            # Describe the outputs for the next pipeline stage.
            data_transformation_artifact = DataTransformationArtifact(
                transformation_object_path=config.transformed_object_file_path,
                transformed_train_file_path=config.transformed_train_file_path,
                transformed_test_file_path=config.transformed_test_file_path,
            )
            logging.info("Data transformation completed")
            return data_transformation_artifact
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e
0 commit comments