"""
Module to run feature extraction for EPT attack steps 2 and 3.
Overall workflow and decisions are taken from the Cyber@BGU team's attack implementation at
https://github.com/eyalgerman/MIA-EPT.
"""
from logging import INFO

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from midst_toolkit.common.enumerations import TaskType
from midst_toolkit.common.logger import log


def preprocess_train_predict(
    train_points: pd.DataFrame,
    test_points: pd.DataFrame,
    target_col: str,
    column_types: dict[str, list[str]],
    random_seed: int | None = None,
) -> tuple[np.ndarray, pd.Series, TaskType]:
    """
    Train an attribute prediction model on `train_points` and use it to predict `target_col` on `test_points`.

    The nature of the prediction task is determined by the data type of the target column. If
    `target_col` is categorical, a classification model is used; if it is numerical, a regression
    model is used. This allows the model to effectively learn the relationship between `target_col`
    and the other attributes present in the training data.

    After the model is trained on `train_points`, it is used to generate predictions for `target_col`
    on `test_points`.

    Args:
        train_points: Data to train the attribute prediction model on. Must include the target column.
        test_points: Data to test the attribute prediction model on. Must include the target column.
        target_col: Name of the target column to predict.
        column_types: Types of columns in the data. Relevant keys are "numerical" and "categorical".
        random_seed: Seed for model reproducibility. Defaults to None.

    Returns:
        predictions: Predicted values for the target column on the test data.
        y_test: True values for the target column on the test data.
        task_type: Whether the attribute prediction model was a classification or regression model.
    """
    assert target_col in train_points.columns, f"Target column '{target_col}' not found in train_points."
    assert target_col in test_points.columns, f"Target column '{target_col}' not found in test_points."

    assert set(train_points.columns) == set(test_points.columns), (
        "Columns in train_points and test_points do not match"
    )

    x_train = train_points.drop([target_col], axis=1)
    y_train = train_points[target_col]
    x_test = test_points.drop([target_col], axis=1)
    y_test = test_points[target_col]

    numeric_columns = column_types["numerical"]
    categorical_columns = column_types["categorical"]

    # Assert that the target column appears exactly once in numeric_columns + categorical_columns
    assert (numeric_columns + categorical_columns).count(target_col) == 1, (
        f"The target column '{target_col}' must appear exactly once in numeric_columns + categorical_columns"
    )

    # Assert that the union of numeric_columns and categorical_columns matches the columns in train_points
    assert set(numeric_columns + categorical_columns) == set(train_points.columns), (
        "The union of numeric_columns and categorical_columns must match the columns in train_points"
    )
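    # The target column's type determines the task: classification for a categorical target, regression for a
    # numerical one.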
    task_type = TaskType.MULTICLASS_CLASSIFICATION if target_col in categorical_columns else TaskType.REGRESSION

    # Remove target column from feature columns
    numeric_columns = [col for col in numeric_columns if col != target_col]
    categorical_columns = [col for col in categorical_columns if col != target_col]
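    # Standardize numeric features and one-hot encode categorical ones. drop="first" removes one dummy column
    # per categorical feature to avoid redundant columns, and handle_unknown="ignore" encodes categories seen
    # only at prediction time as all zeros instead of raising an error.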
    numeric_transformer = StandardScaler()
    categorical_transformer = OneHotEncoder(drop="first", handle_unknown="ignore")

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_columns),
            ("cat", categorical_transformer, categorical_columns),
        ]
    )

    model = (
        RandomForestClassifier(random_state=random_seed)
        if task_type == TaskType.MULTICLASS_CLASSIFICATION
        else RandomForestRegressor(random_state=random_seed)
    )
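    # Bundling the preprocessor and the model into a single Pipeline ensures the scaler and encoder are fit
    # only on the training data and then reused unchanged when transforming the test data for prediction.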
    model_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

    model_pipeline.fit(x_train, y_train)

    predictions = model_pipeline.predict(x_test)

    return predictions, y_test, task_type


def extract_features(
    synthetic_data: pd.DataFrame,
    challenge_data: pd.DataFrame,
    column_types: dict[str, list[str]],
    random_seed: int | None = None,
) -> pd.DataFrame:
    """
    Orchestrator function to run feature extraction for the EPT attack:

    1. For each attribute (column) in the synthetic data, train an attribute prediction model using the
        synthetic data. The synthetic data is assumed to contain no identifier columns.
    2. Use the trained model to predict the values of that attribute in the challenge data, which likewise
        should not contain identifier columns.
    3. Compute the relevant metrics (accuracy for categorical data, error and error ratio for numerical data).
    4. Compile the results into a DataFrame.

    Args:
        synthetic_data: Synthetic data to extract features from. Note: This data should not contain any identifier
            columns, as the function will attempt to train a prediction model for every column included.
        challenge_data: The data the predictions are compared against, to compute prediction accuracy/errors.
        column_types: A dictionary specifying the types of columns (numerical or categorical) in the data.
        random_seed: Random seed for reproducibility. Defaults to None.

    Returns:
        A DataFrame containing the extracted features for each attribute in the challenge data.
        It includes the following columns:
            - <column_name>: The true values for the attribute.
            - <column_name>_prediction: The predicted values for the attribute.
        If the attribute is categorical:
            - <column_name>_accuracy: The element-wise correctness of the predictions. 0 for an incorrect
                prediction, 1 for a correct one.
        If the attribute is numerical:
            - <column_name>_error: The absolute errors of the predictions.
            - <column_name>_error_ratio: The ratio of the errors to the true values, derived by dividing the
                absolute error by the true value in a zero-safe manner.
    """
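    # For example, a hypothetical numerical column "balance" yields the columns "balance", "balance_prediction",
    # "balance_error" and "balance_error_ratio"; a hypothetical categorical column "job" yields "job",
    # "job_prediction" and "job_accuracy".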
    features = []
    columns = []
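    # Train one attribute prediction model per column: the current column is the prediction target and the
    # remaining columns serve as features.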
    for column in synthetic_data.columns:
        log(INFO, f"Extracting features for column: {column}")

        predictions, y_test, task_type = preprocess_train_predict(
            train_points=synthetic_data,
            test_points=challenge_data,
            target_col=column,
            column_types=column_types,
            random_seed=random_seed,
        )

        features.append(y_test)
        columns.append(column)

        if task_type == TaskType.MULTICLASS_CLASSIFICATION:
            # TODO: Maybe change the variable name from accuracy to correctness
            # Calculate the element-wise accuracy: 1 if the prediction matches the true value, 0 otherwise
            accuracy = predictions == y_test
            accuracy = accuracy.astype(int)
            features.append(accuracy)
            columns.append(f"{column}_accuracy")

        elif task_type == TaskType.REGRESSION:
            # Calculate the absolute errors
            errors = pd.Series(np.abs(predictions - y_test), index=y_test.index)

            # Calculate the error ratio in a zero-safe manner
            denominator = y_test.replace(0, np.nan)
            error_ratio = errors / np.abs(denominator)

            # Replace infs and NaNs with the largest finite ratio. If no finite ratio exists, fall back to 1e9.
            finite_max = error_ratio[np.isfinite(error_ratio)].max()
            error_ratio = error_ratio.replace([np.inf, -np.inf], np.nan).fillna(
                finite_max if pd.notna(finite_max) else 1e9
            )

            # Save the error and the error ratio
            features.append(errors)
            features.append(error_ratio)

            columns.append(f"{column}_error")
            columns.append(f"{column}_error_ratio")

        else:
            raise ValueError(f"Unsupported task type: {task_type}")

        # Predictions from the model
        features.append(pd.Series(predictions, index=y_test.index))
        columns.append(f"{column}_prediction")

    # Create a DataFrame with the results
    df_results = pd.DataFrame(features).T
    df_results.columns = columns

    return df_results
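

if __name__ == "__main__":
    # Minimal usage sketch. The column names ("age", "balance", "job") and the randomly generated tables below
    # are hypothetical; in the actual attack pipeline the synthetic and challenge data come from the earlier
    # EPT attack steps.
    rng = np.random.default_rng(0)
    jobs = ["clerk", "engineer", "teacher"]
    synthetic = pd.DataFrame(
        {
            "age": rng.integers(18, 90, size=200),
            "balance": rng.normal(1000.0, 250.0, size=200),
            "job": rng.choice(jobs, size=200),
        }
    )
    challenge = pd.DataFrame(
        {
            "age": rng.integers(18, 90, size=20),
            "balance": rng.normal(1000.0, 250.0, size=20),
            "job": rng.choice(jobs, size=20),
        }
    )
    column_types = {"numerical": ["age", "balance"], "categorical": ["job"]}

    feature_df = extract_features(synthetic, challenge, column_types, random_seed=42)
    log(INFO, f"Extracted feature columns: {list(feature_df.columns)}")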