Skip to content

Commit 5782ca8

Browse files
authored
EPT Attack: Feature Extraction (#89)
* run_ept_attack for feature extraction. * Add pre_process_and_train in feature extraction * Add tests * Packaged directory iterations, switched to the library's ENUM, resolved other comments.
1 parent 258ff33 commit 5782ca8

File tree

5 files changed

+509
-0
lines changed

5 files changed

+509
-0
lines changed

examples/common/utils.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Functions used for attacks across multiple examples.
2+
3+
from collections.abc import Generator
4+
from pathlib import Path
5+
6+
7+
def iterate_model_folders(
    input_data_path: Path, diffusion_model_names: list[str]
) -> Generator[tuple[str, Path, str], None, None]:
    """
    Iterates over the competition's shadow model folder structure and yields model information.

    For every name in ``diffusion_model_names`` the directory ``<input_data_path>/<name>_black_box`` is
    inspected. Its ``train``, ``dev`` and ``final`` sub-directories are visited in that order, and every
    directory found directly inside them is yielded in sorted path order. Mode directories that are missing
    (or that are not directories) are skipped silently.

    Args:
        input_data_path: The base path for the input data.
        diffusion_model_names: A list of diffusion model names to iterate over.

    Yields:
        A tuple containing the model name, the path to the model's data, and the model folder name.
    """
    modes = ("train", "dev", "final")
    for model_name in diffusion_model_names:
        model_path = input_data_path / f"{model_name}_black_box"
        for mode in modes:
            current_path = model_path / mode
            # `is_dir` (rather than `exists`) also skips a stray regular file named like a mode, which would
            # otherwise make `iterdir` raise NotADirectoryError.
            if not current_path.is_dir():
                continue

            # `iterdir` returns entries in arbitrary, filesystem-dependent order; sort so that repeated runs
            # process the model folders in the same sequence.
            model_folders = sorted(entry for entry in current_path.iterdir() if entry.is_dir())
            for model_folder_path in model_folders:
                yield model_name, model_folder_path, model_folder_path.name

examples/ept_attack/config.yaml

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# EPT attack example configuration
2+
# Base data directory (can be overridden from command line)
3+
base_data_dir: examples/ept_attack/data/
4+
base_example_dir: examples/ept_attack
5+
6+
# Data paths (relative to base_data_dir)
7+
data_paths:
8+
input_data_path: ${base_data_dir}/midst_data_black_box_attacks # Read-only input data directory
9+
output_data_path: ${base_data_dir}/output # Directory to save processed data and results
10+
data_types_file_path: ${base_data_dir}/data_configs/data_types.json # Path to the JSON file defining column types
11+
12+
# Pipeline control
13+
pipeline:
14+
run_data_processing: false # Whether to run data processing
15+
run_shadow_model_training: false # Whether to run shadow model training
16+
run_feature_extraction: true # Whether to run attribute prediction model training
17+
run_attack_classifier_training: false # Whether to run attack classifier training
18+
19+
attack_settings:
20+
single_table: true # Whether the data is single-table
21+
22+
# General settings
23+
random_seed: 42
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
"""
2+
This file is an incomplete example script for running the EPT-MIA Attack on MIDST challenge
3+
provided resources and data.
4+
Overall workflow and decisions are taken from the Cyber@BGU team's attack implementation at
5+
https://github.com/eyalgerman/MIA-EPT.
6+
7+
"""
8+
9+
import json
10+
from logging import INFO
11+
from pathlib import Path
12+
13+
import hydra
14+
from omegaconf import DictConfig
15+
16+
from examples.common.utils import iterate_model_folders
17+
from midst_toolkit.attacks.ensemble.data_utils import load_dataframe, save_dataframe
18+
from midst_toolkit.attacks.ept.feature_extraction import extract_features
19+
from midst_toolkit.common.logger import log
20+
21+
22+
# Step 2 and 3: Attribute prediction model training and feature extraction
23+
def run_attribute_prediction(config: DictConfig) -> None:
    """
    Train attribute prediction models and extract features for EPT-MIA attack.

    The function is specifically designed to work with the MIDST challenge data structure,
    and the shadow models provided by the competition organizers.
    All the reading and writing of data is handled within this function.

    For every model folder found under the configured input path, ``trans_synthetic.csv`` and
    ``challenge_with_id.csv`` are loaded, reduced to the non-ID feature columns, passed to
    ``extract_features``, and the result is written to
    ``<output_data_path>/attribute_prediction_features/<model_name>_black_box/attribute_prediction_features_<N>.csv``
    where ``<N>`` is the integer suffix of the model folder name.

    Args:
        config: Configuration object set in config.yaml.

    Raises:
        ValueError: If a model folder name does not end in ``_<integer>``.
    """
    log(INFO, "Running attribute prediction model training.")

    diffusion_model_names = ["tabddpm", "tabsyn"] if config.attack_settings.single_table else ["clavaddpm"]
    input_data_path = Path(config.data_paths.input_data_path)
    output_features_path = Path(config.data_paths.output_data_path, "attribute_prediction_features")

    # Load column types specific to the competition dataset. JSON is UTF-8 by specification, so do not
    # depend on the platform's default encoding.
    with open(config.data_paths.data_types_file_path, encoding="utf-8") as f:
        column_types = json.load(f)

    # Drop columns that end with '_id' from column_types, as they do not create meaningful features
    feature_column_types = {
        "numerical": [col for col in column_types.get("numerical", []) if not col.endswith("_id")],
        "categorical": [col for col in column_types.get("categorical", []) if not col.endswith("_id")],
    }

    # The set of feature columns is the same for every model folder, so build it once outside the loop.
    columns_to_keep = feature_column_types["numerical"] + feature_column_types["categorical"]

    # Iterating over directories specific to the shadow models folder structure in the competition
    for model_name, model_data_path, model_folder in iterate_model_folders(input_data_path, diffusion_model_names):
        # Load the data files as dataframes
        df_synthetic_data = load_dataframe(model_data_path, "trans_synthetic.csv")
        df_challenge_data = load_dataframe(model_data_path, "challenge_with_id.csv")

        # Keep only the columns that are present in feature_column_types
        df_synthetic_data = df_synthetic_data[columns_to_keep]
        df_challenge_data = df_challenge_data[columns_to_keep]

        # Run feature extraction
        df_extracted_features = extract_features(
            synthetic_data=df_synthetic_data,
            challenge_data=df_challenge_data,
            column_types=feature_column_types,
            random_seed=config.random_seed,
        )

        final_output_dir = output_features_path / f"{model_name}_black_box"
        final_output_dir.mkdir(parents=True, exist_ok=True)

        # Extract the number at the end of model_folder
        model_folder_number = int(model_folder.split("_")[-1])
        file_name = f"attribute_prediction_features_{model_folder_number}.csv"

        save_dataframe(df=df_extracted_features, file_path=final_output_dir, file_name=file_name)
78+
79+
80+
@hydra.main(config_path=".", config_name="config", version_base=None)
def main(config: DictConfig) -> None:
    """
    Entry point that drives the EPT-MIA Attack example pipeline.

    Shadow model training (the first step) is not implemented yet. The only step currently wired in is
    attribute prediction model training together with feature extraction (steps two and three), which
    runs when ``config.pipeline.run_feature_extraction`` is truthy.

    Args:
        config: Attack configuration as an OmegaConf DictConfig object.
    """
    log(INFO, "Running EPT-MIA Attack Example Pipeline.")

    # Report which data layout the configuration selects.
    data_layout_message = "Data: Single-table." if config.attack_settings.single_table else "Data: Multi-table."
    log(INFO, data_layout_message)

    # TODO: Implement potential data preprocessing step.
    # TODO: Implement shadow model training step.

    feature_extraction_enabled = config.pipeline.run_feature_extraction
    if feature_extraction_enabled:
        run_attribute_prediction(config)

    # TODO: Implement attack classifier training step.


if __name__ == "__main__":
    main()
Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
"""
2+
Module to run feature extraction for EPT attack steps 2 and 3.
3+
Overall workflow and decisions are taken from the Cyber@BGU team's attack implementation at
4+
https://github.com/eyalgerman/MIA-EPT.
5+
6+
"""
7+
8+
from logging import INFO
9+
10+
import numpy as np
11+
import pandas as pd
12+
from sklearn.compose import ColumnTransformer
13+
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
14+
from sklearn.pipeline import Pipeline
15+
from sklearn.preprocessing import OneHotEncoder, StandardScaler
16+
17+
from midst_toolkit.common.enumerations import TaskType
18+
from midst_toolkit.common.logger import log
19+
20+
21+
def preprocess_train_predict(
    train_points: pd.DataFrame,
    test_points: pd.DataFrame,
    target_col: str,
    column_types: dict[str, list[str]],
    random_seed: int | None = None,
) -> tuple[np.ndarray, pd.Series, TaskType]:
    """
    Fit an attribute prediction model on `train_points` and predict `target_col` for `test_points`.

    The kind of model depends on where `target_col` is listed in `column_types`: a target listed under
    "categorical" is handled with a random forest classifier, any other target with a random forest
    regressor. The remaining columns are used as inputs, with numerical columns standardized and
    categorical columns one-hot encoded.

    Args:
        train_points: Data to train the attribute prediction model on. Must include the target column.
        test_points: Data to test the attribute prediction model on. Must include the target column.
        target_col: Name of the target column to predict.
        column_types: Types of columns in the data. Relevant keys are "numerical", "categorical".
        random_seed: Seed for model reproducibility. Defaults to None.

    Returns:
        predictions: Predicted values for the target column on the test data.
        y_test: True values for the target column on the test data.
        task_type: Whether the attribution prediction model was a classification or regression model.
    """
    # The target must exist in both frames, and both frames must expose the same set of columns.
    assert target_col in train_points.columns, f"Target column '{target_col}' not found in train_points."
    assert target_col in test_points.columns, f"Target column '{target_col}' not found in test_points."

    assert set(train_points.columns) == set(test_points.columns), "Columns in df_train and df_test do not match"

    # Split inputs from the target for both the training and the test frame.
    y_train = train_points[target_col]
    x_train = train_points.drop(columns=[target_col])
    y_test = test_points[target_col]
    x_test = test_points.drop(columns=[target_col])

    numeric_columns = column_types["numerical"]
    categorical_columns = column_types["categorical"]
    declared_columns = numeric_columns + categorical_columns

    # The target has to be declared exactly once across the two type lists.
    assert declared_columns.count(target_col) == 1, (
        f"The target column '{target_col}' must appear exactly once in numeric_columns + categorical_columns"
    )

    # Every column of the training frame must be typed, and every typed column must be present.
    assert set(declared_columns) == set(train_points.columns), (
        "The union of numeric_columns and categorical_columns must match the columns in the combined dataframe"
    )

    if target_col in categorical_columns:
        task_type = TaskType.MULTICLASS_CLASSIFICATION
    else:
        task_type = TaskType.REGRESSION

    # Input columns are all declared columns except the target itself.
    numeric_inputs = [name for name in numeric_columns if name != target_col]
    categorical_inputs = [name for name in categorical_columns if name != target_col]

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), numeric_inputs),
            ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), categorical_inputs),
        ]
    )

    if task_type == TaskType.MULTICLASS_CLASSIFICATION:
        estimator = RandomForestClassifier(random_state=random_seed)
    else:
        estimator = RandomForestRegressor(random_state=random_seed)

    model_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", estimator)])
    model_pipeline.fit(x_train, y_train)

    return model_pipeline.predict(x_test), y_test, task_type
104+
105+
106+
def extract_features(
    synthetic_data: pd.DataFrame,
    challenge_data: pd.DataFrame,
    column_types: dict[str, list[str]],
    random_seed: int | None = None,
) -> pd.DataFrame:
    """
    Orchestrator function to run feature extraction for EPT attack:
    1. For each attribute (column) in the synthetic data that is not an ID, train an attribute prediction model
        using the synthetic data.
    2. Use the trained model to predict the values of that attribute in the challenge data, which also doesn't
        contain IDs.
    3. Compute relevant metrics (accuracy for categorical data, error and error ratio for numerical data).
    4. Compile the results into a DataFrame.

    Args:
        synthetic_data: Synthetic data to extract features from. Note: This data should not contain any identifier
            columns, as the function will attempt to train a prediction model for every column included.
        challenge_data: The data the predictions are compared against, to compute prediction accuracy/errors.
        column_types: A dictionary specifying the types of columns (numerical or categorical) in the data.
        random_seed: Random seed for reproducibility. Defaults to None.

    Returns:
        A DataFrame containing the extracted features for each attribute in the challenge data. Each output
        column keeps the dtype of the values it was computed from. An empty DataFrame is returned when
        `synthetic_data` has no columns. For every attribute the columns appear in this order:
            - <column_name>: The true values for the attribute.
            If the data is categorical:
                - <column_name>_accuracy: The element-wise accuracy of the predictions. 0 for incorrect prediction,
                    1 for correct.
            If the data is numerical:
                - <column_name>_error: The absolute errors of the predictions.
                - <column_name>_error_ratio: The absolute error divided by the absolute true value. Rows whose
                    true value is 0 (or whose ratio is otherwise not finite) receive the largest finite ratio
                    of that column, or 1e9 when the column has no finite ratio at all.
            - <column_name>_prediction: The predicted values for the attribute.

    Raises:
        ValueError: If the task type reported for a column is neither classification nor regression.
    """
    features: list[pd.Series] = []
    columns: list[str] = []

    for column in synthetic_data.columns:
        log(INFO, f"Extracting features for column: {column}")

        predictions, y_test, task_type = preprocess_train_predict(
            train_points=synthetic_data,
            test_points=challenge_data,
            target_col=column,
            column_types=column_types,
            random_seed=random_seed,
        )

        features.append(y_test)
        columns.append(column)

        if task_type == TaskType.MULTICLASS_CLASSIFICATION:
            # TODO: Maybe change the variable name from accuracy to correctness
            # Element-wise correctness of the prediction, encoded as 0/1
            accuracy = predictions == y_test
            accuracy = accuracy.astype(int)
            features.append(accuracy)
            columns.append(f"{column}_accuracy")

        elif task_type == TaskType.REGRESSION:
            # Calculate errors
            errors = pd.Series(np.abs(predictions - y_test), index=y_test.index)

            # Calculate the ratio of the error in a zero-safe manner: a true value of 0 becomes NaN so the
            # division yields NaN instead of inf.
            denominator = y_test.replace(0, np.nan)
            error_ratio = errors / np.abs(denominator)

            # Replace infs and NaNs with a large number. If all values are NaN, replace with 1e9.
            finite_max = error_ratio[np.isfinite(error_ratio)].max()
            error_ratio = error_ratio.replace([np.inf, -np.inf], np.nan).fillna(
                finite_max if pd.notna(finite_max) else 1e9
            )

            # Save the error and the ratio error
            features.append(errors)
            features.append(error_ratio)

            columns.append(f"{column}_error")
            columns.append(f"{column}_error_ratio")

        else:
            raise ValueError(f"Unsupported task type: {task_type}")

        # predictions from the model
        features.append(pd.Series(predictions, index=y_test.index))
        columns.append(f"{column}_prediction")

    # `pd.concat` cannot concatenate an empty list; with no input columns there is nothing to report.
    if not features:
        return pd.DataFrame()

    # Assemble column-wise. Stacking the series as rows and transposing would force every column to one
    # common dtype (object as soon as any attribute holds strings), whereas concatenating along axis=1
    # keeps each feature's own dtype.
    return pd.concat(features, axis=1, keys=columns)

0 commit comments

Comments
 (0)