diff --git a/.gitignore b/.gitignore
index 9c5092f2..8c9def2d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -30,11 +30,12 @@ wheels/
 **/workspace/*.bkp
 
 # Data files
-examples/**/data/
+examples/**/*data/
+examples/**/*.csv
+examples/**/*.npy
 
-# Trained metaclassifiers
-examples/ensemble_attack/trained_models/
-examples/ensemble_attack/attack_results/
+# Trained models
+examples/ensemble_attack/**/*.pkl
 
 # hydra output
 outputs/
@@ -51,3 +52,7 @@ examples/synthesizing/single_table/data/**
 examples/synthesizing/single_table/results/**
 examples/synthesizing/multi_table/data/**
 examples/synthesizing/multi_table/results/**
+
+# Training Logs
+*.err
+*.out
diff --git a/examples/ensemble_attack/configs/experiment_config.yaml b/examples/ensemble_attack/configs/experiment_config.yaml
new file mode 100644
index 00000000..5973c624
--- /dev/null
+++ b/examples/ensemble_attack/configs/experiment_config.yaml
@@ -0,0 +1,108 @@
+# Ensemble experiment configuration
+# This config can be used to run both the Ensemble attack training (``run_attack.py``) and testing phases (``test_attack_model.py``).
+base_experiment_dir: examples/ensemble_attack/tabddpm_20k_experiment_data # Processed data and experiment artifacts will be stored here
+base_data_config_dir: examples/ensemble_attack/data_configs # Training and data type configs are saved under this directory
+
+# Pipeline control
+pipeline:
+  run_data_processing: true # Set this to false if you have already saved the processed data
+  run_shadow_model_training: true # Set this to false if shadow models are already trained and saved
+  run_metaclassifier_training: true
+
+target_model: # This is only used for testing the attack on a real target model.
+  # This is for models trained on 20k data and generating 20k synthetic data
+  target_model_directory: /projects/midst-experiments/all_tabddpms/tabddpm_trained_with_20k/train/
+  target_model_id: 21 # Will be overridden per SLURM array task
+  target_model_name: tabddpm_${target_model.target_model_id}
+  target_synthetic_data_path: ${target_model.target_model_directory}/${target_model.target_model_name}/synthetic_data/20k/20k.csv
+  challenge_data_path: ${target_model.target_model_directory}/${target_model.target_model_name}/challenge_with_id.csv
+  challenge_label_path: ${target_model.target_model_directory}/${target_model.target_model_name}/challenge_label.csv
+
+  target_attack_artifact_dir: ${base_experiment_dir}/target_${target_model.target_model_id}_attack_artifacts/
+  attack_probabilities_result_path: ${target_model.target_attack_artifact_dir}/attack_model_${target_model.target_model_id}_proba
+  target_shadow_models_output_path: ${target_model.target_attack_artifact_dir}/tabddpm_${target_model.target_model_id}_shadows_dir
+
+
+# Data paths
+data_paths:
+  midst_data_path: /projects/midst-experiments/all_tabddpms # Used to collect the data
+  population_path: ${base_experiment_dir}/population_data # Path where the collected population data will be stored
+  processed_attack_data_path: ${base_experiment_dir}/attack_data # Path where the processed attack real train and evaluation data is stored
+  attack_evaluation_result_path: ${base_experiment_dir}/evaluation_results # Path where the attack evaluation results will be stored
+
+model_paths:
+  metaclassifier_model_path: ${base_experiment_dir}/trained_models # Path where the trained metaclassifier model will be saved
+
+
+# Dataset specific information used for processing in this example
+data_processing_config:
+  population_attack_data_types_to_collect:
+    [
+      
"tabddpm_trained_with_20k", + ] + challenge_attack_data_types_to_collect: + [ + "tabddpm_trained_with_20k", + ] + population_splits: ["train"] # Data splits to be collected for population data + challenge_splits: ["train"] # Data splits to be collected for challenge points + # The column name in the data to be used for stratified splitting. + column_to_stratify: "trans_type" # Attention: This value is not documented in the original codebase. + folder_ranges: #Specify folder ranges for any of the mentioned splits. + train: [[1, 20]] # Folders to be used for train data collection in the experiments + # File names in MIDST data directories. + single_table_train_data_file_name: "train_with_id.csv" + multi_table_train_data_file_name: "trans.csv" + challenge_data_file_name: "challenge_with_id.csv" + population_sample_size: 40000 # Population size is the total data that your attack has access to. + # In experiments, this is sampled out of all the collected training data in case the available data + # is more than this number. Note that, half of this data is actually used for training, the other half + # is used for evaluation. For example, with 40k population size, only 20k is used for training the attack model. + # TODO: make sure to consider this in experiments. + +# Training and data settings for shadow models (temporary, numbers subject to change) +shadow_training: + # Data Config files path used for training a TabDDPM model + training_json_config_paths: # Config json files used for tabddpm training on the trans table + table_domain_file_path: ${base_data_config_dir}/trans_domain.json + dataset_meta_file_path: ${base_data_config_dir}/dataset_meta.json + tabddpm_training_config_path: ${base_data_config_dir}/trans.json + # Model training artifacts are saved under shadow_models_data_path/workspace_name/exp_name + # Also, training configs for each shadow model are created under shadow_models_data_path. + shadow_models_output_path: ${base_experiment_dir}/shadow_models_and_data + target_model_output_path: ${base_experiment_dir}/shadow_target_model_and_data + # Paths to final shadow models used for metaclassifier training (relative to shadow_models_output_path) + # These paths are a result of running the shadow model training pipeline, specifically the + # train_three_sets_of_shadow_models in shadow_model_training.py + # Each .pkl file contains the training data, trained model and training results for all shadow models in a list. + final_shadow_models_path: [ + "${shadow_training.shadow_models_output_path}/initial_model_rmia_1/shadow_workspace/pre_trained_model/rmia_shadows.pkl", + "${shadow_training.shadow_models_output_path}/initial_model_rmia_2/shadow_workspace/pre_trained_model/rmia_shadows.pkl", + "${shadow_training.shadow_models_output_path}/shadow_model_rmia_third_set/shadow_workspace/trained_model/rmia_shadows_third_set.pkl", + ] + target_synthetic_data_path: ${shadow_training.target_model_output_path}/target_synthetic_data.csv + # Path to final shadow target's synthetic data (relative to target_model_output_path) + fine_tuning_config: + fine_tune_diffusion_iterations: 200000 # Original code: 200000 + fine_tune_classifier_iterations: 20000 # Original code: 20000 + pre_train_data_size: 60000 # Original code: 60000 + number_of_points_to_synthesize: 20000 # Number of synthetic data samples to be generated by shadow models. + # Original code: 20000 + + +# Metaclassifier settings +metaclassifier: + # Data types json file is used for xgboost model training. 
+ data_types_file_path: ${base_data_config_dir}/data_types.json + model_type: "xgb" + # Model training parameters + num_optuna_trials: 100 # Original code: 100 + num_kfolds: 5 + use_gpu: false + # Temporary. Might remove having an epoch parameter. + epochs: 1 + meta_classifier_model_name: ${metaclassifier.model_type}_metaclassifier_model + + +# General settings +random_seed: 42 # Set to null for no seed, or an integer for a fixed seed diff --git a/examples/ensemble_attack/config.yaml b/examples/ensemble_attack/configs/original_attack_config.yaml similarity index 70% rename from examples/ensemble_attack/config.yaml rename to examples/ensemble_attack/configs/original_attack_config.yaml index c71b0ee7..4adaa181 100644 --- a/examples/ensemble_attack/config.yaml +++ b/examples/ensemble_attack/configs/original_attack_config.yaml @@ -8,21 +8,21 @@ data_paths: midst_data_path: ${base_data_dir}/midst_data_all_attacks # Used only for reading the data population_path: ${base_data_dir}/population_data # Path where the population data should be stored processed_attack_data_path: ${base_data_dir}/attack_data # Path where the processed attack real train and evaluation data is stored - attack_results_path: ${base_example_dir}/attack_results # Path where the attack results will be stored + attack_evaluation_result_path: ${base_example_dir}/attack_results # Path where the attack evaluation results will be stored model_paths: metaclassifier_model_path: ${base_example_dir}/trained_models # Path where the trained metaclassifier model will be saved # Pipeline control pipeline: - run_data_processing: false # Set this to false if you have already saved the processed data - run_shadow_model_training: false # Set this to false if shadow models are already trained and saved + run_data_processing: true # Set this to false if you have already saved the processed data + run_shadow_model_training: true # Set this to false if shadow models are already trained and saved run_metaclassifier_training: true # Dataset specific information used for processing in this example data_processing_config: - collect_attack_data_types: + population_attack_data_types_to_collect: [ "tabddpm_black_box", "tabsyn_black_box", @@ -31,6 +31,12 @@ data_processing_config: "clavaddpm_black_box", "clavaddpm_white_box", ] + challenge_attack_data_types_to_collect: + [ + "tabddpm_black_box", + ] + population_splits: ["train"] # Data splits to be collected for population data + challenge_splits: ["train"] # Data splits to be collected for challenge points # The column name in the data to be used for stratified splitting. column_to_stratify: "trans_type" # Attention: This value is not documented in the original codebase. folder_ranges: @@ -41,8 +47,9 @@ data_processing_config: single_table_train_data_file_name: "train_with_id.csv" multi_table_train_data_file_name: "trans.csv" challenge_data_file_name: "challenge_with_id.csv" - population_sample_size: 11956 #Population size the total data that your attack has access to. - #The size of the master challenge dataset is half of the population size based on the attack design. + population_sample_size: 40000 # Population size is the total data that your attack has access to. + # The size of the master challenge dataset is half of the population size based on the attack design. + # The other half is used for evaluation. 
# Original code: 40000 # Training and data settings for shadow models (temporary, numbers subject to change) @@ -64,14 +71,14 @@ shadow_training: # These paths are a result of running the shadow model training pipeline, specifically the # train_three_sets_of_shadow_models in shadow_model_training.py # Each .pkl file contains the training data, trained model and training results for all shadow models in a list. - final_target_model_path: ${shadow_training.target_model_output_path}/target_model/shadow_workspace/trained_target_model/target_model.pkl - # Path to final target model (relative to target_model_output_path) + target_synthetic_data_path: ${shadow_training.target_model_output_path}/target_synthetic_data.csv + # Path to final target model's synthetic data (relative to target_model_output_path) fine_tuning_config: - fine_tune_diffusion_iterations: 2 # Original code: 200000 - fine_tune_classifier_iterations: 2 # Original code: 20000 - pre_train_data_size: 10 # 10 for test run. Original code: 60000 - number_of_points_to_synthesize: 200 # Number of synthetic data samples to be generated by shadow models. - # 200 for test run. Original code: 20000 + fine_tune_diffusion_iterations: 200000 # Original code: 200000 + fine_tune_classifier_iterations: 20000 # Original code: 20000 + pre_train_data_size: 60000 # Original code: 60000 + number_of_points_to_synthesize: 20000 # Number of synthetic data samples to be generated by shadow models. + # Original code: 20000 # Metaclassifier settings @@ -80,11 +87,12 @@ metaclassifier: data_types_file_path: ${base_example_dir}/data_configs/data_types.json model_type: "xgb" # Model training parameters - num_optuna_trials: 10 # Original code: 100 + num_optuna_trials: 100 # Original code: 100 num_kfolds: 5 use_gpu: false # Temporary. Might remove having an epoch parameter. 
epochs: 1 + meta_classifier_model_name: ${metaclassifier.model_type}_metaclassifier_model # General settings diff --git a/examples/ensemble_attack/data_configs/trans.json b/examples/ensemble_attack/data_configs/trans.json index 8727dc02..2ce4a8b5 100644 --- a/examples/ensemble_attack/data_configs/trans.json +++ b/examples/ensemble_attack/data_configs/trans.json @@ -8,42 +8,50 @@ }, "clustering": { "parent_scale": 1.0, - "num_clusters": 4, + "num_clusters": 50, "clustering_method": "kmeans_and_gmm" }, "diffusion": { "d_layers": [ - 4, - 8 + 512, + 1024, + 1024, + 1024, + 1024, + 512 ], - "dropout": 0.1, - "num_timesteps": 3, + "dropout": 0.0, + "num_timesteps": 2000, "model_type": "mlp", - "iterations": 3, - "batch_size": 1, + "iterations": 200000, + "batch_size": 4096, "lr": 0.0006, "gaussian_loss_type": "mse", "weight_decay": 1e-05, - "scheduler": "cosine", - "data_split_ratios": [0.5, 0.25, 0.25] + "scheduler": "cosine" }, "classifier": { "d_layers": [ - 4, - 4 + 128, + 256, + 512, + 1024, + 512, + 256, + 128 ], "lr": 0.0001, - "dim_t": 4, - "batch_size": 1, - "iterations": 2 + "dim_t": 128, + "batch_size": 4096, + "iterations": 20000 }, "sampling": { - "batch_size": 2, + "batch_size": 20000, "classifier_scale": 1.0 }, "matching": { "num_matching_clusters": 1, - "matching_batch_size": 1, + "matching_batch_size": 1000, "unique_matching": true, "no_matching": false } diff --git a/examples/ensemble_attack/real_data_collection.py b/examples/ensemble_attack/real_data_collection.py index 0334a5f2..264f71ae 100644 --- a/examples/ensemble_attack/real_data_collection.py +++ b/examples/ensemble_attack/real_data_collection.py @@ -21,6 +21,12 @@ class AttackType(Enum): TABSYN_WHITE_BOX = "tabsyn_white_box" CLAVADDPM_BLACK_BOX = "clavaddpm_black_box" CLAVADDPM_WHITE_BOX = "clavaddpm_white_box" + # Experiment attack types based on experiment settings + TABDDPM_5K = "tabddpm_trained_with_5k" + TABDDPM_10K = "tabddpm_trained_with_10k" + TABDDPM_20K = "tabddpm_trained_with_20k" + TABDDPM_50K = "tabddpm_trained_with_50k" + TABDDPM_100K = "tabddpm_trained_with_100k" def expand_ranges(ranges: list[tuple[int, int]]) -> list[int]: @@ -136,6 +142,8 @@ def collect_population_data_ensemble( midst_data_input_dir: Path, data_processing_config: DictConfig, save_dir: Path, + population_splits: list[str] | None = None, + challenge_splits: list[str] | None = None, ) -> pd.DataFrame: """ Collect the population data from the MIDST competition based on Ensemble Attack implementation. @@ -148,19 +156,34 @@ def collect_population_data_ensemble( midst_data_input_dir: The path where the MIDST data folders are stored. data_processing_config: Configuration dictionary containing data information and file names. save_dir: The path where the collected population data should be saved. + population_splits: A list indicating the data splits to be collected for population data. + Could be any of `train`, `dev`, or `final` data splits. If None, the default list of ``["train"]`` + is set in the function based on the original attack implementation. + challenge_splits: A list indicating the data splits to be collected for challenge points. + Could be any of `train`, `dev`, or `final` data splits. If None, the default list of + ``["train", "dev", "final"]`` is set in the function based on the original attack implementation. Returns: The collected population data as a dataframe. """ + # Population data will be saved under ``save_dir``. 
+ save_dir.mkdir(parents=True, exist_ok=True) + + if population_splits is None: + population_splits = ["train"] + if challenge_splits is None: + # Original Ensemble collects all the challenge points from train, dev and final of "tabddpm_black_box" attack. + challenge_splits = ["train", "dev", "final"] + # Ensemble Attack collects train data of all the attack types (black box and white box) - attack_names = data_processing_config.collect_attack_data_types + attack_names = data_processing_config.population_attack_data_types_to_collect # Provided attack name are valid based on AttackType enum - attack_types: list[AttackType] = [AttackType(attack_name) for attack_name in attack_names] + population_attack_types: list[AttackType] = [AttackType(attack_name) for attack_name in attack_names] df_population = collect_midst_data( midst_data_input_dir, - attack_types, - data_splits=["train"], + population_attack_types, + data_splits=population_splits, dataset="train", data_processing_config=data_processing_config, ) @@ -170,12 +193,12 @@ def collect_population_data_ensemble( save_dataframe(df_population, save_dir, "population_all.csv") save_dataframe(df_population_no_id, save_dir, "population_all_no_id.csv") - # Collect all the challenge points from train, dev and final of "tabddpm_black_box" attack. - challenge_attack_types = [AttackType.TABDDPM_BLACK_BOX] + challenge_attack_names = data_processing_config.challenge_attack_data_types_to_collect + challenge_attack_types = [AttackType(attack_name) for attack_name in challenge_attack_names] df_challenge = collect_midst_data( midst_data_input_dir, attack_types=challenge_attack_types, - data_splits=["train", "dev", "final"], + data_splits=challenge_splits, dataset="challenge", data_processing_config=data_processing_config, ) diff --git a/examples/ensemble_attack/run.sh b/examples/ensemble_attack/run.sh deleted file mode 100755 index 70379ecb..00000000 --- a/examples/ensemble_attack/run.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -# This script sets up the environment and runs the ensemble attack example. - -source .venv/bin/activate - -echo "Active Environment:" -which python - -echo "Experiments Launched" - -python -m examples.ensemble_attack.run_attack - -echo "Experiments Completed" diff --git a/examples/ensemble_attack/run_attack.py b/examples/ensemble_attack/run_attack.py index 1a7eddee..5592e08e 100644 --- a/examples/ensemble_attack/run_attack.py +++ b/examples/ensemble_attack/run_attack.py @@ -13,6 +13,7 @@ from examples.ensemble_attack.real_data_collection import collect_population_data_ensemble from midst_toolkit.attacks.ensemble.process_split_data import process_split_data from midst_toolkit.common.logger import log +from midst_toolkit.common.random import set_all_random_seeds def run_data_processing(config: DictConfig) -> None: @@ -28,6 +29,8 @@ def run_data_processing(config: DictConfig) -> None: midst_data_input_dir=Path(config.data_paths.midst_data_path), data_processing_config=config.data_processing_config, save_dir=Path(config.data_paths.population_path), + population_splits=config.data_processing_config.population_splits, + challenge_splits=config.data_processing_config.challenge_splits, ) # The following function saves the required dataframe splits in the specified processed_attack_data_path path. 
process_split_data( @@ -41,7 +44,7 @@ def run_data_processing(config: DictConfig) -> None: log(INFO, "Data processing pipeline finished.") -@hydra.main(config_path=".", config_name="config", version_base=None) +@hydra.main(config_path="configs", config_name="experiment_config", version_base=None) def main(config: DictConfig) -> None: """ Run the Ensemble Attack example pipeline. @@ -52,8 +55,13 @@ def main(config: DictConfig) -> None: Args: config: Attack configuration as an OmegaConf DictConfig object. """ + if config.random_seed is not None: + set_all_random_seeds(seed=config.random_seed) + log(INFO, f"Training phase random seed set to {config.random_seed}.") + if config.pipeline.run_data_processing: run_data_processing(config) + # Note: Importing the following two modules causes a segmentation fault error if imported together in this file. # A quick solution is to load modules dynamically if any of the pipelines is called. # TODO: Investigate the source of error. @@ -62,22 +70,21 @@ def main(config: DictConfig) -> None: shadow_data_paths = shadow_pipeline.run_shadow_model_training(config) shadow_data_paths = [Path(path) for path in shadow_data_paths] - target_data_path = shadow_pipeline.run_target_model_training(config) - target_data_path = Path(target_data_path) + target_model_synthetic_path = shadow_pipeline.run_target_model_training(config) if config.pipeline.run_metaclassifier_training: if not config.pipeline.run_shadow_model_training: # If shadow model training is skipped, we need to provide the previous shadow model and target model paths. - shadow_data_paths = [Path(path) for path in config.shadow_training.final_shadow_models_path] - - target_data_path = Path(config.shadow_training.final_target_model_path) + target_model_synthetic_path = Path(config.shadow_training.target_synthetic_data_path) assert len(shadow_data_paths) == 3, "The attack_data_paths list must contain exactly three elements." - assert target_data_path is not None, "The target_data_path must be provided for metaclassifier training." + assert target_model_synthetic_path is not None, ( + "The target_data_path must be provided for metaclassifier training." + ) meta_pipeline = importlib.import_module("examples.ensemble_attack.run_metaclassifier_training") - meta_pipeline.run_metaclassifier_training(config, shadow_data_paths, target_data_path) + meta_pipeline.run_metaclassifier_training(config, shadow_data_paths, target_model_synthetic_path) if __name__ == "__main__": diff --git a/examples/ensemble_attack/run_metaclassifier_training.py b/examples/ensemble_attack/run_metaclassifier_training.py index 51e48563..e6a9c8e5 100644 --- a/examples/ensemble_attack/run_metaclassifier_training.py +++ b/examples/ensemble_attack/run_metaclassifier_training.py @@ -1,9 +1,9 @@ import pickle -from datetime import datetime from logging import INFO from pathlib import Path import numpy as np +import pandas as pd from omegaconf import DictConfig from midst_toolkit.attacks.ensemble.blending import BlendingPlusPlus, MetaClassifierType @@ -14,16 +14,17 @@ def run_metaclassifier_training( config: DictConfig, shadow_data_paths: list[Path], - target_data_path: Path, + target_model_synthetic_path: Path, ) -> None: """ - Fuction to run the metaclassifier training and evaluation. + Function to run the metaclassifier training and evaluation. Args: config: Configuration object set in config.yaml. shadow_data_paths: List of paths to the trained shadow models and all their attributes and synthetic data. 
The list should contain three paths, one for each set of shadow models. - target_data_path: Path to the target model and all its attributes and synthetic data. + target_model_synthetic_path: Path to the target model's synthetic data. This is all we need from a target + model to train the metaclassifier in the black-box setting. """ log(INFO, "Running metaclassifier training...") @@ -63,16 +64,16 @@ def run_metaclassifier_training( shadow_data_and_result = pickle.load(f) shadow_data_collection.append(shadow_data_and_result) - assert target_data_path.exists(), ( - f"No file found at {target_data_path}. Make sure the path is correct and that you have trained the target model." + assert Path(target_model_synthetic_path).exists(), ( + f"No file found at {target_model_synthetic_path}. " + f"Make sure the path is correct and that you have access to target model's synthetic data." ) - with open(target_data_path, "rb") as f: - target_data_and_result = pickle.load(f) + # Load the target model's synthetic data + target_synthetic_data = pd.read_csv(target_model_synthetic_path) - target_synthetic = target_data_and_result["trained_results"][0].synthetic_data - assert target_synthetic is not None, "Target model pickle missing synthetic_data." - target_synthetic = target_synthetic.copy() + assert target_synthetic_data is not None, "Target model's synthetic data is missing." + target_synthetic_data = target_synthetic_data.copy() df_reference = load_dataframe( Path(config.data_paths.population_path), @@ -90,63 +91,54 @@ def run_metaclassifier_training( df_meta_test = df_meta_test.drop(columns=["trans_id", "account_id"]) # Fit the metaclassifier. - meta_classifier_enum = MetaClassifierType(config.metaclassifier.model_type) + meta_classifier_type = MetaClassifierType(config.metaclassifier.model_type) # 1. Initialize the attacker blending_attacker = BlendingPlusPlus( config=config, shadow_data_collection=shadow_data_collection, - target_data=target_data_and_result, - meta_classifier_type=meta_classifier_enum, + data_types_file_path=Path(config.metaclassifier.data_types_file_path), + meta_classifier_type=meta_classifier_type, random_seed=config.random_seed, ) - log(INFO, f"{meta_classifier_enum} created with random seed {config.random_seed}.") + log(INFO, f"{meta_classifier_type} created with random seed {config.random_seed}.") # 2. Train the attacker on the meta-train set - blending_attacker.fit( df_train=df_meta_train, y_train=y_meta_train, - df_target_synthetic=target_synthetic, + df_target_synthetic=target_synthetic_data, df_reference=df_reference, id_column_data=train_trans_ids, use_gpu=config.metaclassifier.use_gpu, epochs=config.metaclassifier.epochs, ) - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - model_filename = f"{timestamp}_{config.metaclassifier.model_type}_trained_metaclassifier.pkl" - with open(Path(config.model_paths.metaclassifier_model_path) / model_filename, "wb") as f: + model_filename = config.metaclassifier.meta_classifier_model_name + model_path = Path(config.model_paths.metaclassifier_model_path) / f"{model_filename}.pkl" + model_path.parent.mkdir(parents=True, exist_ok=True) + with open(model_path, "wb") as f: pickle.dump(blending_attacker.trained_model, f) log(INFO, "Metaclassifier model saved, starting evaluation...") - # Get the synthetic data provided by the challenge for evaluation - # TODO: Check if the file is the correct one. - df_synthetic_original = load_dataframe( - Path(config.data_paths.processed_attack_data_path), - "synth.csv", - ) - - # 3. 
Get predictions on the test set + # 3. Get predictions on the meta test set (evaluation of the trained metaclassifier) + # For evaluation, we test the meta classifier on the meta test set provided the target's synthetic data. probabilities, pred_score = blending_attacker.predict( df_test=df_meta_test, - df_original_synthetic=df_synthetic_original, + df_original_synthetic=target_synthetic_data, # For evaluation only df_reference=df_reference, id_column_data=test_trans_ids, y_test=y_meta_test, ) - # Save the prediction probabilities - attack_results_path = Path(config.data_paths.attack_results_path) - attack_results_path.mkdir(parents=True, exist_ok=True) - np.save( - Path(config.data_paths.attack_results_path) - / f"{timestamp}_{config.metaclassifier.model_type}_test_pred_proba.npy", - probabilities, - ) - log(INFO, "Test set prediction probabilities saved.") + # Save the evaluation prediction probabilities + attack_evaluation_result_path = Path(config.data_paths.attack_evaluation_result_path) + attack_evaluation_result_path.mkdir(parents=True, exist_ok=True) + file_name = attack_evaluation_result_path / f"{model_filename}_val_pred_proba.npy" + np.save(file_name, probabilities) + log(INFO, f"Evaluation prediction probabilities saved at {file_name}.") if pred_score is not None: log(INFO, f"TPR at FPR=0.1: {pred_score:.4f}") diff --git a/examples/ensemble_attack/run_shadow_model_training.py b/examples/ensemble_attack/run_shadow_model_training.py index 859464b6..d4a85cbc 100644 --- a/examples/ensemble_attack/run_shadow_model_training.py +++ b/examples/ensemble_attack/run_shadow_model_training.py @@ -1,8 +1,6 @@ -import pickle import shutil from logging import INFO from pathlib import Path -from typing import Any from omegaconf import DictConfig @@ -25,7 +23,7 @@ def run_target_model_training(config: DictConfig) -> Path: config: Configuration object set in config.yaml. Returns: - Path to the saved target model results. + Path to the saved target model's synthetic data. """ log(INFO, "Running target model training...") @@ -42,7 +40,6 @@ def run_target_model_training(config: DictConfig) -> Path: # TODO: Add this to config or .json files table_name = "trans" - id_column_name = "trans_id" target_folder = target_model_output_path / "target_model" @@ -70,24 +67,16 @@ def run_target_model_training(config: DictConfig) -> Path: number_of_points_to_synthesize=config.shadow_training.number_of_points_to_synthesize, ) - # TODO: Check: Selected_id_lists should be of form [[]] - selected_id_lists = [df_real_data[id_column_name].tolist()] + # To train the attack model (metaclassifier), we only need to save target's synthetic data, + # and not the entire target model's training result object. + assert train_result.synthetic_data is not None, "Target model synthetic data is not generated successfully." 
+ target_synthetic_data = train_result.synthetic_data - attack_data: dict[str, Any] = { - "selected_sets": selected_id_lists, - "trained_results": [], - } + # Save the target model's synthetic data + target_model_synthetic_path = config.shadow_training.target_synthetic_data_path + target_synthetic_data.to_csv(target_model_synthetic_path, index=False) - attack_data["trained_results"].append(train_result) - - # Pickle dump the results - result_path = Path(save_dir, "target_model.pkl") - with open(result_path, "wb") as file: - pickle.dump(attack_data, file) - - log(INFO, f"Target model training finished and saved at {result_path}") - - return result_path + return target_model_synthetic_path def run_shadow_model_training(config: DictConfig) -> list[Path]: diff --git a/examples/ensemble_attack/run_test.sh b/examples/ensemble_attack/run_test.sh new file mode 100644 index 00000000..70d7c084 --- /dev/null +++ b/examples/ensemble_attack/run_test.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=1 +#SBATCH --gres=gpu:1 +#SBATCH --mem=32G +#SBATCH --qos=m +#SBATCH --job-name=ensemble_attack_test +#SBATCH --output=%j_%x_%a.out +#SBATCH --error=%j_%x_%a.err +#SBATCH --time=9:00:00 +#SBATCH --array=0-2 # For 3 target_model_ids (adjust range as needed) + +# This script sets up the environment and runs the ensemble attack test script. +source .venv/bin/activate + +echo "Active Environment:" +which python + +# Map SLURM_ARRAY_TASK_ID to target_model_id. +TARGET_IDS=(21 22 23) # List of target IDs +TARGET_ID=${TARGET_IDS[$SLURM_ARRAY_TASK_ID]} + +echo "Running test for target_model_id: $TARGET_ID" + +echo "Experiments Launched" + +python -m examples.ensemble_attack.test_attack_model target_model.target_model_id=$TARGET_ID # Overrides the target_model_id in config. + +echo "Experiments Completed" diff --git a/examples/ensemble_attack/run_train.sh b/examples/ensemble_attack/run_train.sh new file mode 100755 index 00000000..72151c4b --- /dev/null +++ b/examples/ensemble_attack/run_train.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=1 +#SBATCH --gres=gpu:1 +#SBATCH --mem=32G +#SBATCH --qos=m +#SBATCH --job-name=ensemble_attack_train +#SBATCH --output=%j_%x.out +#SBATCH --error=%j_%x.err +#SBATCH --time=12:00:00 + + +# This script sets up the environment and runs the ensemble attack example. 
+source .venv/bin/activate
+
+echo "Active Environment:"
+which python
+
+echo "Experiments Launched"
+
+python -m examples.ensemble_attack.run_attack
+
+echo "Experiments Completed"
diff --git a/examples/ensemble_attack/test_attack_model.py b/examples/ensemble_attack/test_attack_model.py
new file mode 100644
index 00000000..5beb90f8
--- /dev/null
+++ b/examples/ensemble_attack/test_attack_model.py
@@ -0,0 +1,172 @@
+"""This script loads the trained attack model and performs the attack on a target model given its synthetic data."""
+
+import json
+import pickle
+from logging import INFO
+from pathlib import Path
+from typing import Any
+
+import hydra
+import numpy as np
+import pandas as pd
+from omegaconf import DictConfig
+
+from examples.ensemble_attack.run_shadow_model_training import run_shadow_model_training
+from midst_toolkit.attacks.ensemble.blending import BlendingPlusPlus, MetaClassifierType
+from midst_toolkit.attacks.ensemble.data_utils import load_dataframe
+from midst_toolkit.common.logger import log
+from midst_toolkit.common.random import set_all_random_seeds
+
+
+def run_rmia_shadow_training(config: DictConfig) -> list[dict[str, list[Any]]]:
+    """
+    Three sets of shadow models will be trained as part of this attack.
+    Note that for every new target model, shadow models need to be trained.
+    RMIA signals (for the challenge points) are calculated based on these shadow models,
+    and will be fed into the metaclassifier.
+
+    Args:
+        config: Configuration object set in ``experiment_config.yaml``.
+
+    Returns:
+        A list containing three dictionaries, each representing a collection of shadow
+        models with their training data and generated synthetic outputs.
+    """
+    shadow_model_paths = run_shadow_model_training(config)
+
+    assert len(shadow_model_paths) == 3, "For testing, meta classifier needs the path to three sets of shadow models."
+
+    shadow_data_collection = []
+    for model_path in shadow_model_paths:
+        assert model_path.exists(), (
+            f"No file found at {model_path}. Make sure the path is correct, or run shadow model training first."
+        )
+
+        with open(model_path, "rb") as f:
+            shadow_data_and_result = pickle.load(f)
+            shadow_data_collection.append(shadow_data_and_result)
+
+    return shadow_data_collection
+
+
+@hydra.main(config_path="configs", config_name="experiment_config", version_base=None)
+def run_metaclassifier_testing(
+    config: DictConfig,
+) -> None:
+    """
+    Function to run the attack on a target model using a trained metaclassifier.
+    Note that RMIA shadow models need to be trained for every new target model's challenge dataset.
+    However, we load the previously trained metaclassifier model and use it for new target models.
+    Unlike the training phase, in the testing phase, we don't need to train a shadow target model
+    since we already have access to the synthetic data of a real target model.
+    All the collected population data that is used for training is still needed during testing to compute some
+    of the signals.
+    Test prediction probabilities are saved to the specified attack result path in the config.
+
+    Args:
+        config: Configuration object set in ``experiment_config.yaml``.
+    """
+    log(INFO, f"Running metaclassifier testing on target model {config.target_model.target_model_id}...")
+
+    if config.random_seed is not None:
+        set_all_random_seeds(seed=config.random_seed)
+        log(INFO, f"Testing phase random seed set to {config.random_seed}.")
+
+    # 1) Load the trained metaclassifier model to make sure it exists before proceeding.
+    meta_classifier_type = MetaClassifierType(config.metaclassifier.model_type)
+
+    metaclassifier_model_name = config.metaclassifier.meta_classifier_model_name
+    metaclassifier_path = Path(config.model_paths.metaclassifier_model_path) / f"{metaclassifier_model_name}.pkl"
+    assert metaclassifier_path.exists(), (
+        f"No metaclassifier model found at {metaclassifier_path}. Make sure to run the training script first."
+    )
+
+    with open(metaclassifier_path, "rb") as f:
+        trained_metaclassifier_model = pickle.load(f)
+
+    log(INFO, "Metaclassifier model loaded, starting the test...")
+
+    # 2) Read target model's challenge data and synthetic data.
+
+    # Black-box attacker only has access to the target model's synthetic data and challenge points.
+    # We also load challenge labels to report the attack performance.
+    challenge_data_path = Path(config.target_model.challenge_data_path)
+    challenge_label_path = Path(config.target_model.challenge_label_path)
+
+    test_data = pd.read_csv(challenge_data_path)
+    log(INFO, f"Challenge data loaded from {challenge_data_path} with a size of {len(test_data)}.")
+
+    test_target = pd.read_csv(challenge_label_path).to_numpy().squeeze()
+    assert len(test_data) == len(test_target), "Number of challenge labels must match number of challenge data points."
+
+    target_synthetic_path = Path(config.target_model.target_synthetic_data_path)
+    target_synthetic_data = pd.read_csv(target_synthetic_path)
+    log(
+        INFO, f"Target synthetic data loaded from {target_synthetic_path} with a size of {len(target_synthetic_data)}."
+    )
+
+    # Extract trans_id from the test dataframe
+    with open(Path(config.metaclassifier.data_types_file_path), "r") as f:
+        column_types = json.load(f)
+    id_column_name = column_types["id_column_name"]
+
+    assert id_column_name in test_data.columns, f"Test data must have {id_column_name} column"
+    test_trans_ids = test_data[id_column_name]
+
+    # Drop id columns from test data
+    id_column_names = [column_name for column_name in test_data.columns if column_name.endswith("_id")]
+    test_data = test_data.drop(columns=id_column_names)
+
+    # 3) Shadow Model Training Step.
+
+    # Make sure to assign a new path for shadow models trained for the target's challenge points to
+    # avoid overwriting the training phase's shadow models.
+    config.shadow_training.shadow_models_output_path = config.target_model.target_shadow_models_output_path
+    shadow_data_collection = run_rmia_shadow_training(config)
+
+    # 4) Initialize the attacker object, and assign the loaded metaclassifier to it.
+    target_synthetic_data = target_synthetic_data.copy()
+
+    df_reference = load_dataframe(
+        Path(config.data_paths.population_path),
+        "population_all_with_challenge_no_id.csv",
+    )
+
+    blending_attacker = BlendingPlusPlus(
+        config=config,
+        shadow_data_collection=shadow_data_collection,
+        data_types_file_path=Path(config.metaclassifier.data_types_file_path),
+        meta_classifier_type=meta_classifier_type,
+        random_seed=config.random_seed,
+    )
+
+    # Assign the trained metaclassifier model to the attacker object.
+    blending_attacker.trained_model = trained_metaclassifier_model
+
+    # 5) Get predictions on the challenge data (test set).
+ probabilities, pred_score = blending_attacker.predict( + df_test=test_data, + df_original_synthetic=target_synthetic_data, + df_reference=df_reference, + id_column_data=test_trans_ids, + y_test=test_target, + ) + + # Save the validation prediction probabilities + attack_results_path = Path(config.target_model.attack_probabilities_result_path) + attack_results_path.mkdir(parents=True, exist_ok=True) + file_name = attack_results_path / f"{metaclassifier_model_name}_test_pred_proba.npy" + np.save(file_name, probabilities) + log(INFO, f"Test prediction probabilities saved at {file_name}.") + + if pred_score is not None: + log(INFO, f"TPR at FPR=0.1: {pred_score:.4f}") + + # Save the metric results into a text file. + metric_save_path = attack_results_path / f"prediction_score_{metaclassifier_model_name}.txt" + with open(metric_save_path, "w") as f: + f.write(f"TPR at FPR=0.1: {pred_score:.4f}\n") + + +if __name__ == "__main__": + run_metaclassifier_testing() diff --git a/src/midst_toolkit/attacks/ensemble/blending.py b/src/midst_toolkit/attacks/ensemble/blending.py index 83c31804..e2a37f2a 100644 --- a/src/midst_toolkit/attacks/ensemble/blending.py +++ b/src/midst_toolkit/attacks/ensemble/blending.py @@ -3,6 +3,7 @@ import json from enum import Enum from logging import INFO +from pathlib import Path from typing import Any import numpy as np @@ -11,8 +12,8 @@ from sklearn.linear_model import LogisticRegression from midst_toolkit.attacks.ensemble.distance_features import calculate_domias_score, calculate_gower_features +from midst_toolkit.attacks.ensemble.metric_utils import get_tpr_at_fpr from midst_toolkit.attacks.ensemble.rmia.rmia_calculation import calculate_rmia_signals -from midst_toolkit.attacks.ensemble.train_utils import get_tpr_at_fpr from midst_toolkit.attacks.ensemble.xgboost_tuner import XgBoostHyperparameterTuner from midst_toolkit.common.logger import log @@ -27,7 +28,7 @@ def __init__( self, config: DictConfig, shadow_data_collection: list[dict[str, list[Any]]], - target_data: dict[str, list[Any]], + data_types_file_path: Path, meta_classifier_type: MetaClassifierType = MetaClassifierType.XGB, random_seed: int | None = None, ) -> None: @@ -50,23 +51,15 @@ def __init__( is a list of type TrainingResult containing model training information and generated synthetic data. For more details, see the documentation of `train_three_sets_of_shadow_models` at attacks/ensemble/rmia/shadow_model_training.py. - target_data: Dictionary containing the training data of the target model and its generated synthetic data. - The dictionary contains the keys "selected_sets" and "trained_results". - Selected_sets is a list of dataframes used to train the target model, and trained_results - is a list of type TrainingResult containing model training information and generated synthetic data. - For more details, see the documentation of `train_three_sets_of_shadow_models` at - attacks/ensemble/rmia/shadow_model_training.py. - + data_types_file_path: Path to the JSON file containing data column types information. meta_classifier_type: Type of meta classifier model. Defaults to MetaClassifierType.XGB. random_seed: Random seed for reproducibility. Defaults to None. """ - # TODO: We can directly pass the `data_types_file_path` as a parameter to this class. 
-        with open(config.metaclassifier.data_types_file_path, "r") as f:
+        with open(data_types_file_path, "r") as f:
             self.column_types = json.load(f)
 
         self.shadow_data_collection = shadow_data_collection
-        self.target_data = target_data
         self.meta_classifier_type = meta_classifier_type
         self.trained_model = None
         self.random_seed = random_seed
@@ -88,7 +81,7 @@ def _prepare_meta_features(
 
         Args:
             df_input: Input dataframe (e.g., meta-classifier train or test set).
-            df_synthetic: Synthetic dataframe.
+            df_synthetic: Synthetic dataframe generated by the target model.
             df_reference: Real population dataframe, used as a reference for calculating the DOMIAS score.
             id_column_data: The data in the ID column, used to ensure correct alignment of results.
             categorical_cols: Categorical column names.
@@ -109,7 +102,7 @@
         rmia_signals = calculate_rmia_signals(
             df_input=df_input,
             shadow_data_collection=self.shadow_data_collection,
-            target_data=self.target_data,
+            target_synthetic_data=df_synthetic,
             categorical_column_names=categorical_cols,
             id_column_name=id_column_name,
             id_column_data=id_column_data,
@@ -238,7 +231,10 @@
             - Probabilities of membership for the test data.
             - TPR at FPR (if y_test is provided), or None otherwise.
         """
-        assert self.trained_model is not None, "You must call .fit() before .predict()"
+        assert self.trained_model is not None, (
+            "You must call .fit() before .predict(), "
+            "or assign a trained model to the BlendingPlusPlus object."
+        )
 
         df_test_features = self._prepare_meta_features(
             df_input=df_test,
diff --git a/src/midst_toolkit/attacks/ensemble/train_utils.py b/src/midst_toolkit/attacks/ensemble/metric_utils.py
similarity index 69%
rename from src/midst_toolkit/attacks/ensemble/train_utils.py
rename to src/midst_toolkit/attacks/ensemble/metric_utils.py
index 3104e26b..4a69e4c7 100644
--- a/src/midst_toolkit/attacks/ensemble/train_utils.py
+++ b/src/midst_toolkit/attacks/ensemble/metric_utils.py
@@ -14,7 +14,10 @@ def get_tpr_at_fpr(
 
     Args:
         true_membership: Array of true binary labels (0 or 1).
-        predictions: Array of predicted probabilities or scores.
+        predictions: A list of values in the range [0,1] indicating the confidence
+            that a challenge point is a member. The closer the value to 1, the more
+            confident the predictor is about the hypothesis that the challenge point is
+            a member.
        max_fpr: Maximum False Positive Rate threshold. Defaults to 0.1.
 
     Returns:
diff --git a/src/midst_toolkit/attacks/ensemble/process_split_data.py b/src/midst_toolkit/attacks/ensemble/process_split_data.py
index 55d0a33c..bbf0360f 100644
--- a/src/midst_toolkit/attacks/ensemble/process_split_data.py
+++ b/src/midst_toolkit/attacks/ensemble/process_split_data.py
@@ -167,6 +167,8 @@ def process_split_data(
         num_total_samples: The number of samples randomly selected from the population. Defaults to 40,000.
         random_seed: Seed for random number generation to ensure reproducibility. Defaults to None.
     """
+    # First, make sure the output directory exists.
+ processed_attack_data_path.mkdir(parents=True, exist_ok=True) # Original Ensemble attack samples 40k data points to construct # 1) the main population (real data) used for training the synthetic data generator model, # 2) evaluation that is the meta train data (membership classification train dataset) used to train diff --git a/src/midst_toolkit/attacks/ensemble/rmia/rmia_calculation.py b/src/midst_toolkit/attacks/ensemble/rmia/rmia_calculation.py index 7b9cb4f4..3700c4d4 100644 --- a/src/midst_toolkit/attacks/ensemble/rmia/rmia_calculation.py +++ b/src/midst_toolkit/attacks/ensemble/rmia/rmia_calculation.py @@ -22,9 +22,8 @@ class Key(Enum): def get_rmia_gower( df_input: pd.DataFrame, - model_data: dict[str, list[Any]], + model_data: list[pd.DataFrame], min_length: int, - key: Key, categorical_column_names: list[str], id_column_name: str, random_seed: int | None = None, @@ -35,13 +34,9 @@ def get_rmia_gower( Args: df_input: The dataframe to generate features for (e.g., meta classifier train or test set), derived from the challenge dataset and processed in process_split_data.py. - model_data: A dictionary with keys "fine_tuning_sets" and "fine_tuned_results". - Fine_tuning_sets is a list of dataframes used to fine-tune the shadow models, and fine_tuned_results - is a list of type TrainingResult containing model training information and generated synthetic data. + model_data: A list of dataframes, each representing the synthetic data generated by a shadow model. min_length: Minimum length across all training data, fine tuning data, and synthetic data sizes. This length will be used for downsampling to ensure consistent Gower distance calculations. - key: An instance of the Key Enum, either Key.TRAINED_RESULTS or Key.FINE_TUNED_RESULTS, - depending on which set of shadow models to use. categorical_column_names: A list of categorical column names. We assume that all other columns are numerical. id_column_name: Name of the ID column. random_seed: Random seed for reproducibility. @@ -69,8 +64,8 @@ def get_rmia_gower( df_input[numerical_columns] = df_input[numerical_columns].astype(float) - for i in range(len(model_data[key.value])): - df_synthetic = model_data[key.value][i].synthetic_data.copy() + for i in range(len(model_data)): + df_synthetic = model_data[i].copy() # Convert numerical columns to float (otherwise error in the numpy divide) df_synthetic[numerical_columns] = df_synthetic[numerical_columns].astype(float) @@ -117,7 +112,7 @@ def conditional_average(values: np.ndarray, condition_mask: np.ndarray) -> np.nd def calculate_rmia_signals( df_input: pd.DataFrame, shadow_data_collection: list[dict[str, list[Any]]], - target_data: dict[str, list[Any]], + target_synthetic_data: pd.DataFrame, categorical_column_names: list[str], id_column_name: str, id_column_data: pd.Series, @@ -166,10 +161,7 @@ def calculate_rmia_signals( which store model training metadata and the corresponding generated synthetic data. See ``train_three_sets_of_shadow_models`` in attacks/ensemble/rmia/shadow_model_training.py for additional details. - target_data: A dictionary containing information about the target model. It includes: - - ``selected_sets``: A list of DataFrames used to train the target model. - - ``trained_results``: A list of ``TrainingResult`` objects, each containing details about the model's - training process and the synthetic data generated during training. + target_synthetic_data: Target model's synthetic data as a DataFrame. categorical_column_names: A list of categorical column names. 
id_column_name: Name of the ID column. id_column_data: The data in the ID column extracted from df_input, ensuring that output signals are @@ -226,31 +218,35 @@ def calculate_rmia_signals( if not (1 <= k <= min_length): raise ValueError(f"k={k} must be within [1, {min_length}]") + shadow_synthetic_list_0 = [ + train_result.synthetic_data for train_result in fine_tuned_shadow_data_0[Key.FINE_TUNED_RESULTS.value] + ] shadow_model_gower_0 = get_rmia_gower( df_input=df_input, - model_data=fine_tuned_shadow_data_0, + model_data=shadow_synthetic_list_0, min_length=min_length, - key=Key.FINE_TUNED_RESULTS, categorical_column_names=categorical_column_names, id_column_name=id_column_name, random_seed=random_seed, ) - + shadow_synthetic_list_1 = [ + train_result.synthetic_data for train_result in fine_tuned_shadow_data_1[Key.FINE_TUNED_RESULTS.value] + ] shadow_model_gower_1 = get_rmia_gower( df_input=df_input, - model_data=fine_tuned_shadow_data_1, + model_data=shadow_synthetic_list_1, min_length=min_length, - key=Key.FINE_TUNED_RESULTS, categorical_column_names=categorical_column_names, id_column_name=id_column_name, random_seed=random_seed, ) - + shadow_synthetic_list_2 = [ + train_result.synthetic_data for train_result in trained_shadow_data[Key.TRAINED_RESULTS.value] + ] shadow_model_gower_2 = get_rmia_gower( df_input=df_input, - model_data=trained_shadow_data, + model_data=shadow_synthetic_list_2, min_length=min_length, - key=Key.TRAINED_RESULTS, categorical_column_names=categorical_column_names, id_column_name=id_column_name, random_seed=random_seed, @@ -270,9 +266,8 @@ def calculate_rmia_signals( # TODO: check key after we have the official target model target_model_gower = get_rmia_gower( df_input=df_input, - model_data=target_data, + model_data=[target_synthetic_data], min_length=min_length, - key=Key.TRAINED_RESULTS, categorical_column_names=categorical_column_names, id_column_name=id_column_name, ) diff --git a/src/midst_toolkit/attacks/ensemble/xgboost_tuner.py b/src/midst_toolkit/attacks/ensemble/xgboost_tuner.py index 2ea41e02..5ff236bb 100644 --- a/src/midst_toolkit/attacks/ensemble/xgboost_tuner.py +++ b/src/midst_toolkit/attacks/ensemble/xgboost_tuner.py @@ -9,7 +9,7 @@ from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler -from midst_toolkit.attacks.ensemble.train_utils import get_tpr_at_fpr +from midst_toolkit.attacks.ensemble.metric_utils import get_tpr_at_fpr from midst_toolkit.common.variables import DEVICE diff --git a/tests/unit/attacks/ensemble/test_meta_classifier.py b/tests/unit/attacks/ensemble/test_meta_classifier.py index ecc16c1a..dcbae882 100644 --- a/tests/unit/attacks/ensemble/test_meta_classifier.py +++ b/tests/unit/attacks/ensemble/test_meta_classifier.py @@ -38,6 +38,7 @@ def mock_config_with_json_path(): "num_optuna_trials": 100, "num_kfolds": 5, "epochs": 1, + "meta_classifier_model_name": "mock_model_name", } } ) @@ -95,7 +96,7 @@ def test_init_success(self, mock_file, mock_config_with_json_path): bpp_xgb = BlendingPlusPlus( config=mock_config_with_json_path, shadow_data_collection=[], - target_data=MOCK_TARGET_DATA, + data_types_file_path=mock_config_with_json_path.metaclassifier.data_types_file_path, meta_classifier_type=MetaClassifierType("xgb"), ) @@ -111,7 +112,7 @@ def test_init_success(self, mock_file, mock_config_with_json_path): bpp_lr = BlendingPlusPlus( config=mock_config_with_json_path, shadow_data_collection=[], - target_data=MOCK_TARGET_DATA, + 
data_types_file_path=mock_config_with_json_path.metaclassifier.data_types_file_path, meta_classifier_type=MetaClassifierType("lr"), ) assert bpp_lr.meta_classifier_type == MetaClassifierType.LR @@ -129,7 +130,7 @@ def test_init_invalid_type_raises_error(self, mock_file, mock_config_with_json_p BlendingPlusPlus( config=mock_config_with_json_path, shadow_data_collection=[], - target_data=MOCK_TARGET_DATA, + data_types_file_path=mock_config_with_json_path.metaclassifier.data_types_file_path, meta_classifier_type=MetaClassifierType("svm"), ) @@ -151,7 +152,7 @@ def test_prepare_meta_features( bpp = BlendingPlusPlus( config=mock_config_with_json_path, shadow_data_collection=[], - target_data=MOCK_TARGET_DATA, + data_types_file_path=mock_config_with_json_path.metaclassifier.data_types_file_path, ) categorical_cols = MOCK_COLUMN_TYPES_CONTENT["categorical"] @@ -201,7 +202,7 @@ def test_prepare_meta_features_rmia_calculation( bpp = BlendingPlusPlus( config=mock_config_with_json_path, shadow_data_collection=attack_collection, - target_data=MOCK_TARGET_DATA, + data_types_file_path=mock_config_with_json_path.metaclassifier.data_types_file_path, ) df_train = sample_dataframes["df_train"] @@ -245,7 +246,7 @@ def test_fit_logistic_regression( bpp = BlendingPlusPlus( config=mock_config_with_json_path, shadow_data_collection=[], - target_data=MOCK_TARGET_DATA, + data_types_file_path=mock_config_with_json_path.metaclassifier.data_types_file_path, meta_classifier_type=MetaClassifierType("lr"), ) bpp.fit( @@ -279,7 +280,7 @@ def test_fit_xgboost( bpp = BlendingPlusPlus( config=mock_config_with_json_path, shadow_data_collection=[], - target_data=MOCK_TARGET_DATA, + data_types_file_path=mock_config_with_json_path.metaclassifier.data_types_file_path, meta_classifier_type=MetaClassifierType("xgb"), ) bpp.fit( @@ -305,7 +306,9 @@ def test_predict_raises_error_if_not_fit(self, mock_file, mock_config_with_json_ mock_file.return_value.read.return_value = json.dumps(MOCK_COLUMN_TYPES_CONTENT) bpp = BlendingPlusPlus( - config=mock_config_with_json_path, shadow_data_collection=[], target_data=MOCK_TARGET_DATA + config=mock_config_with_json_path, + shadow_data_collection=[], + data_types_file_path=mock_config_with_json_path.metaclassifier.data_types_file_path, ) with pytest.raises(AssertionError): bpp.predict( @@ -333,7 +336,7 @@ def test_predict_flow( bpp = BlendingPlusPlus( config=mock_config_with_json_path, shadow_data_collection=[], - target_data=MOCK_TARGET_DATA, + data_types_file_path=mock_config_with_json_path.metaclassifier.data_types_file_path, ) bpp.trained_model = mock_classifier diff --git a/tests/unit/attacks/ensemble/test_rmia.py b/tests/unit/attacks/ensemble/test_rmia.py index b349fb84..03ec8373 100644 --- a/tests/unit/attacks/ensemble/test_rmia.py +++ b/tests/unit/attacks/ensemble/test_rmia.py @@ -93,16 +93,13 @@ def rmia_signal_data() -> dict[str, Any]: }, ] - target_data = { - "selected_sets": [pd.DataFrame(np.random.rand(5, 2))], - "trained_results": [MockTrainingResult(syn_data_5.copy())], - } + target_synthetic_data = MockTrainingResult(syn_data_5.copy()).synthetic_data return { "df_input": df_input, "id_column_data": id_column_data, "shadow_data_collection": shadow_data_collection, - "target_data": target_data, + "target_synthetic_data": target_synthetic_data, "categorical_column_names": ["city"], "id_column_name": "id", "k": k, @@ -155,11 +152,13 @@ def test_get_rmia_gower_basic_run(self, base_data, mocker): ) min_length = 3 + shadow_synthetic_list = [ + train_result.synthetic_data for 
train_result in base_data["model_data"][Key.TRAINED_RESULTS.value] + ] results = get_rmia_gower( df_input=base_data["df_input"], - model_data=base_data["model_data"], + model_data=shadow_synthetic_list, min_length=min_length, - key=Key.TRAINED_RESULTS, categorical_column_names=base_data["categorical_column_names"], id_column_name=base_data["id_column_name"], random_seed=base_data["random_seed"], @@ -196,11 +195,11 @@ def test_get_rmia_gower_with_sampling(self, base_data, mocker): mock_sample = mocker.patch("pandas.DataFrame.sample", wraps=original_syn_data.sample) min_length = 2 + synthetic_data_list = [data.synthetic_data for data in base_data["model_data"][Key.TRAINED_RESULTS.value]] get_rmia_gower( df_input=base_data["df_input"], - model_data=base_data["model_data"], + model_data=synthetic_data_list, min_length=min_length, - key=Key.TRAINED_RESULTS, categorical_column_names=base_data["categorical_column_names"], id_column_name=base_data["id_column_name"], random_seed=base_data["random_seed"], @@ -224,11 +223,13 @@ def test_get_rmia_gower_missing_categorical_column(self, base_data, mocker, capl missing_cat_cols = ["city", "non_existent_column"] with caplog.at_level("INFO"): + synthetic_data_list = [ + data.synthetic_data for data in base_data["model_data"][Key.FINE_TUNED_RESULTS.value] + ] get_rmia_gower( df_input=base_data["df_input"], - model_data=base_data["model_data"], + model_data=synthetic_data_list, min_length=1, - key=Key.FINE_TUNED_RESULTS, categorical_column_names=missing_cat_cols, id_column_name=base_data["id_column_name"], )