Merged
Changes from 14 commits
13 changes: 9 additions & 4 deletions .gitignore
@@ -30,11 +30,12 @@ wheels/
**/workspace/*.bkp

# Data files
examples/**/data/
examples/**/*data/
examples/**/*.csv
examples/**/*.npy

# Trained metaclassifiers
examples/ensemble_attack/trained_models/
examples/ensemble_attack/attack_results/
# Trained models
examples/ensemble_attack/**/*.pkl

# hydra output
outputs/
@@ -51,3 +52,7 @@ examples/synthesizing/single_table/data/**
examples/synthesizing/single_table/results/**
examples/synthesizing/multi_table/data/**
examples/synthesizing/multi_table/results/**

# Training Logs
*.err
*.out
108 changes: 108 additions & 0 deletions examples/ensemble_attack/configs/experiment_config.yaml
@@ -0,0 +1,108 @@
# Ensemble experiment configuration
# This config can be used to run both the Ensemble attack training phase (``run_attack.py``) and the testing phase (``test_attack_model.py``).
base_experiment_dir: examples/ensemble_attack/tabddpm_20k_experiment_data # Processed data and experiment artifacts will be stored here
base_data_config_dir: examples/ensemble_attack/data_configs # Training and data type configs are saved under this directory

# Pipeline control
pipeline:
run_data_processing: true # Set this to false if you have already saved the processed data
run_shadow_model_training: true # Set this to false if shadow models are already trained and saved
run_metaclassifier_training: true

target_model: # This is only used for testing the attack on a real target model.
Review comment (Collaborator Author): We are attacking the target model tabddpm_21 with the trained metaclassifier.

# This is for models trained on 20k data and generating 20k synthetic data
target_model_directory: /projects/midst-experiments/all_tabddpms/tabddpm_trained_with_20k/train/
target_model_id: 21 # Will be overridden per SLURM array task
target_model_name: tabddpm_${target_model.target_model_id}
target_synthetic_data_path: ${target_model.target_model_directory}/${target_model.target_model_name}/synthetic_data/20k/20k.csv
challenge_data_path: ${target_model.target_model_directory}/${target_model.target_model_name}/challenge_with_id.csv
challenge_label_path: ${target_model.target_model_directory}/${target_model.target_model_name}/challenge_label.csv

target_attack_artifact_dir: ${base_experiment_dir}/target_${target_model.target_model_id}_attack_artifacts/
attack_probabilities_result_path: ${target_model.target_attack_artifact_dir}/attack_model_${target_model.target_model_id}_proba
target_shadow_models_output_path: ${target_model.target_attack_artifact_dir}/tabddpm_${target_model.target_model_id}_shadows_dir


# Data paths
data_paths:
midst_data_path: /projects/midst-experiments/all_tabddpms # Used to collect the data
population_path: ${base_experiment_dir}/population_data # Path where the collected population data will be stored
processed_attack_data_path: ${base_experiment_dir}/attack_data # Path where the processed attack real train and evaluation data is stored
attack_evaluation_result_path: ${base_experiment_dir}/evaluation_results # Path where the attack evaluation results will be stored

model_paths:
metaclassifier_model_path: ${base_experiment_dir}/trained_models # Path where the trained metaclassifier model will be saved


# Dataset specific information used for processing in this example
data_processing_config:
population_attack_data_types_to_collect:
[
"tabddpm_trained_with_20k",
]
challenge_attack_data_types_to_collect:
[
"tabddpm_trained_with_20k",
]
population_splits: ["train"] # Data splits to be collected for population data
challenge_splits: ["train"] # Data splits to be collected for challenge points
# The column name in the data to be used for stratified splitting.
column_to_stratify: "trans_type" # Attention: This value is not documented in the original codebase.
folder_ranges: # Specify folder ranges for any of the mentioned splits.
train: [[1, 20]] # Folders to be used for train data collection in the experiments
Review comment (Collaborator Author): Model IDs used for training the metaclassifier (attack model).

# File names in MIDST data directories.
single_table_train_data_file_name: "train_with_id.csv"
multi_table_train_data_file_name: "trans.csv"
challenge_data_file_name: "challenge_with_id.csv"
population_sample_size: 40000 # Population size is the total amount of data that your attack has access to.
# In experiments, this is sampled from all the collected training data in case the available data
# exceeds this number. Note that half of this data is used for training the attack model and the
# other half for evaluation. For example, with a 40k population size, only 20k is used for training.
# TODO: make sure to consider this in experiments.

# Training and data settings for shadow models (temporary, numbers subject to change)
shadow_training:
# Data Config files path used for training a TabDDPM model
training_json_config_paths: # Config json files used for tabddpm training on the trans table
table_domain_file_path: ${base_data_config_dir}/trans_domain.json
dataset_meta_file_path: ${base_data_config_dir}/dataset_meta.json
tabddpm_training_config_path: ${base_data_config_dir}/trans.json
# Model training artifacts are saved under shadow_models_data_path/workspace_name/exp_name
# Also, training configs for each shadow model are created under shadow_models_data_path.
shadow_models_output_path: ${base_experiment_dir}/shadow_models_and_data
target_model_output_path: ${base_experiment_dir}/shadow_target_model_and_data
# Paths to final shadow models used for metaclassifier training (relative to shadow_models_output_path)
# These paths are a result of running the shadow model training pipeline, specifically the
# train_three_sets_of_shadow_models in shadow_model_training.py
# Each .pkl file contains the training data, trained model and training results for all shadow models in a list.
final_shadow_models_path: [
"${shadow_training.shadow_models_output_path}/initial_model_rmia_1/shadow_workspace/pre_trained_model/rmia_shadows.pkl",
"${shadow_training.shadow_models_output_path}/initial_model_rmia_2/shadow_workspace/pre_trained_model/rmia_shadows.pkl",
"${shadow_training.shadow_models_output_path}/shadow_model_rmia_third_set/shadow_workspace/trained_model/rmia_shadows_third_set.pkl",
]
target_synthetic_data_path: ${shadow_training.target_model_output_path}/target_synthetic_data.csv
# Path to final shadow target's synthetic data (relative to target_model_output_path)
fine_tuning_config:
fine_tune_diffusion_iterations: 200000 # Original code: 200000
fine_tune_classifier_iterations: 20000 # Original code: 20000
pre_train_data_size: 60000 # Original code: 60000
number_of_points_to_synthesize: 20000 # Number of synthetic data samples to be generated by shadow models.
# Original code: 20000


# Metaclassifier settings
metaclassifier:
# Data types json file is used for xgboost model training.
data_types_file_path: ${base_data_config_dir}/data_types.json
model_type: "xgb"
# Model training parameters
num_optuna_trials: 100 # Original code: 100
num_kfolds: 5
use_gpu: false
# Temporary. The epoch parameter might be removed later.
epochs: 1
meta_classifier_model_name: ${metaclassifier.model_type}_metaclassifier_model


# General settings
random_seed: 42 # Set to null for no seed, or an integer for a fixed seed
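
For reference, the ${...} entries in this config are OmegaConf/Hydra-style interpolations (the example already passes a DictConfig around and gitignores the Hydra outputs/ directory), so target_model_name and the derived paths resolve from target_model_id at access time. The sketch below is not part of this PR; it shows one way the config could be loaded and the id overridden per SLURM array task, as the inline comment suggests. The SLURM_ARRAY_TASK_ID handling is an assumption, not code from this repo.

# Minimal sketch (assumed, not from this PR): load the config, override
# target_model_id per SLURM array task, and let ${...} interpolations resolve.
import os

from omegaconf import OmegaConf

cfg = OmegaConf.load("examples/ensemble_attack/configs/experiment_config.yaml")

# Override the target model id from the SLURM array index, if present.
task_id = os.environ.get("SLURM_ARRAY_TASK_ID")
if task_id is not None:
    cfg.target_model.target_model_id = int(task_id)

# Interpolated values resolve on access:
print(cfg.target_model.target_model_name)    # e.g. tabddpm_21
print(cfg.target_model.challenge_data_path)  # .../tabddpm_21/challenge_with_id.csv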
@@ -8,21 +8,21 @@ data_paths:
midst_data_path: ${base_data_dir}/midst_data_all_attacks # Used only for reading the data
population_path: ${base_data_dir}/population_data # Path where the population data should be stored
processed_attack_data_path: ${base_data_dir}/attack_data # Path where the processed attack real train and evaluation data is stored
attack_results_path: ${base_example_dir}/attack_results # Path where the attack results will be stored
attack_evaluation_result_path: ${base_example_dir}/attack_results # Path where the attack evaluation results will be stored

model_paths:
metaclassifier_model_path: ${base_example_dir}/trained_models # Path where the trained metaclassifier model will be saved

# Pipeline control
pipeline:
run_data_processing: false # Set this to false if you have already saved the processed data
run_shadow_model_training: false # Set this to false if shadow models are already trained and saved
run_data_processing: true # Set this to false if you have already saved the processed data
run_shadow_model_training: true # Set this to false if shadow models are already trained and saved
run_metaclassifier_training: true


# Dataset specific information used for processing in this example
data_processing_config:
collect_attack_data_types:
population_attack_data_types_to_collect:
[
"tabddpm_black_box",
"tabsyn_black_box",
@@ -31,6 +31,12 @@ data_processing_config:
"clavaddpm_black_box",
"clavaddpm_white_box",
]
challenge_attack_data_types_to_collect:
[
"tabddpm_black_box",
]
population_splits: ["train"] # Data splits to be collected for population data
challenge_splits: ["train"] # Data splits to be collected for challenge points
# The column name in the data to be used for stratified splitting.
column_to_stratify: "trans_type" # Attention: This value is not documented in the original codebase.
folder_ranges:
@@ -41,8 +47,9 @@ data_processing_config:
single_table_train_data_file_name: "train_with_id.csv"
multi_table_train_data_file_name: "trans.csv"
challenge_data_file_name: "challenge_with_id.csv"
population_sample_size: 11956 #Population size the total data that your attack has access to.
#The size of the master challenge dataset is half of the population size based on the attack design.
population_sample_size: 40000 # Population size is the total data that your attack has access to.
# The size of the master challenge dataset is half of the population size based on the attack design.
# The other half is used for evaluation.
# Original code: 40000

# Training and data settings for shadow models (temporary, numbers subject to change)
@@ -64,14 +71,14 @@ shadow_training:
# These paths are a result of running the shadow model training pipeline, specifically the
# train_three_sets_of_shadow_models in shadow_model_training.py
# Each .pkl file contains the training data, trained model and training results for all shadow models in a list.
final_target_model_path: ${shadow_training.target_model_output_path}/target_model/shadow_workspace/trained_target_model/target_model.pkl
# Path to final target model (relative to target_model_output_path)
target_synthetic_data_path: ${shadow_training.target_model_output_path}/target_synthetic_data.csv
# Path to final target model's synthetic data (relative to target_model_output_path)
fine_tuning_config:
fine_tune_diffusion_iterations: 2 # Original code: 200000
fine_tune_classifier_iterations: 2 # Original code: 20000
pre_train_data_size: 10 # 10 for test run. Original code: 60000
number_of_points_to_synthesize: 200 # Number of synthetic data samples to be generated by shadow models.
# 200 for test run. Original code: 20000
fine_tune_diffusion_iterations: 200000 # Original code: 200000
fine_tune_classifier_iterations: 20000 # Original code: 20000
pre_train_data_size: 60000 # Original code: 60000
number_of_points_to_synthesize: 20000 # Number of synthetic data samples to be generated by shadow models.
# Original code: 20000


# Metaclassifier settings
@@ -80,11 +87,12 @@ metaclassifier:
data_types_file_path: ${base_example_dir}/data_configs/data_types.json
model_type: "xgb"
# Model training parameters
num_optuna_trials: 10 # Original code: 100
num_optuna_trials: 100 # Original code: 100
num_kfolds: 5
use_gpu: false
# Temporary. The epoch parameter might be removed later.
epochs: 1
meta_classifier_model_name: ${metaclassifier.model_type}_metaclassifier_model


# General settings
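
To make the metaclassifier settings above concrete (model_type "xgb", num_optuna_trials, num_kfolds), here is a hedged sketch of an Optuna-driven XGBoost search scored by k-fold cross-validation. The function name, search space, and metric are illustrative assumptions; the repo's actual training code may differ.

# Illustrative sketch only: tune an XGBoost metaclassifier with Optuna,
# scoring each trial by k-fold cross-validated AUC. Names, search space,
# and metric are assumptions, not the repo's implementation.
import numpy as np
import optuna
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

def tune_metaclassifier(
    features: np.ndarray,
    labels: np.ndarray,
    num_optuna_trials: int = 100,
    num_kfolds: int = 5,
) -> XGBClassifier:
    def objective(trial: optuna.Trial) -> float:
        params = {
            "max_depth": trial.suggest_int("max_depth", 2, 10),
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
            "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        }
        model = XGBClassifier(**params, eval_metric="logloss")
        # Cross-validated AUC, matching num_kfolds in the config.
        return cross_val_score(model, features, labels, cv=num_kfolds, scoring="roc_auc").mean()

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=num_optuna_trials)
    # Refit the best configuration on all of the training data.
    best_model = XGBClassifier(**study.best_params, eval_metric="logloss")
    best_model.fit(features, labels)
    return best_model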
40 changes: 24 additions & 16 deletions examples/ensemble_attack/data_configs/trans.json
@@ -8,42 +8,50 @@
},
"clustering": {
"parent_scale": 1.0,
"num_clusters": 4,
"num_clusters": 50,
"clustering_method": "kmeans_and_gmm"
},
"diffusion": {
"d_layers": [
4,
8
512,
1024,
1024,
1024,
1024,
512
],
"dropout": 0.1,
"num_timesteps": 3,
"dropout": 0.0,
"num_timesteps": 2000,
"model_type": "mlp",
"iterations": 3,
"batch_size": 1,
"iterations": 200000,
"batch_size": 4096,
"lr": 0.0006,
"gaussian_loss_type": "mse",
"weight_decay": 1e-05,
"scheduler": "cosine",
"data_split_ratios": [0.5, 0.25, 0.25]
"scheduler": "cosine"
},
"classifier": {
"d_layers": [
4,
4
128,
256,
512,
1024,
512,
256,
128
],
"lr": 0.0001,
"dim_t": 4,
"batch_size": 1,
"iterations": 2
"dim_t": 128,
"batch_size": 4096,
"iterations": 20000
},
"sampling": {
"batch_size": 2,
"batch_size": 20000,
"classifier_scale": 1.0
},
"matching": {
"num_matching_clusters": 1,
"matching_batch_size": 1,
"matching_batch_size": 1000,
"unique_matching": true,
"no_matching": false
}
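
For readers unfamiliar with this config format: d_layers lists the hidden-layer widths of the MLP denoiser and dropout its dropout rate, so this change appears to swap the tiny smoke-test network (layers [4, 8], 3 iterations, batch size 1) for a realistically sized one. The sketch below only illustrates what those two fields parameterize; the real denoiser also conditions on the diffusion timestep, and d_in/d_out are placeholder dimensions.

# Illustrative only (assumed): a plain MLP built from the "diffusion" section's
# d_layers and dropout. The actual model conditions on the timestep as well.
import torch.nn as nn

def build_mlp(d_in: int, d_layers: list[int], d_out: int, dropout: float) -> nn.Sequential:
    dims = [d_in, *d_layers]
    blocks: list[nn.Module] = []
    for width_in, width_out in zip(dims[:-1], dims[1:]):
        blocks += [nn.Linear(width_in, width_out), nn.ReLU(), nn.Dropout(dropout)]
    blocks.append(nn.Linear(dims[-1], d_out))  # project back to the data dimension
    return nn.Sequential(*blocks)

# The PR's new hidden sizes, with placeholder input/output dimensions:
denoiser = build_mlp(d_in=64, d_layers=[512, 1024, 1024, 1024, 1024, 512], d_out=64, dropout=0.0)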
37 changes: 30 additions & 7 deletions examples/ensemble_attack/real_data_collection.py
@@ -21,6 +21,12 @@ class AttackType(Enum):
TABSYN_WHITE_BOX = "tabsyn_white_box"
CLAVADDPM_BLACK_BOX = "clavaddpm_black_box"
CLAVADDPM_WHITE_BOX = "clavaddpm_white_box"
# Experiment attack types based on experiment settings
TABDDPM_5K = "tabddpm_trained_with_5k"
TABDDPM_10K = "tabddpm_trained_with_10k"
TABDDPM_20K = "tabddpm_trained_with_20k"
TABDDPM_50K = "tabddpm_trained_with_50k"
TABDDPM_100K = "tabddpm_trained_with_100k"


def expand_ranges(ranges: list[tuple[int, int]]) -> list[int]:
@@ -136,6 +142,8 @@ def collect_population_data_ensemble(
midst_data_input_dir: Path,
data_processing_config: DictConfig,
save_dir: Path,
population_splits: list[str] | None = None,
challenge_splits: list[str] | None = None,
) -> pd.DataFrame:
"""
Collect the population data from the MIDST competition based on Ensemble Attack implementation.
@@ -148,19 +156,34 @@
midst_data_input_dir: The path where the MIDST data folders are stored.
data_processing_config: Configuration dictionary containing data information and file names.
save_dir: The path where the collected population data should be saved.
population_splits: A list indicating the data splits to be collected for population data.
Could be any of `train`, `dev`, or `final` data splits. If None, the default list of ``["train"]``
is set in the function based on the original attack implementation.
challenge_splits: A list indicating the data splits to be collected for challenge points.
Could be any of `train`, `dev`, or `final` data splits. If None, the default list of
``["train", "dev", "final"]`` is set in the function based on the original attack implementation.

Returns:
The collected population data as a dataframe.
"""
# Population data will be saved under ``save_dir``.
save_dir.mkdir(parents=True, exist_ok=True)

if population_splits is None:
population_splits = ["train"]
if challenge_splits is None:
# The original Ensemble attack collects all the challenge points from the train, dev, and final splits of the "tabddpm_black_box" attack.
challenge_splits = ["train", "dev", "final"]

# The Ensemble attack collects the train data of all attack types (black box and white box)
attack_names = data_processing_config.collect_attack_data_types
attack_names = data_processing_config.population_attack_data_types_to_collect
# Provided attack names are validated against the AttackType enum
attack_types: list[AttackType] = [AttackType(attack_name) for attack_name in attack_names]
population_attack_types: list[AttackType] = [AttackType(attack_name) for attack_name in attack_names]

df_population = collect_midst_data(
midst_data_input_dir,
attack_types,
data_splits=["train"],
population_attack_types,
data_splits=population_splits,
dataset="train",
data_processing_config=data_processing_config,
)
@@ -170,12 +193,12 @@
save_dataframe(df_population, save_dir, "population_all.csv")
save_dataframe(df_population_no_id, save_dir, "population_all_no_id.csv")

# Collect all the challenge points from train, dev and final of "tabddpm_black_box" attack.
challenge_attack_types = [AttackType.TABDDPM_BLACK_BOX]
challenge_attack_names = data_processing_config.challenge_attack_data_types_to_collect
challenge_attack_types = [AttackType(attack_name) for attack_name in challenge_attack_names]
df_challenge = collect_midst_data(
midst_data_input_dir,
attack_types=challenge_attack_types,
data_splits=["train", "dev", "final"],
data_splits=challenge_splits,
dataset="challenge",
data_processing_config=data_processing_config,
)
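
A possible call site for the updated signature, assuming the import path matches this file's location in the repo, with the splits passed explicitly to match the documented None defaults; the config keys come from the experiment config added in this PR.

# Example call (assumed): collect population data from the train split only and
# challenge points from all three splits, matching the documented None defaults.
from pathlib import Path

from omegaconf import OmegaConf

# Import path assumed from this file's location in the repo.
from examples.ensemble_attack.real_data_collection import collect_population_data_ensemble

config = OmegaConf.load("examples/ensemble_attack/configs/experiment_config.yaml")
df_population = collect_population_data_ensemble(
    midst_data_input_dir=Path(config.data_paths.midst_data_path),
    data_processing_config=config.data_processing_config,
    save_dir=Path(config.data_paths.population_path),
    population_splits=["train"],
    challenge_splits=["train", "dev", "final"],
)
print(df_population.shape)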
13 changes: 0 additions & 13 deletions examples/ensemble_attack/run.sh

This file was deleted.
