Ft/ensemble changes #115
Changes from 43 commits
@@ -1,34 +1,35 @@
 # Ensemble experiment configuration
 # This config can be used to run both the Ensemble attack training (``run_attack.py``) and testing phases (``test_attack_model.py``).
-base_experiment_dir: examples/ensemble_attack/tabddpm_20k_experiment_data # Processed data, and experiment artifacts will be stored here
-base_data_config_dir: examples/ensemble_attack/data_configs # Training and data type configs are saved under this directory
+base_experiment_dir: /projects/midst-experiments/ensemble_attack/tabddpm_10k_experiment_data/10k/ # Processed data, and experiment artifacts will be stored under this directory.
+base_data_config_dir: examples/ensemble_attack/data_configs # Training and data type configs are saved under this directory.

-# Pipeline control
+# Training Pipeline Control
 pipeline:
   run_data_processing: true # Set this to false if you have already saved the processed data
   run_shadow_model_training: true # Set this to false if shadow models are already trained and saved
   run_metaclassifier_training: true

 target_model: # This is only used for testing the attack on a real target model.
-  # This is for models trained on 20k data and generating 20k synthetic data
-  target_model_directory: /projects/midst-experiments/all_tabddpms/tabddpm_trained_with_20k/train/
+  target_model_directory: /projects/midst-experiments/all_tabddpms/tabddpm_trained_with_10k/test/
   target_model_id: 21 # Will be overridden per SLURM array task
   target_model_name: tabddpm_${target_model.target_model_id}
-  target_synthetic_data_path: ${target_model.target_model_directory}/${target_model.target_model_name}/synthetic_data/20k/20k.csv
+  target_synthetic_data_path: ${target_model.target_model_directory}/${target_model.target_model_name}/synthetic_data/10k/10k.csv
   challenge_data_path: ${target_model.target_model_directory}/${target_model.target_model_name}/challenge_with_id.csv
   challenge_label_path: ${target_model.target_model_directory}/${target_model.target_model_name}/challenge_label.csv
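The `# Will be overridden per SLURM array task` note on `target_model_id` suggests the launcher derives the ID from the array index. A minimal sketch of that override pattern (the function name and fallback are hypothetical, not code from this PR):

```python
import os

def resolve_target_model_id(default_id: int = 21) -> int:
    """Use the SLURM array task index as the target model ID when running as an array job."""
    task_id = os.environ.get("SLURM_ARRAY_TASK_ID")
    return int(task_id) if task_id is not None else default_id
```

Launching with something like `sbatch --array=21-40` would then give each task its own target model.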
-  target_attack_artifact_dir: ${base_experiment_dir}/target_${target_model.target_model_id}_attack_artifacts/

Collaborator (Author): This directory was extra and can be removed.

-  attack_probabilities_result_path: ${target_model.target_attack_artifact_dir}/attack_model_${target_model.target_model_id}_proba
-  target_shadow_models_output_path: ${target_model.target_attack_artifact_dir}/tabddpm_${target_model.target_model_id}_shadows_dir
+  target_shadow_models_output_path: ${base_experiment_dir}/test_all_targets # Sub-directory to store test shadows and results
+  attack_probabilities_result_path: ${target_model.target_shadow_models_output_path}/test_probabilities/attack_model_${target_model.target_model_id}_proba
+  attack_rmia_shadow_training_data_choice: "combined" # Options: "combined", "only_challenge", "only_train". This determines which data to use for training the RMIA attack model in the testing phase.
Collaborator (Author): This is a new config variable. You can read more about the options in
Collaborator: Maybe include something like
 # Data paths
 data_paths:
-  midst_data_path: /projects/midst-experiments/all_tabddpms # Used to collect the data
-  population_path: ${base_experiment_dir}/population_data # Path where the collected population data will be stored
-  processed_attack_data_path: ${base_experiment_dir}/attack_data # Path where the processed attack real train and evaluation data is stored
-  attack_evaluation_result_path: ${base_experiment_dir}/evaluation_results # Path where the attack evaluation results will be stored
+  midst_data_path: /projects/midst-experiments/all_tabddpms/ # Used to collect the data (input) as defined in data_processing_config
+  processed_base_data_dir: ${base_experiment_dir} # To save new processed data for training, or read from previously collected and processed data (testing phase).
+  population_path: ${data_paths.processed_base_data_dir}/population_data # Path where the collected population data will be stored (output/input)
+  processed_attack_data_path: ${data_paths.processed_base_data_dir}/attack_data # Path where the processed attack real train and evaluation data is stored (output/input)
+  attack_evaluation_result_path: ${base_experiment_dir}/evaluation_results # Path where the attack (train phase) evaluation results will be stored (output)

 model_paths:
   metaclassifier_model_path: ${base_experiment_dir}/trained_models # Path where the trained metaclassifier model will be saved

@@ -38,23 +39,25 @@ model_paths:
 data_processing_config:
   population_attack_data_types_to_collect:
     [
-      "tabddpm_trained_with_20k",
+      "tabddpm_trained_with_10k",
     ]
   challenge_attack_data_types_to_collect:
     [
-      "tabddpm_trained_with_20k",
+      "tabddpm_trained_with_10k",
     ]
   population_splits: ["train"] # Data splits to be collected for population data
-  challenge_splits: ["train"] # Data splits to be collected for challenge points
+  challenge_splits: ["train" , "test"] # Data splits to be collected for challenge points
+  original_population_data_path: /projects/midst-experiments/ensemble_attack/competition/population_data/ #Attack's collected population for DOMIAS

Collaborator: Super minor, but space after the
Collaborator: Is this where the population will be written, read from, or both?
Collaborator (Author): Added to the comment that it is where the population will be read from.
   # The column name in the data to be used for stratified splitting.
   column_to_stratify: "trans_type" # Attention: This value is not documented in the original codebase.
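`column_to_stratify` drives a stratified split on `trans_type`. As an illustration of what stratifying on a column means, a pandas-only sketch (not the toolkit's actual splitting code):

```python
import pandas as pd

def stratified_halves(df: pd.DataFrame, column: str, seed: int = 42) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Split df into two halves while preserving the per-category proportions of `column`."""
    # Sample half of each category, then take the complement as the second half.
    first = df.groupby(column, group_keys=False).sample(frac=0.5, random_state=seed)
    second = df.drop(first.index)
    return first, second
```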
-  folder_ranges: #Specify folder ranges for any of the mentioned splits.
-    train: [[1, 20]] # Folders to be used for train data collection in the experiments
+  folder_ranges: # Specify folder ranges for any of the mentioned splits.
+    train: [[1, 21]] # Folders to be used for train data collection in the experiments
+    test: [[21, 31] , [31, 41]]
   # File names in MIDST data directories.
   single_table_train_data_file_name: "train_with_id.csv"
   multi_table_train_data_file_name: "trans.csv"
   challenge_data_file_name: "challenge_with_id.csv"
-  population_sample_size: 40000 # Population size is the total data that your attack has access to.
+  population_sample_size: 20000 # Population size is the total data that your attack has access to.
   # In experiments, this is sampled out of all the collected training data in case the available data
   # is more than this number. Note that, half of this data is actually used for training, the other half
   # is used for evaluation. For example, with 40k population size, only 20k is used for training the attack model.

@@ -86,7 +89,7 @@ shadow_training:
   fine_tune_diffusion_iterations: 200000 # Original code: 200000
   fine_tune_classifier_iterations: 20000 # Original code: 20000
   pre_train_data_size: 60000 # Original code: 60000
-  number_of_points_to_synthesize: 20000 # Number of synthetic data samples to be generated by shadow models.
+  number_of_points_to_synthesize: 10000 # Number of synthetic data samples to be generated by shadow models.
   # Original code: 20000

@@ -104,7 +107,7 @@ metaclassifier:
   meta_classifier_model_name: ${metaclassifier.model_type}_metaclassifier_model

 attack_success_computation:
-  target_ids_to_test: [21,22,23] # List of target model IDs to compute the attack success for.
+  target_ids_to_test: [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40] # List of target model IDs to compute the attack success for.
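Since the new `target_ids_to_test` is just a run of consecutive IDs, generating it programmatically avoids hand-editing slips such as a duplicated entry; for example:

```python
# 20 consecutive target model IDs: 21 through 40 (range excludes the stop value).
target_ids_to_test = list(range(21, 41))
```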
 # General settings
 random_seed: 42 # Set to null for no seed, or an integer for a fixed seed

@@ -4,12 +4,14 @@
 """

 from enum import Enum
+from logging import INFO
 from pathlib import Path

 import pandas as pd
 from omegaconf import DictConfig

 from midst_toolkit.attacks.ensemble.data_utils import load_dataframe, save_dataframe
+from midst_toolkit.common.logger import log


 class AttackType(Enum):

@@ -59,18 +61,15 @@ def collect_midst_attack_data(
     Args:
         attack_type: The attack setting.
         data_dir: The path where the data is stored.
-        data_split: Indicates if this is train, dev, or final data.
+        data_split: Indicates if this is train, dev, or final data. Note that, this is in fact

Collaborator: So this should be the name of a folder rather than something like "train", "val," or "test"? If it should still have the names above then I would state that specifically. If it can be anything, then maybe

+            the name of the folder that contains model folders for data collection. For example,
+            f"{generation_name}_{i}" should be located under ``data_split`` folder.
         dataset: The dataset to be collected. Either "train" or "challenge".
         data_processing_config: Configuration dictionary containing data specific information.

     Returns:
         pd.DataFrame: The specified dataset in this setting.
     """
-    assert data_split in [
-        "train",
-        "dev",
-        "final",
-    ], "data_split should be one of 'train', 'dev', or 'final'."
     # `data_id` is the folder numbering of each training or challenge dataset,
     # and is defined with the provided config.
     data_id = expand_ranges(data_processing_config.folder_ranges[data_split])
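`expand_ranges` itself is not shown in this diff. Judging from config values such as `train: [[1, 21]]` and `test: [[21, 31], [31, 41]]`, a plausible sketch (assuming half-open `[start, end)` ranges; the real implementation may differ):

```python
def expand_ranges(ranges: list[list[int]]) -> list[int]:
    """Flatten [start, end) pairs into a sorted, de-duplicated list of folder IDs."""
    ids: set[int] = set()
    for start, end in ranges:
        ids.update(range(start, end))
    return sorted(ids)
```

Under this reading, `[[21, 31], [31, 41]]` covers folders 21 through 40, matching the 20 target IDs tested below.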
@@ -80,7 +79,7 @@ def collect_midst_attack_data(
     generation_name = attack_type.value.split("_")[0]
     if dataset == "challenge":
         file_name = data_processing_config.challenge_data_file_name
-    else:  # dataset == "train"
+    else:
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If dataset has a limit set of possible strings, we probably want to use an enum or literal for it. It would also be good to throw an error if it ends up being something it shouldn't be. |
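A sketch of the reviewer's suggestion above, with a `Literal` type and an explicit error for unexpected values (illustrative only; the function and parameter names are hypothetical, not the PR's code):

```python
from typing import Literal

Dataset = Literal["train", "challenge"]

def select_file_name(dataset: Dataset, challenge_file: str, train_file: str) -> str:
    """Pick the data file for the requested dataset, failing loudly on anything else."""
    if dataset == "challenge":
        return challenge_file
    if dataset == "train":
        return train_file
    raise ValueError(f"dataset must be 'train' or 'challenge', got {dataset!r}")
```

The `Literal` gives static checkers something to verify, while the `ValueError` catches bad values at runtime.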
         # Multi-table attacks have different file names.
         file_name = (
             data_processing_config.multi_table_train_data_file_name

@@ -108,13 +107,18 @@ def collect_midst_data(
 ) -> pd.DataFrame:
     """
     Collect train or challenge data of the specified attack type from the provided data folders
-    in the MIDST competition.
+    in the MIDST competition. The data is going to be collected from all the folders specified
+    in ``data_splits`` argument under each attack type folder. For example, if ``data_splits``
+    contains `train` and `dev`, the function collects data from both `train` and `dev` folders
+    under each attack type folder. For more information about the data collection structure, see
+    the implementation of ``collect_midst_attack_data`` function.

     Args:
         midst_data_input_dir: The path where the MIDST data folders are stored.
         attack_types: List of attack types for data collection.
-        data_splits: A list indicating the data split to be collected.
-            Could be any of train, dev, or final data splits.
+        data_splits: A list indicating the data split to be collected. This is a list of folder names
+            under each attack type folder where we collect model's data from. For example, it could
+            contain strings like `train`, `dev`, `final`, or `test` based on the directory structure.
         dataset: The dataset to be collected. Either `train` or `challenge`.
         data_processing_config: Configuration dictionary containing data paths and file names.

@@ -133,7 +137,7 @@
             data_processing_config=data_processing_config,
         )

-        population.append(df_real)
+            population.append(df_real)

Collaborator (Author): This was a bug! Thank you for catching this, Sara!
     return pd.concat(population).drop_duplicates()
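The trailing `drop_duplicates()` matters here because the same real rows can be collected from multiple attack-type folders; a small illustration of the dedup behavior:

```python
import pandas as pd

# Two collected frames that share one identical row.
a = pd.DataFrame({"trans_id": [1, 2], "amount": [10.0, 20.0]})
b = pd.DataFrame({"trans_id": [2, 3], "amount": [20.0, 30.0]})

# Concatenation keeps 4 rows; drop_duplicates removes the fully identical one.
combined = pd.concat([a, b]).drop_duplicates()
```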
@@ -142,51 +146,72 @@ def collect_population_data_ensemble(
     midst_data_input_dir: Path,
     data_processing_config: DictConfig,
     save_dir: Path,
+    original_repo_population: pd.DataFrame,
     population_splits: list[str] | None = None,
     challenge_splits: list[str] | None = None,
 ) -> pd.DataFrame:
     """
     Collect the population data from the MIDST competition based on Ensemble Attack implementation.
     Returns real data population that consists of the train data of all the attacks
-    (black box and white box), and challenge points from `train`, `dev` and `final` of
-    "tabddpm_black_box" attack. The population data is saved in the provided path,
-    and returned as a dataframe.
+    (black box and white box) as specified in ``data_processing_config.population_attack_data_types_to_collect``,
+    and challenge points from `train`, `dev` and `final` of attacks as specified by
+    ``data_processing_config.challenge_attack_data_types_to_collect``. The collected population data is concatenated
+    with ``original_repo_population`` to be large enough for the attack (especially DOMIAS), then saved in
+    the provided path and returned as a dataframe.

     Args:
         midst_data_input_dir: The path where the MIDST data folders are stored.
         data_processing_config: Configuration dictionary containing data information and file names.
         save_dir: The path where the collected population data should be saved.
+        original_repo_population: The original population data collected from the MIDST challenge repository.
Collaborator: I may be missing something, but aren't we collecting data in this function? That is, what do we mean by original here?
Collaborator: Just to clarify: is this from the MIDST challenge repository or the ensemble attack repository?
Collaborator (Author): This points to the 800k dataset that was collected by the original attack implementation based on the available folders in the MIDST challenge repository.
Collaborator: Yup :)
         population_splits: A list indicating the data splits to be collected for population data.
-            Could be any of `train`, `dev`, or `final` data splits. If None, the default list of ``["train"]``
-            is set in the function based on the original attack implementation.
+            This is a list of strings containing the folder names under attack folders that are
+            considered for population collection. If None, the default list of ``["train"]`` is set in the
+            function based on the original attack implementation.
         challenge_splits: A list indicating the data splits to be collected for challenge points.
-            Could be any of `train`, `dev`, or `final` data splits. If None, the default list of
-            ``["train", "dev", "final"]`` is set in the function based on the original attack implementation.
+            This is a list of strings containing the folder names under attack folders that are
+            considered for challenge data collection. If None, the default list of ``["train", "dev", "final"]``
+            is set in the function based on the original attack implementation.

     Returns:
         The collected population data as a dataframe.
     """
     # Population data will be saved under ``save_dir``.
     save_dir.mkdir(parents=True, exist_ok=True)

     if population_splits is None:
         population_splits = ["train"]
     if challenge_splits is None:
         # Original Ensemble collects all the challenge points from train, dev and final of "tabddpm_black_box" attack.
         challenge_splits = ["train", "dev", "final"]
     # Ensemble Attack collects train data of all the attack types (black box and white box)
-    attack_names = data_processing_config.population_attack_data_types_to_collect
+    population_attack_names = data_processing_config.population_attack_data_types_to_collect
     # Provided attack names are valid based on AttackType enum
-    population_attack_types: list[AttackType] = [AttackType(attack_name) for attack_name in attack_names]
+    population_attack_types: list[AttackType] = [AttackType(attack_name) for attack_name in population_attack_names]

Collaborator: I think, but am not certain, that you can drop the explicit typing of this variable here?
-    df_population = collect_midst_data(
+    df_population_experiment = collect_midst_data(
         midst_data_input_dir,
         population_attack_types,
         data_splits=population_splits,
         dataset="train",
         data_processing_config=data_processing_config,
     )

+    log(INFO, f"Collected experiment population data length before concatenation: {len(df_population_experiment)}")
+
+    df_population = pd.concat([df_population_experiment, original_repo_population]).drop_duplicates()
+    log(INFO, f"Concatenated population data length: {len(df_population)}")

     # Drop ids.
     df_population_no_id = df_population.drop(columns=["trans_id", "account_id"])
     # Save the population data
@@ -195,13 +220,15 @@ def collect_population_data_ensemble(
     challenge_attack_names = data_processing_config.challenge_attack_data_types_to_collect
     challenge_attack_types = [AttackType(attack_name) for attack_name in challenge_attack_names]

     df_challenge = collect_midst_data(
         midst_data_input_dir,
         attack_types=challenge_attack_types,
         data_splits=challenge_splits,
         dataset="challenge",
         data_processing_config=data_processing_config,
     )
+    log(INFO, f"Collected challenge data length: {len(df_challenge)} from splits: {challenge_splits}")
     # Save the challenge points
     save_dataframe(df_challenge, save_dir, "challenge_points_all.csv")
@@ -11,6 +11,7 @@
 from omegaconf import DictConfig

 from examples.ensemble_attack.real_data_collection import collect_population_data_ensemble
+from midst_toolkit.attacks.ensemble.data_utils import load_dataframe
 from midst_toolkit.attacks.ensemble.process_split_data import process_split_data
 from midst_toolkit.common.logger import log
 from midst_toolkit.common.random import set_all_random_seeds

@@ -23,15 +24,25 @@ def run_data_processing(config: DictConfig) -> None:
     Args:
         config: Configuration object set in config.yaml.
     """
+    # Load original repo's population to be concatenated to the experiment's population data.
+    # This is done to align the experiments with the original attack code because
+    # this attack needs a large population dataset, and only using the experiment's collected population
+    # is not enough.

Collaborator: Maybe we can be specific here about what we mean by original repo's population?

+    original_population_data = load_dataframe(
+        Path(config.data_processing_config.original_population_data_path),
+        "population_all_with_challenge.csv",
+    )
     log(INFO, "Running data processing pipeline...")
     # Collect the real data from the MIDST challenge resources.
     population_data = collect_population_data_ensemble(
         midst_data_input_dir=Path(config.data_paths.midst_data_path),
         data_processing_config=config.data_processing_config,
         save_dir=Path(config.data_paths.population_path),
+        original_repo_population=original_population_data,
         population_splits=config.data_processing_config.population_splits,
         challenge_splits=config.data_processing_config.challenge_splits,
     )

     # The following function saves the required dataframe splits in the specified processed_attack_data_path path.
     process_split_data(
         all_population_data=population_data,

@@ -67,7 +78,11 @@ def main(config: DictConfig) -> None:
     # TODO: Investigate the source of error.

Collaborator: Is this TODO still a TODO?

     if config.pipeline.run_shadow_model_training:
         shadow_pipeline = importlib.import_module("examples.ensemble_attack.run_shadow_model_training")
-        shadow_data_paths = shadow_pipeline.run_shadow_model_training(config)
+        df_master_challenge_train = load_dataframe(
+            Path(config.data_paths.processed_attack_data_path),
+            "master_challenge_train.csv",
+        )
+        shadow_data_paths = shadow_pipeline.run_shadow_model_training(config, df_master_challenge_train)
         shadow_data_paths = [Path(path) for path in shadow_data_paths]

         target_model_synthetic_path = shadow_pipeline.run_target_model_training(config)
Collaborator: The current single config is hard to understand because it mixes many variables and data paths with unclear names inherited from the original attack code. Splitting it into multiple pipeline-specific configs would improve clarity and maintainability, even if it adds some overhead. Alternatively, improving variable naming within one config could be helpful.

Collaborator: Perhaps (if you haven't already) you could create a ClickUp ticket with this in there as a next step?

Collaborator: I think doing both (splitting and better naming) would be a worthwhile endeavor.