VectorInstitute · fatemetkl · Jan 30, 2026 · Nov 12, 2025 · Nov 18, 2025 · Nov 18, 2025
diff --git a/examples/ensemble_attack/configs/experiment_config.yaml b/examples/ensemble_attack/configs/experiment_config.yaml
@@ -1,34 +1,36 @@
 # Ensemble experiment configuration
 # This config can be used to run both the Ensemble attack training (``run_attack.py``) and testing phases (``test_attack_model.py``).
-base_experiment_dir: examples/ensemble_attack/tabddpm_20k_experiment_data # Processed data, and experiment artifacts will be stored here
-base_data_config_dir: examples/ensemble_attack/data_configs # Training and data type configs are saved under this directory
+base_experiment_dir: /projects/midst-experiments/ensemble_attack/tabddpm_10k_experiment_data/10k/ # Processed data, and experiment artifacts will be stored under this directory.
+base_data_config_dir: examples/ensemble_attack/data_configs # Training and data type configs are saved under this directory.
 
-# Pipeline control
+# Training Pipeline Control
 pipeline:
   run_data_processing: true # Set this to false if you have already saved the processed data
   run_shadow_model_training: true # Set this to false if shadow models are already trained and saved
   run_metaclassifier_training: true
 
 target_model: # This is only used for testing the attack on a real target model.
-  # This is for models trained on 20k data and generating 20k synthetic data
-  target_model_directory: /projects/midst-experiments/all_tabddpms/tabddpm_trained_with_20k/train/
+  target_model_directory: /projects/midst-experiments/all_tabddpms/tabddpm_trained_with_10k/test/
   target_model_id: 21  # Will be overridden per SLURM array task
   target_model_name: tabddpm_${target_model.target_model_id}
-  target_synthetic_data_path: ${target_model.target_model_directory}/${target_model.target_model_name}/synthetic_data/20k/20k.csv
+  target_synthetic_data_path: ${target_model.target_model_directory}/${target_model.target_model_name}/synthetic_data/10k/10k.csv
   challenge_data_path: ${target_model.target_model_directory}/${target_model.target_model_name}/challenge_with_id.csv
   challenge_label_path: ${target_model.target_model_directory}/${target_model.target_model_name}/challenge_label.csv
 
-  target_attack_artifact_dir: ${base_experiment_dir}/target_${target_model.target_model_id}_attack_artifacts/
-  attack_probabilities_result_path: ${target_model.target_attack_artifact_dir}/attack_model_${target_model.target_model_id}_proba
-  target_shadow_models_output_path: ${target_model.target_attack_artifact_dir}/tabddpm_${target_model.target_model_id}_shadows_dir
+  target_shadow_models_output_path: ${base_experiment_dir}/test_all_targets # Sub-directory to store test shadows and results
+  attack_probabilities_result_path: ${target_model.target_shadow_models_output_path}/test_probabilities/attack_model_${target_model.target_model_id}_proba
+  attack_rmia_shadow_training_data_choice: "combined" # Options: "combined", "only_challenge", "only_train". This determines which data to use for training RMIA attack model in testing phase.
+  # 
 
 
 # Data paths
 data_paths:
-  midst_data_path: /projects/midst-experiments/all_tabddpms # Used to collect the data
-  population_path: ${base_experiment_dir}/population_data  # Path where the collected population data will be stored
-  processed_attack_data_path: ${base_experiment_dir}/attack_data # Path where the processed attack real train and evaluation data is stored
-  attack_evaluation_result_path: ${base_experiment_dir}/evaluation_results # Path where the attack evaluation results will be stored
+  midst_data_path: /projects/midst-experiments/all_tabddpms/ # Used to collect the data (input) as defined in data_processing_config
+  processed_base_data_dir: ${base_experiment_dir} # To save new processed data for training, or read from previously collected and processed data (testing phase).
+  population_path: ${data_paths.processed_base_data_dir}/population_data  # Path where the collected population data will be stored (output/input)
+  processed_attack_data_path: ${data_paths.processed_base_data_dir}/attack_data # Path where the processed attack real train and evaluation data is stored (output/input)
+  attack_evaluation_result_path: ${base_experiment_dir}/evaluation_results # Path where the attack (train phase) evaluation results will be stored (output)
+
 
 model_paths:
   metaclassifier_model_path: ${base_experiment_dir}/trained_models # Path where the trained metaclassifier model will be saved
@@ -38,23 +40,25 @@ model_paths:
 data_processing_config:
   population_attack_data_types_to_collect:
         [
-          "tabddpm_trained_with_20k",
+          "tabddpm_trained_with_10k",
         ]
   challenge_attack_data_types_to_collect:
         [
-          "tabddpm_trained_with_20k",
+          "tabddpm_trained_with_10k",
         ]
   population_splits: ["train"]  # Data splits to be collected for population data
-  challenge_splits: ["train"]  # Data splits to be collected for challenge points
+  challenge_splits: ["train" , "test"]  # Data splits to be collected for challenge points
+  original_population_data_path: /projects/midst-experiments/ensemble_attack/competition/population_data/ #Attack's collected population for DOMIAS
   # The column name in the data to be used for stratified splitting.
   column_to_stratify: "trans_type"  # Attention: This value is not documented in the original codebase.
   folder_ranges: #Specify folder ranges for any of the mentioned splits.
-    train: [[1, 20]] # Folders to be used for train data collection in the experiments
+    train: [[1, 21]] # Folders to be used for train data collection in the experiments
+    test: [[21, 31] , [31, 41]]
   # File names in MIDST data directories.
   single_table_train_data_file_name: "train_with_id.csv"
   multi_table_train_data_file_name: "trans.csv"
   challenge_data_file_name: "challenge_with_id.csv"
-  population_sample_size: 40000 # Population size is the total data that your attack has access to.
+  population_sample_size: 20000 # Population size is the total data that your attack has access to.
   # In experiments, this is sampled out of all the collected training data in case the available data
   # is more than this number. Note that, half of this data is actually used for training, the other half
   # is used for evaluation. For example, with 40k population size, only 20k is used for training the attack model.
@@ -86,7 +90,7 @@ shadow_training:
     fine_tune_diffusion_iterations: 200000 # Original code: 200000
     fine_tune_classifier_iterations: 20000 # Original code: 20000
     pre_train_data_size: 60000 # Original code: 60000
-  number_of_points_to_synthesize: 20000 # Number of synthetic data samples to be generated by shadow models.
+  number_of_points_to_synthesize: 10000 # Number of synthetic data samples to be generated by shadow models.
   # Original code: 20000
 
 
@@ -104,7 +108,7 @@ metaclassifier:
   meta_classifier_model_name: ${metaclassifier.model_type}_metaclassifier_model
 
 attack_success_computation:
-  target_ids_to_test: [21,22,23] # List of target model IDs to compute the attack success for.
+  target_ids_to_test: [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40] # List of target model IDs to compute the attack success for (contiguous range 21-40, each ID listed once).
 
 # General settings
 random_seed: 42 # Set to null for no seed, or an integer for a fixed seed
diff --git a/examples/ensemble_attack/real_data_collection.py b/examples/ensemble_attack/real_data_collection.py
@@ -4,12 +4,14 @@
 """
 
 from enum import Enum
+from logging import INFO
 from pathlib import Path
 
 import pandas as pd
 from omegaconf import DictConfig
 
 from midst_toolkit.attacks.ensemble.data_utils import load_dataframe, save_dataframe
+from midst_toolkit.common.logger import log
 
 
 class AttackType(Enum):
@@ -66,11 +68,11 @@ def collect_midst_attack_data(
     Returns:
         pd.DataFrame: The specified dataset in this setting.
     """
-    assert data_split in [
-        "train",
-        "dev",
-        "final",
-    ], "data_split should be one of 'train', 'dev', or 'final'."
+    # assert data_split in [
+    #     "train",
+    #     "dev",
+    #     "final",
+    # ], "data_split should be one of 'train', 'dev', or 'final'."
     # `data_id` is the folder numbering of each training or challenge dataset,
     #  and is defined with the provided config.
     data_id = expand_ranges(data_processing_config.folder_ranges[data_split])
@@ -133,7 +135,7 @@ def collect_midst_data(
                 data_processing_config=data_processing_config,
             )
 
-        population.append(df_real)
+            population.append(df_real)
 
     return pd.concat(population).drop_duplicates()
 
@@ -142,20 +144,30 @@ def collect_population_data_ensemble(
     midst_data_input_dir: Path,
     data_processing_config: DictConfig,
     save_dir: Path,
+    original_repo_population: pd.DataFrame,
     population_splits: list[str] | None = None,
     challenge_splits: list[str] | None = None,
 ) -> pd.DataFrame:
     """
     Collect the population data from the MIDST competition based on Ensemble Attack implementation.
     Returns real data population that consists of the train data of all the attacks
-    (black box and white box), and challenge points from `train`, `dev` and `final` of
-    "tabddpm_black_box" attack. The population data is saved in the provided path,
-    and returned as a dataframe.
+    (black box and white box) as specified in ``data_processing_config.population_attack_data_types_to_collect``,
+    and challenge points from `train`, `dev` and `final` of attacks as specified by
+    ``data_processing_config.challenge_attack_data_types_to_collect``. The collected population data is concatenated
+    with ``original_repo_population`` so that it is large enough for the attack (especially DOMIAS); the result is
+    then saved in the provided path and returned as a dataframe.
 
     Args:
         midst_data_input_dir: The path where the MIDST data folders are stored.
         data_processing_config: Configuration dictionary containing data information and file names.
         save_dir: The path where the collected population data should be saved.
+        original_repo_population: The original population data collected from the MIDST challenge repository.
+        population_splits: A list indicating the data splits to be collected for population data.
+            Could be any of `train`, `dev`, or `final` data splits. If None, the default list of ``["train"]``
+            is set in the function based on the original attack implementation.
+        challenge_splits: A list indicating the data splits to be collected for challenge points.
+            Could be any of `train`, `dev`, or `final` data splits. If None, the default list of
+            ``["train", "dev", "final"]`` is set in the function based on the original attack implementation.
         population_splits: A list indicating the data splits to be collected for population data.
             Could be any of `train`, `dev`, or `final` data splits. If None, the default list of ``["train"]``
             is set in the function based on the original attack implementation.
@@ -169,6 +181,15 @@ def collect_population_data_ensemble(
     # Population data will be saved under ``save_dir``.
     save_dir.mkdir(parents=True, exist_ok=True)
 
+    if population_splits is None:
+        population_splits = ["train"]
+    if challenge_splits is None:
+        # Original Ensemble collects all the challenge points from train, dev and final of "tabddpm_black_box" attack.
+        challenge_splits = ["train", "dev", "final"]
+
+    # Population data will be saved under ``save_dir``.
+    save_dir.mkdir(parents=True, exist_ok=True)
+
     if population_splits is None:
         population_splits = ["train"]
     if challenge_splits is None:
@@ -180,19 +201,27 @@ def collect_population_data_ensemble(
     # Provided attack name are valid based on AttackType enum
     population_attack_types: list[AttackType] = [AttackType(attack_name) for attack_name in attack_names]
 
-    df_population = collect_midst_data(
+    df_population_experiment = collect_midst_data(
         midst_data_input_dir,
         population_attack_types,
         data_splits=population_splits,
         dataset="train",
         data_processing_config=data_processing_config,
     )
+
+    log(INFO, f"Collected experiment population data length before concatenation: {len(df_population_experiment)}")
+
+    df_population = pd.concat([df_population_experiment, original_repo_population]).drop_duplicates()
+    log(INFO, f"Concatenated population data length: {len(df_population)}")
+
     # Drop ids.
     df_population_no_id = df_population.drop(columns=["trans_id", "account_id"])
     # Save the population data
     save_dataframe(df_population, save_dir, "population_all.csv")
     save_dataframe(df_population_no_id, save_dir, "population_all_no_id.csv")
 
+    challenge_attack_names = data_processing_config.challenge_attack_data_types_to_collect
+    challenge_attack_types = [AttackType(attack_name) for attack_name in challenge_attack_names]
     challenge_attack_names = data_processing_config.challenge_attack_data_types_to_collect
     challenge_attack_types = [AttackType(attack_name) for attack_name in challenge_attack_names]
     df_challenge = collect_midst_data(
@@ -202,6 +231,7 @@ def collect_population_data_ensemble(
         dataset="challenge",
         data_processing_config=data_processing_config,
     )
+    log(INFO, f"Collected challenge data length: {len(df_challenge)} from splits: {challenge_splits}")
     # Save the challenge points
     save_dataframe(df_challenge, save_dir, "challenge_points_all.csv")
 

diff --git a/examples/ensemble_attack/run_attack.py b/examples/ensemble_attack/run_attack.py
@@ -11,6 +11,7 @@
 from omegaconf import DictConfig
 
 from examples.ensemble_attack.real_data_collection import collect_population_data_ensemble
+from midst_toolkit.attacks.ensemble.data_utils import load_dataframe
 from midst_toolkit.attacks.ensemble.process_split_data import process_split_data
 from midst_toolkit.common.logger import log
 from midst_toolkit.common.random import set_all_random_seeds
@@ -23,15 +24,25 @@ def run_data_processing(config: DictConfig) -> None:
     Args:
         config: Configuration object set in config.yaml.
     """
+    # Load original repo's population to be concatenated to the experiment's population data.
+    # This is done to align the experiments with the original attack code because
+    # this attack needs a large population dataset, and only using the experiment's collected population
+    # is not enough.
+    original_population_data = load_dataframe(
+        Path(config.data_processing_config.original_population_data_path),
+        "population_all_with_challenge.csv",
+    )
     log(INFO, "Running data processing pipeline...")
     # Collect the real data from the MIDST challenge resources.
     population_data = collect_population_data_ensemble(
         midst_data_input_dir=Path(config.data_paths.midst_data_path),
         data_processing_config=config.data_processing_config,
         save_dir=Path(config.data_paths.population_path),
+        original_repo_population=original_population_data,
         population_splits=config.data_processing_config.population_splits,
         challenge_splits=config.data_processing_config.challenge_splits,
     )
+
     # The following function saves the required dataframe splits in the specified processed_attack_data_path path.
     process_split_data(
         all_population_data=population_data,
@@ -67,7 +78,11 @@ def main(config: DictConfig) -> None:
     # TODO: Investigate the source of error.
     if config.pipeline.run_shadow_model_training:
         shadow_pipeline = importlib.import_module("examples.ensemble_attack.run_shadow_model_training")
-        shadow_data_paths = shadow_pipeline.run_shadow_model_training(config)
+        df_master_challenge_train = load_dataframe(
+            Path(config.data_paths.processed_attack_data_path),
+            "master_challenge_train.csv",
+        )
+        shadow_data_paths = shadow_pipeline.run_shadow_model_training(config, df_master_challenge_train)
         shadow_data_paths = [Path(path) for path in shadow_data_paths]
 
         target_model_synthetic_path = shadow_pipeline.run_target_model_training(config)

diff --git a/examples/ensemble_attack/run_metaclassifier_training.py b/examples/ensemble_attack/run_metaclassifier_training.py
@@ -25,6 +25,8 @@ def run_metaclassifier_training(
             The list should contain three paths, one for each set of shadow models.
         target_model_synthetic_path: Path to the target model's synthetic data. This is all we need from a target
             model to train the metaclassifier in the black-box setting.
+        target_model_synthetic_path: Path to the target model's synthetic data. This is all we need from a target
+            model to train the metaclassifier in the black-box setting.
     """
     log(INFO, "Running metaclassifier training...")
 
@@ -63,6 +65,7 @@ def run_metaclassifier_training(
         with open(model_path, "rb") as f:
             shadow_data_and_result = pickle.load(f)
             shadow_data_collection.append(shadow_data_and_result)
+            log(INFO, f"Shadow model data loaded from {model_path}.")
 
     assert Path(target_model_synthetic_path).exists(), (
         f"No file found at {target_model_synthetic_path}. "
@@ -71,6 +74,10 @@ def run_metaclassifier_training(
 
     # Load the target model's synthetic data
     target_synthetic_data = pd.read_csv(target_model_synthetic_path)
+    log(
+        INFO,
+        f"Target model's synthetic data loaded from {target_model_synthetic_path} with size {len(target_synthetic_data)}.",
+    )
 
     assert target_synthetic_data is not None, "Target model's synthetic data is missing."
     target_synthetic_data = target_synthetic_data.copy()
@@ -79,6 +86,10 @@ def run_metaclassifier_training(
         Path(config.data_paths.population_path),
         "population_all_with_challenge_no_id.csv",
     )
+    log(
+        INFO,
+        f"Reference population data loaded from f{config.data_paths.population_path} with size {len(df_reference)}.",
+    )
 
     # Extract trans_id from both train and test dataframes
     assert "trans_id" in df_meta_train.columns, "Meta train data must have trans_id column"