33 changes: 33 additions & 0 deletions examples/tartan_federer_attack/README.md
@@ -0,0 +1,33 @@
# Tartan Federer Attack

This example runs a Tartan–Federer membership inference attack using trained TabDDPM models. The pipeline optionally performs a data processing step to prepare population datasets and then executes the attack using a binary classifier.

## Data Processing

The data processing step constructs population datasets representing the real data available to the attacker.

A selected subset of `train_with_id.csv` files is collected from `tabddpm_1` to `tabddpm_6` located under:

```
examples/tartan_federer_attack/tabddpm_trained_with_20k/tabddpm_white_box
```

For each selected model, both `train_with_id.csv` and `challenge_with_id.csv` are loaded. All training datasets are merged into a single dataframe and all challenge datasets are merged into a single dataframe. Any training samples that also appear in the challenge dataset are removed, and duplicate samples are dropped based on configured identifier columns.
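
As a minimal sketch of this step (assuming pandas, the directory layout above, and the identifier columns from the configuration below), the merge and anti-join look roughly like the following; the actual implementation is `prepare_population_dataset_for_attack` in `run_attack.py`:

```python
from pathlib import Path

import pandas as pd

base_dir = Path("examples/tartan_federer_attack/tabddpm_trained_with_20k/tabddpm_white_box")
dedup_columns = ["trans_id", "balance"]
model_indices = [1, 2]  # indices of the models whose training data forms the population

# Merge the per-model training and challenge splits and drop duplicates on the identifier columns.
train_df = pd.concat(
    [pd.read_csv(base_dir / f"tabddpm_{i}" / "train_with_id.csv") for i in model_indices], ignore_index=True
).drop_duplicates(subset=dedup_columns)
challenge_df = pd.concat(
    [pd.read_csv(base_dir / f"tabddpm_{i}" / "challenge_with_id.csv") for i in model_indices], ignore_index=True
).drop_duplicates(subset=dedup_columns)

# Remove any training rows that also appear in the challenge data (anti-join on the identifier columns).
population_df = train_df[
    ~train_df.set_index(dedup_columns).index.isin(challenge_df.set_index(dedup_columns).index)
]
```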

The model indices used to build the population datasets for training and validation are specified in the configuration file:

```yaml
data_processing_config:
population_attack_indices_to_collect_for_training: [1, 2]
population_attack_indices_to_collect_for_validation: [3, 4]
model_type: tabddpm
columns_for_deduplication: ['trans_id', 'balance']
```

## Running the Attack

Before running the attack, activate your virtual environment and update `configs/experiment_config.yaml` as needed. From the top-level directory of the library, run:

```bash
python -m examples.tartan_federer_attack.run_attack
```
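
Since `run_attack.py` is a Hydra entry point, individual configuration values can also be overridden on the command line instead of editing the file. For example, to skip the data processing step once the population data has already been generated (standard Hydra override syntax; the key matches `pipeline.run_data_processing` in the config):

```bash
python -m examples.tartan_federer_attack.run_attack pipeline.run_data_processing=false
```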
47 changes: 47 additions & 0 deletions examples/tartan_federer_attack/configs/experiment_config.yaml
@@ -0,0 +1,47 @@
# Tartan-Federer experiment configuration
Collaborator: This should be Tartan-Federer instead 🙂

# This config can be used to run both the Tartan-Federer attack training (``run_attack.py``) and testing phases (``test_attack_model.py``).
base_experiment_dir: examples/tartan_federer_attack/tabddpm_20k_experiment_data # Processed data and experiment artifacts will be stored here
base_data_config_dir: examples/tartan_federer_attack/data_configs # Training and data type configs are saved under this directory

# Pipeline control
pipeline:
run_data_processing: true


# Data paths
data_paths:
midst_data_path: examples/tartan_federer_attack/tabddpm_20k_experiment_data/tabddpm_white_box # Used to collect the data
population_data_path: ${base_experiment_dir}/population_data # Path where the processed population data will be stored
metadata_dir: ${base_experiment_dir}/data_configs # Path where metadata about the experiment will be stored


# Dataset specific information used for processing in this example
data_processing_config:
  population_attack_indices_to_collect_for_training: [1, 2]
  population_attack_indices_to_collect_for_validation: [3, 4]
model_type: tabddpm
columns_for_deduplication: ['trans_id', 'balance']


attack_config:
model_type: tabddpm
models_base_dir: examples/tartan_federer_attack/tabddpm_20k_experiment_data/tabddpm_white_box
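  # "." points the attack at the target models themselves (white-box); per the review discussion in this PR, pointing this at shadow models trained on synthetic data would make the attack effectively black-box.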
target_model_subdir: "."
samples_per_train_model: 3000
sample_per_val_model: 10
num_noise_per_time_step: 30
timesteps: [5, 10, 15]
additional_timesteps: [0]
predictions_file_format: challenge_label_predictions
results_path: ${base_experiment_dir}/tartan_federer_attack_results
test_indices: [3]
train_indices: [1]
val_indices: [2]
columns_for_deduplication: [trans_id, balance]

classifier_config:
hidden_dim: 100
learning_rate: 1e-4
num_epochs: 200
162 changes: 162 additions & 0 deletions examples/tartan_federer_attack/run_attack.py
@@ -0,0 +1,162 @@
import os
from logging import INFO
from pathlib import Path
from typing import Any, cast

import hydra
import pandas as pd
from omegaconf import DictConfig, OmegaConf

from midst_toolkit.attacks.tartan_federer.tartan_federer_attack import tartan_federer_attack
from midst_toolkit.common.logger import log
from midst_toolkit.common.random import set_all_random_seeds, unset_all_random_seeds


def prepare_population_dataset_for_attack(
model_indices: list[int],
model_type: str,
models_base_dir: Path,
columns_for_deduplication: list[str],
) -> pd.DataFrame:
"""
Prepares data for an attack by merging and deduplicating datasets.
Args:
model_indices: List of model indices over which to iterate and for which to gather information.
model_type: Name of the model type for which we're loading data.
models_base_dir: Where the various models' data lives.
columns_for_deduplication: Names of columns to use in de-duplicating the dataframes
Raises:
ValueError: Throws if the list of model indices is empty.
ValueError: Throws if any of the dataframes to be de-duplicated do not have the specified columns in
``columns_for_deduplication``
Returns:
A DataFrame containing the merged trainig data that has been deduplicated and is free from challenge data.
"""
if len(model_indices) == 0:
raise ValueError("The 'indices' list is empty. Please provide indices to process datasets.")

df_merge_list = []
df_challenge_list = []

for model_index in model_indices:
base_path = models_base_dir / f"{model_type}_{model_index}"
df_merge_list.append(pd.read_csv(os.path.join(base_path, "train_with_id.csv")))
df_challenge_list.append(pd.read_csv(os.path.join(base_path, "challenge_with_id.csv")))

df_merge = pd.concat(df_merge_list, ignore_index=True)
df_challenge = pd.concat(df_challenge_list, ignore_index=True)
    # Ensure all keys for deduplication exist in both DataFrames before attempting to deduplicate,
    # so that a missing column raises the documented ValueError rather than a KeyError from pandas.
    missing_keys_merge = [key for key in columns_for_deduplication if key not in df_merge.columns]
    missing_keys_challenge = [key for key in columns_for_deduplication if key not in df_challenge.columns]
    if missing_keys_merge or missing_keys_challenge:
        raise ValueError(f"Missing columns for deduplication: {missing_keys_merge + missing_keys_challenge}")

    # Deduplicate the datasets once
    df_merge = df_merge.drop_duplicates(subset=columns_for_deduplication)
    df_challenge = df_challenge.drop_duplicates(subset=columns_for_deduplication)

# Remove challenge entries from the merged dataset
return df_merge[
~df_merge.set_index(columns_for_deduplication).index.isin(
df_challenge.set_index(columns_for_deduplication).index
)
]


def run_data_processing(config: dict[str, Any]) -> None:
"""
Run the data processing pipeline for the Tartan–Federer attack example.
This function prepares the population datasets required for training and validating the attack.
Args:
config: Attack configuration as an OmegaConf DictConfig object.
"""
log(INFO, "Running data processing pipeline...")

    # Ensure the output directory for the processed population data exists
Path(config["data_paths"]["population_data_path"]).mkdir(parents=True, exist_ok=True)

population_data_for_training_attack = prepare_population_dataset_for_attack(
model_indices=config["data_processing_config"]["population_attack_indices_to_collect_for_training"],
model_type=config["data_processing_config"]["model_type"],
models_base_dir=Path(config["data_paths"]["midst_data_path"]),
columns_for_deduplication=config["data_processing_config"]["columns_for_deduplication"],
)

population_data_for_training_attack.to_csv(
Path(config["data_paths"]["population_data_path"]) / "population_dataset_for_training_attack.csv",
index=False,
)

population_data_for_validating_attack = prepare_population_dataset_for_attack(
model_indices=config["data_processing_config"]["population_attack_indices_to_collect_for_validation"],
model_type=config["data_processing_config"]["model_type"],
models_base_dir=Path(config["data_paths"]["midst_data_path"]),
columns_for_deduplication=config["data_processing_config"]["columns_for_deduplication"],
)

population_data_for_validating_attack.to_csv(
Path(config["data_paths"]["population_data_path"]) / "population_dataset_for_validating_attack.csv",
index=False,
)

log(INFO, "Data processing pipeline finished.")


@hydra.main(config_path="configs", config_name="experiment_config", version_base=None)
def run_attack(config: DictConfig) -> None:
"""
Run the Tartan–Federer attack example pipeline.
Args:
config: Attack configuration as an OmegaConf DictConfig object.
"""
log(INFO, "Running Tartan–Federer attack...")

set_all_random_seeds(
seed=133742,
use_deterministic_torch_algos=True,
disable_torch_benchmarking=True,
)

cfg = cast(dict[str, Any], OmegaConf.to_container(config, resolve=True))

if config["pipeline"]["run_data_processing"]:
run_data_processing(cfg)

data_cfg = cfg["data_paths"]
attack_cfg = cfg["attack_config"]
classifier_cfg = cfg["classifier_config"]

mia_performance_train, mia_performance_val, mia_performance_test = tartan_federer_attack(
Collaborator: Just curious, I am missing where the user can define the type of the attack (white-box or black-box). Do we give this option to the user? This example seems to be black-box, but I might be wrong. Adding this information to the README would also be great!

Collaborator Author (@bzamanlooy, Dec 16, 2025): So there is really not a difference. If we set the target_model_subdir to be the one containing the shadow model trained using synthetic data, the attack is going to be black box essentially, and we currently do not have them trained using the toolkit, so perhaps I can add that as a todo as well for Elahe and update the code then.

Collaborator: Got it! So it depends on what is in the config.target_shadow_model_subdir directory. If there are trained shadow models there, it would be black-box, but in this config it is set to target_shadow_model_subdir: ".", so it will be white-box. It wasn't entirely clear to me at first, but I suppose users will already be aware of this.

Collaborator: Or you can optionally note that in a comment in experiment_config.yaml.

Collaborator: Nitpicking comment (sorry): it might be a bit more readable if the inputs followed the function signature order.
model_type=attack_cfg["model_type"],
model_data_dir=Path(attack_cfg["models_base_dir"]),
target_model_subdir=Path(attack_cfg["target_model_subdir"]),
samples_per_train_model=attack_cfg["samples_per_train_model"],
sample_per_val_model=attack_cfg["sample_per_val_model"],
num_noise_per_time_step=attack_cfg["num_noise_per_time_step"],
timesteps=attack_cfg["timesteps"],
additional_timesteps=attack_cfg["additional_timesteps"],
predictions_file_format=attack_cfg["predictions_file_format"],
results_path=Path(attack_cfg["results_path"]),
test_indices=attack_cfg["test_indices"],
train_indices=attack_cfg["train_indices"],
val_indices=attack_cfg["val_indices"],
columns_for_deduplication=attack_cfg["columns_for_deduplication"],
classifier_hidden_dim=classifier_cfg["hidden_dim"],
classifier_num_epochs=classifier_cfg["num_epochs"],
classifier_learning_rate=classifier_cfg["learning_rate"],
meta_dir=Path(config["data_paths"]["metadata_dir"]),
population_data_dir=Path(data_cfg["population_data_path"]),
)

unset_all_random_seeds()

log(INFO, "Attack finished successfully.")


if __name__ == "__main__":
run_attack()
51 changes: 0 additions & 51 deletions src/midst_toolkit/attacks/tartan_federer/data_utils.py
@@ -157,57 +157,6 @@ def save_results_and_plot_roc_curve(
log(INFO, f"✅ All runs completed. Results saved to {results_summary_path}")


def prepare_population_dataset_for_attack(
model_indices: list[int], model_type: str, models_base_dir: Path, columns_for_deduplication: list[str]
) -> pd.DataFrame:
"""
Prepares data for an attack by merging and deduplicating datasets.

Args:
model_indices: List of model indices over which to iterate and for which to gather information.
model_type: Name of the model type for which we're loading data.
models_base_dir: Where the various models' data lives.
columns_for_deduplication: Names of columns to use in de-duplicating the dataframes

Raises:
ValueError: Throws if the list of model indices is empty.
ValueError: Throws if any of the dataframes to be de-duplicated do not have the specified columns in
``columns_for_deduplication``

Returns:
A DataFrame containing the merged trainig data that has been deduplicated and is free from challenge data.
"""
if len(model_indices) == 0:
raise ValueError("The 'indices' list is empty. Please provide indices to process datasets.")

df_merge_list = []
df_challenge_list = []

for model_index in model_indices:
base_path = models_base_dir / f"{model_type}_{model_index}"
df_merge_list.append(pd.read_csv(os.path.join(base_path, "train_with_id.csv")))
df_challenge_list.append(pd.read_csv(os.path.join(base_path, "challenge_with_id.csv")))

df_merge = pd.concat(df_merge_list, ignore_index=True)
df_challenge = pd.concat(df_challenge_list, ignore_index=True)
# Deduplicate the datasets once
df_merge = df_merge.drop_duplicates(subset=columns_for_deduplication)
df_challenge = df_challenge.drop_duplicates(subset=columns_for_deduplication)

# Ensure all keys for deduplication exist in both DataFrames
missing_keys_merge = [key for key in columns_for_deduplication if key not in df_merge.columns]
missing_keys_challenge = [key for key in columns_for_deduplication if key not in df_challenge.columns]
if missing_keys_merge or missing_keys_challenge:
raise ValueError(f"Missing columns for deduplication: {missing_keys_merge + missing_keys_challenge}")

# Remove challenge entries from the merged dataset
return df_merge[
~df_merge.set_index(columns_for_deduplication).index.isin(
df_challenge.set_index(columns_for_deduplication).index
)
]


def evaluate_attack_performance(
model_indices: list[int],
description: str,