
Commit 258ff33

fatemetkl and emersodb authored
Fixed ensemble attack bugs (#92)
* Fixed 2 bugs: shadow synth data size, and var name

* David's comments

---------

Co-authored-by: David Emerson <43939939+emersodb@users.noreply.github.com>
1 parent dc2ae2d commit 258ff33

6 files changed: +65 −29 lines changed


examples/ensemble_attack/config.yaml (6 additions, 3 deletions)

@@ -43,6 +43,7 @@ data_processing_config:
   challenge_data_file_name: "challenge_with_id.csv"
   population_sample_size: 11956 #Population size the total data that your attack has access to.
   #The size of the master challenge dataset is half of the population size based on the attack design.
+  # Original code: 40000

 # Training and data settings for shadow models (temporary, numbers subject to change)
 shadow_training:
@@ -66,9 +67,11 @@ shadow_training:
   final_target_model_path: ${shadow_training.target_model_output_path}/target_model/shadow_workspace/trained_target_model/target_model.pkl
   # Path to final target model (relative to target_model_output_path)
   fine_tuning_config:
-    fine_tune_diffusion_iterations: 2
-    fine_tune_classifier_iterations: 2
-    pre_train_data_size: 10 #10 for test run. Original code: 60000
+    fine_tune_diffusion_iterations: 2 # Original code: 200000
+    fine_tune_classifier_iterations: 2 # Original code: 20000
+    pre_train_data_size: 10 # 10 for test run. Original code: 60000
+  number_of_points_to_synthesize: 200 # Number of synthetic data samples to be generated by shadow models.
+  # 200 for test run. Original code: 20000


 # Metaclassifier settings
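For orientation, the new knob lives under `shadow_training` in the Hydra config, which is why the pipeline code below reads it as `config.shadow_training.number_of_points_to_synthesize`. A minimal sketch of loading and reading it with OmegaConf; the load path and the print are illustrative, since in the real pipeline Hydra injects the config:

```python
# Hedged sketch: reading the new knob from the example config with OmegaConf.
from omegaconf import OmegaConf

config = OmegaConf.load("examples/ensemble_attack/config.yaml")

# 200 for the test run; the original attack code uses 20,000.
n_synth = config.shadow_training.number_of_points_to_synthesize
print(f"Each shadow model will synthesize {n_synth} points.")
```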

examples/ensemble_attack/run_attack.py (2 additions, 2 deletions)

@@ -59,8 +59,8 @@ def main(config: DictConfig) -> None:
     # TODO: Investigate the source of error.
     if config.pipeline.run_shadow_model_training:
         shadow_pipeline = importlib.import_module("examples.ensemble_attack.run_shadow_model_training")
-        attack_data_paths = shadow_pipeline.run_shadow_model_training(config)
-        attack_data_paths = [Path(path) for path in attack_data_paths]
+        shadow_data_paths = shadow_pipeline.run_shadow_model_training(config)
+        shadow_data_paths = [Path(path) for path in shadow_data_paths]

         target_data_path = shadow_pipeline.run_target_model_training(config)
         target_data_path = Path(target_data_path)

examples/ensemble_attack/run_shadow_model_training.py (4 additions, 0 deletions)

@@ -67,6 +67,7 @@ def run_target_model_training(config: DictConfig) -> Path:
         configs=configs,
         save_dir=save_dir,
         synthesize=True,
+        number_of_points_to_synthesize=config.shadow_training.number_of_points_to_synthesize,
     )

     # TODO: Check: Selected_id_lists should be of form [[]]
@@ -84,6 +85,8 @@ def run_target_model_training(config: DictConfig) -> Path:
     with open(result_path, "wb") as file:
         pickle.dump(attack_data, file)

+    log(INFO, f"Target model training finished and saved at {result_path}")
+
     return result_path


@@ -133,6 +136,7 @@ def run_shadow_model_training(config: DictConfig) -> list[Path]:
         # ``4 * n_models_per_set`` total shadow models.
         n_models_per_set=4,  # 4 based on the original code, must be even
         n_reps=12,  # Number of repetitions of challenge points in each shadow model training set. `12` based on the original code
+        number_of_points_to_synthesize=config.shadow_training.number_of_points_to_synthesize,
         random_seed=config.random_seed,
     )
     log(
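The added `log(INFO, ...)` call gives the pipeline a visible completion marker after the target-model results are pickled. A standalone approximation of that call, assuming the toolkit's `log` helper wraps Python's stdlib logging; its actual import path is not shown in this diff, and the result path below is illustrative:

```python
# Hedged stand-in for the toolkit's log() helper used above.
import logging
from logging import INFO

logging.basicConfig(level=INFO)

def log(level: int, msg: str) -> None:
    # Forward to a named stdlib logger, as the toolkit's helper presumably does.
    logging.getLogger("ensemble_attack").log(level, msg)

result_path = "shadow_workspace/trained_target_model/attack_data.pkl"  # illustrative
log(INFO, f"Target model training finished and saved at {result_path}")
```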

src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py (29 additions, 2 deletions)

@@ -29,6 +29,7 @@ def train_fine_tuned_shadow_models(
     table_name: str,
     id_column_name: str,
     pre_training_data_size: int = 60000,
+    number_of_points_to_synthesize: int = 20000,
     init_data_seed: int | None = None,
     random_seed: int | None = None,
 ) -> Path:
@@ -71,6 +72,8 @@ def train_fine_tuned_shadow_models(
         table_name: Name of the main table to be used for training the TabDDPM model.
         id_column_name: Name of the ID column in the data.
         pre_training_data_size: Size of the initial training set, defaults to 60,000.
+        number_of_points_to_synthesize: Size of the synthetic data to be generated by each shadow model,
+            defaults to 20,000.
         init_data_seed: Random seed for the initial training set.
         random_seed: Random seed used for reproducibility, defaults to None.

@@ -134,7 +137,10 @@ def train_fine_tuned_shadow_models(
             f"Initial model with ID {init_model_id} trained and saved at {initial_model_path}.",
         )
     else:
-        log(INFO, f"Initial model with ID {init_model_id} already exists, loading it from disk.")
+        log(
+            INFO,
+            f"Initial model with ID {init_model_id} already exists, loading it from disk.",
+        )
     with open(initial_model_path, "rb") as f:
         initial_model_training_results = pickle.load(f)

@@ -171,8 +177,13 @@ def train_fine_tuned_shadow_models(
             fine_tuning_diffusion_iterations=fine_tuning_config.fine_tune_diffusion_iterations,
             fine_tuning_classifier_iterations=fine_tuning_config.fine_tune_classifier_iterations,
             synthesize=True,
+            number_of_points_to_synthesize=number_of_points_to_synthesize,
         )
         assert train_result.synthetic_data is not None, "Fine-tuned models should generate synthetic data."
+        log(
+            INFO,
+            f"Fine-tuned model {model_id} generated {len(train_result.synthetic_data)} synthetic samples.",
+        )
         attack_data["fine_tuned_results"].append(train_result)

     # Pickle dump the results
@@ -191,6 +202,7 @@ def train_shadow_on_half_challenge_data(
     training_json_config_paths: DictConfig,
     table_name: str,
     id_column_name: str,
+    number_of_points_to_synthesize: int = 20000,
     random_seed: int | None = None,
 ) -> Path:
     """
@@ -214,6 +226,8 @@ def train_shadow_on_half_challenge_data(
             - tabddpm_training_config_path (str): Path to table's training config json file.
         table_name: Name of the main table to be used for training the TabDDPM model.
         id_column_name: Name of the ID column in the data.
+        number_of_points_to_synthesize: Size of the synthetic data to be generated by each shadow model,
+            defaults to 20,000.
         random_seed: Random seed used for reproducibility, defaults to None.

     Returns:
@@ -229,7 +243,8 @@ def train_shadow_on_half_challenge_data(
     selected_id_lists: list[list[int]] = [[] for _ in range(n_models)]
     # Assign each unique_id to half of the random lists
     for uid in unique_ids:
-        selected_lists = random.sample(range(n_models), half_models)  # Select 2 random list indices
+        # Select 2 random list indices
+        selected_lists = random.sample(range(n_models), half_models)
         for idx in selected_lists:
             selected_id_lists[idx].append(uid)

@@ -273,6 +288,12 @@ def train_shadow_on_half_challenge_data(
             configs,
             save_dir,
             synthesize=True,
+            number_of_points_to_synthesize=number_of_points_to_synthesize,
+        )
+        assert train_result.synthetic_data is not None, "Trained shadow model did not generate synthetic data."
+        log(
+            INFO,
+            f"Trained shadow model {model_id} generated {len(train_result.synthetic_data)} synthetic samples.",
         )

         attack_data["trained_results"].append(train_result)
@@ -295,6 +316,7 @@ def train_three_sets_of_shadow_models(
     id_column_name: str,
     n_models_per_set: int = 4,
     n_reps: int = 12,
+    number_of_points_to_synthesize: int = 20000,
     random_seed: int | None = None,
 ) -> tuple[Path, Path, Path]:
     """
@@ -342,6 +364,8 @@ def train_three_sets_of_shadow_models(
         id_column_name: Name of the ID column in the data.
         n_models_per_set: Number of shadow models to train by each approach. Must be an even number. Defaults to 4.
         n_reps: Number of repetitions for each challenge point in the fine-tuning or training sets, defaults to 12.
+        number_of_points_to_synthesize: Size of the synthetic data to be generated by each shadow model,
+            defaults to 20,000.
         random_seed: Random seed used for reproducibility, defaults to None.

     Returns:
@@ -365,6 +389,7 @@ def train_three_sets_of_shadow_models(
         table_name=table_name,
         id_column_name=id_column_name,
         pre_training_data_size=fine_tuning_config.pre_train_data_size,
+        number_of_points_to_synthesize=number_of_points_to_synthesize,
         init_data_seed=random_seed,
         random_seed=random_seed,
     )
@@ -387,6 +412,7 @@ def train_three_sets_of_shadow_models(
         table_name=table_name,
         id_column_name=id_column_name,
         pre_training_data_size=fine_tuning_config.pre_train_data_size,
+        number_of_points_to_synthesize=number_of_points_to_synthesize,
         # Setting a different seed for the second train set
         init_data_seed=random_seed + 1 if random_seed is not None else None,
         random_seed=random_seed,
@@ -405,6 +431,7 @@ def train_three_sets_of_shadow_models(
         training_json_config_paths=training_json_config_paths,
         table_name=table_name,
         id_column_name=id_column_name,
+        number_of_points_to_synthesize=number_of_points_to_synthesize,
         random_seed=random_seed,
     )
     log(
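Taken together, these hunks thread one knob through every layer of the shadow-model pipeline instead of letting each layer pick its own synthetic data size. A minimal sketch of that forwarding pattern, with hypothetical stripped-down signatures; the real functions take many more arguments and return paths, not counts:

```python
# Hypothetical, simplified signatures illustrating the forwarding pattern.
def train_tabddpm_and_synthesize(number_of_points_to_synthesize: int = 20000) -> int:
    # Stand-in for model training + synthesis; returns the synthetic data size.
    return number_of_points_to_synthesize

def train_fine_tuned_shadow_models(number_of_points_to_synthesize: int = 20000) -> int:
    # Forward the knob unchanged rather than re-deriving it locally.
    return train_tabddpm_and_synthesize(
        number_of_points_to_synthesize=number_of_points_to_synthesize
    )

def train_three_sets_of_shadow_models(number_of_points_to_synthesize: int = 20000) -> int:
    return train_fine_tuned_shadow_models(
        number_of_points_to_synthesize=number_of_points_to_synthesize
    )

# Callers that omit the argument still get the original attack's 20,000 samples.
assert train_three_sets_of_shadow_models() == 20000
assert train_three_sets_of_shadow_models(number_of_points_to_synthesize=200) == 200
```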

src/midst_toolkit/attacks/ensemble/shadow_model_utils.py (19 additions, 16 deletions)

@@ -82,7 +82,7 @@ def train_tabddpm_and_synthesize(
     configs: TrainingConfig,
     save_dir: Path,
     synthesize: bool = True,
-    sample_scale: float = 1.0,
+    number_of_points_to_synthesize: int = 20000,
 ) -> TrainingResult:
     """
     Train a TabDDPM model on the provided training set and optionally synthesize data using the trained models.
@@ -92,8 +92,7 @@ def train_tabddpm_and_synthesize(
         configs: Configuration dictionary for TabDDPM.
         save_dir: Directory path where models and results will be saved.
         synthesize: Flag indicating whether to generate synthetic data after training. Defaults to True.
-        sample_scale: Factor to scale the number of synthesized samples relative to the training set size.
-            Defaults to 1.0.
+        number_of_points_to_synthesize: Number of synthetic data samples to be generated. Defaults to 20000.

     Returns:
         A dataclass TrainingResult object containing:
@@ -131,13 +130,14 @@ def train_tabddpm_and_synthesize(
     )

     if synthesize:
-        # By default, we want the length of the final synthetic data to be ``len(provided_synth_data) = 20,000``
-        # But with a smaller scale, we can generate less synthetic data for debugging purposes.
+        # By default, Ensemble attack generates a synthetic data of length ``20,000``.
         # Attack's default sample_scale is set to ``20000 / len(tables["trans"]["df"])`` to
-        # generate 20,000 samples regardless
-        # of the training data size.
-        # Sample scale is later multiplied by the size of training data (no id) to determine
+        # generate 20,000 samples regardless of the training data size. But we control the
+        # synthetic data size directly here with ``number_of_points_to_synthesize``.
+        # ``sample_scale`` is later multiplied by the size of training data (no id) to determine
         # the size of synthetic data.
+        assert len(tables["trans"].data) > 0, "Cannot synthesize: training data is empty"
+        sample_scale = number_of_points_to_synthesize / len(tables["trans"].data)
         cleaned_tables, _, _ = clava_synthesizing(
             tables,
             relation_order,
@@ -163,7 +163,7 @@ def fine_tune_tabddpm_and_synthesize(
     fine_tuning_diffusion_iterations: int = 100,
     fine_tuning_classifier_iterations: int = 10,
     synthesize: bool = True,
-    sample_scale: float = 1.0,
+    number_of_points_to_synthesize: int = 20000,
 ) -> TrainingResult:
     """
     Given the trained models and a new training set, fine-tune the TabDDPM models.
@@ -179,8 +179,8 @@ def fine_tune_tabddpm_and_synthesize(
         fine_tuning_classifier_iterations: Number of training iterations for the new classifier model.
             Defaults to 10.
         synthesize: Flag indicating whether to generate synthetic data after training. Defaults to True.
-        sample_scale: Factor to scale the number of synthesized samples relative to the training set size.
-            Defaults to 1.0.
+        number_of_points_to_synthesize: Number of synthetic data samples to be generated. Defaults to 20000.
+

     Returns:
         A dataclass TrainingResult object containing:
@@ -223,11 +223,14 @@ def fine_tune_tabddpm_and_synthesize(
     )

     if synthesize:
-        # By default, we want the length of the final synthetic data to be ``len(provided_synth_data) = 20,000``
-        # But with a smaller scale, we can generate less synthetic data for debugging purposes.
-        # Ensemble Attack's default sample_scale is ``20000 / len(tables["trans"]["df"])`` to generate 20,000 samples
-        # regardless of the train data size.
-        # Sample scale is later multiplied by the size of training data to determine the size of synthetic data.
+        # By default, Ensemble attack generates a synthetic data of length ``20,000``.
+        # Attack's default sample_scale is set to ``20000 / len(tables["trans"]["df"])`` to
+        # generate 20,000 samples regardless of the training data size. But we control the
+        # synthetic data size directly here with ``number_of_points_to_synthesize``.
+        # ``sample_scale`` is later multiplied by the size of training data (no id) to determine
+        # the size of synthetic data.
+        assert len(new_tables["trans"].data) > 0, "Cannot synthesize: training data is empty"
+        sample_scale = number_of_points_to_synthesize / len(new_tables["trans"].data)
         cleaned_tables, _, _ = clava_synthesizing(
             new_tables,
             relation_order,
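The heart of the fix is the two added lines that derive `sample_scale` from the requested sample count instead of taking it as an argument: since `clava_synthesizing` later multiplies `sample_scale` by the training-set size, setting `sample_scale = K / N` yields K synthetic rows for any training size N. A self-contained check of that arithmetic; `compute_sample_scale` and `synthesized_count` are hypothetical stand-ins, not midst_toolkit functions:

```python
# Illustrative check of the sample_scale arithmetic introduced above.
def compute_sample_scale(number_of_points_to_synthesize: int, train_size: int) -> float:
    assert train_size > 0, "Cannot synthesize: training data is empty"
    return number_of_points_to_synthesize / train_size

def synthesized_count(sample_scale: float, train_size: int) -> int:
    # The synthesizer multiplies sample_scale by the training-set size.
    return round(sample_scale * train_size)

# The old default (sample_scale=1.0) tied output size to training size;
# the new parameter pins it regardless of training size.
for train_size in (10, 1237, 60000):
    scale = compute_sample_scale(99, train_size)
    assert synthesized_count(scale, train_size) == 99
```

The integration tests in the next file pin exactly this behaviour, requesting 99 points and asserting 99 synthetic rows.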

tests/integration/attacks/ensemble/test_shadow_model_training.py (5 additions, 6 deletions)

@@ -55,6 +55,7 @@ def test_train_fine_tuned_shadow_models(cfg: DictConfig, tmp_path: Path) -> None
         table_name="trans",
         id_column_name="trans_id",
         pre_training_data_size=cfg.shadow_training.fine_tuning_config.pre_train_data_size,
+        number_of_points_to_synthesize=5,
         random_seed=cfg.random_seed,
     )
     # Expected saved models and synthesized data:
@@ -75,6 +76,7 @@ def test_train_fine_tuned_shadow_models(cfg: DictConfig, tmp_path: Path) -> None
         assert result.relation_order is not None
         assert result.all_group_lengths_probabilities is not None
         assert type(result.synthetic_data) is pd.DataFrame
+        assert len(result.synthetic_data) == 5

     # Fine tuning sets should be disjoint
     assert set(shadow_data["fine_tuning_sets"][0]).isdisjoint(set(shadow_data["fine_tuning_sets"][1]))
@@ -99,6 +101,7 @@ def test_train_shadow_on_half_challenge_data(cfg: DictConfig, tmp_path: Path) ->
         training_json_config_paths=cfg.shadow_training.training_json_config_paths,
         table_name="trans",
         id_column_name="trans_id",
+        number_of_points_to_synthesize=5,
         random_seed=cfg.random_seed,
     )
     # Expected saved models and synthesized data:
@@ -119,6 +122,7 @@ def test_train_shadow_on_half_challenge_data(cfg: DictConfig, tmp_path: Path) ->
         assert result.relation_order is not None
         assert result.all_group_lengths_probabilities is not None
         assert type(result.synthetic_data) is pd.DataFrame
+        assert len(result.synthetic_data) == 5

     # Training sets should be disjoint
     assert set(shadow_data["selected_sets"][0]).isdisjoint(set(shadow_data["selected_sets"][1]))
@@ -156,13 +160,8 @@ def test_train_and_fine_tune_tabddpm(cfg: DictConfig, tmp_path: Path) -> None:
     )

     train_result = train_tabddpm_and_synthesize(
-        train_set,
-        configs,
-        save_dir,
-        synthesize=True,
+        train_set, configs, save_dir, synthesize=True, number_of_points_to_synthesize=99
     )
-    # By default, with a sampling scale of 1, the size of the synthesized data is equal
-    # to the size of the training data.
     assert train_result.synthetic_data is not None
     assert type(train_result.synthetic_data) is pd.DataFrame
     assert len(train_result.synthetic_data) == 99
