David's comments

fatemetkl · fatemetkl · commit 2a09c0da7bf2 · 2025-11-14T09:16:13.000-07:00
diff --git a/examples/ensemble_attack/config.yaml b/examples/ensemble_attack/config.yaml
@@ -68,9 +68,9 @@ shadow_training:
   # Path to final target model (relative to target_model_output_path)
   fine_tuning_config:
     fine_tune_diffusion_iterations: 2 # Original code: 200000
-    fine_tune_classifier_iterations: 2 #Original code: 20000
-    pre_train_data_size: 600 #10 for test run. Original code: 60000
-  synthetic_data_size: 200 # Number of synthetic data samples to be generated by shadow models.
+    fine_tune_classifier_iterations: 2 # Original code: 20000
+    pre_train_data_size: 10 # 10 for test run. Original code: 60000
+  number_of_points_to_synthesize: 200 # Number of synthetic data samples to be generated by shadow models.
   # 200 for test run. Original code: 20000
 
 
diff --git a/examples/ensemble_attack/run_shadow_model_training.py b/examples/ensemble_attack/run_shadow_model_training.py
@@ -67,7 +67,7 @@ def run_target_model_training(config: DictConfig) -> Path:
         configs=configs,
         save_dir=save_dir,
         synthesize=True,
-        synthetic_data_size=config.shadow_training.synthetic_data_size,
+        number_of_points_to_synthesize=config.shadow_training.number_of_points_to_synthesize,
     )
 
     # TODO: Check: Selected_id_lists should be of form [[]]
@@ -136,7 +136,7 @@ def run_shadow_model_training(config: DictConfig) -> list[Path]:
         # ``4 * n_models_per_set`` total shadow models.
         n_models_per_set=4,  # 4 based on the original code, must be even
         n_reps=12,  # Number of repetitions of challenge points in each shadow model training set. `12` based on the original code
-        synthetic_data_size=config.shadow_training.synthetic_data_size,
+        number_of_points_to_synthesize=config.shadow_training.number_of_points_to_synthesize,
         random_seed=config.random_seed,
     )
     log(
diff --git a/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py b/src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py
@@ -29,7 +29,7 @@ def train_fine_tuned_shadow_models(
     table_name: str,
     id_column_name: str,
     pre_training_data_size: int = 60000,
-    synthetic_data_size: int = 20000,
+    number_of_points_to_synthesize: int = 20000,
     init_data_seed: int | None = None,
     random_seed: int | None = None,
 ) -> Path:
@@ -72,7 +72,8 @@ def train_fine_tuned_shadow_models(
             table_name: Name of the main table to be used for training the TabDDPM model.
             id_column_name: Name of the ID column in the data.
             pre_training_data_size: Size of the initial training set, defaults to 60,000.
-            synthetic_data_size: Size of the synthetic data to be generated by each shadow model, defaults to 20,000.
+            number_of_points_to_synthesize: Size of the synthetic data to be generated by each shadow model,
+                defaults to 20,000.
             init_data_seed: Random seed for the initial training set.
             random_seed: Random seed used for reproducibility, defaults to None.
 
@@ -136,7 +137,10 @@ def train_fine_tuned_shadow_models(
             f"Initial model with ID {init_model_id} trained and saved at {initial_model_path}.",
         )
     else:
-        log(INFO, f"Initial model with ID {init_model_id} already exists, loading it from disk.")
+        log(
+            INFO,
+            f"Initial model with ID {init_model_id} already exists, loading it from disk.",
+        )
         with open(initial_model_path, "rb") as f:
             initial_model_training_results = pickle.load(f)
 
@@ -173,10 +177,13 @@ def train_fine_tuned_shadow_models(
             fine_tuning_diffusion_iterations=fine_tuning_config.fine_tune_diffusion_iterations,
             fine_tuning_classifier_iterations=fine_tuning_config.fine_tune_classifier_iterations,
             synthesize=True,
-            synthetic_data_size=synthetic_data_size,
+            number_of_points_to_synthesize=number_of_points_to_synthesize,
         )
         assert train_result.synthetic_data is not None, "Fine-tuned models should generate synthetic data."
-        log(INFO, f"Fine-tuned model {model_id} generated {len(train_result.synthetic_data)} synthetic samples.")
+        log(
+            INFO,
+            f"Fine-tuned model {model_id} generated {len(train_result.synthetic_data)} synthetic samples.",
+        )
         attack_data["fine_tuned_results"].append(train_result)
 
     # Pickle dump the results
@@ -195,7 +202,7 @@ def train_shadow_on_half_challenge_data(
     training_json_config_paths: DictConfig,
     table_name: str,
     id_column_name: str,
-    synthetic_data_size: int = 20000,
+    number_of_points_to_synthesize: int = 20000,
     random_seed: int | None = None,
 ) -> Path:
     """
@@ -219,7 +226,8 @@ def train_shadow_on_half_challenge_data(
                 - tabddpm_training_config_path (str): Path to table's training config json file.
             table_name: Name of the main table to be used for training the TabDDPM model.
             id_column_name: Name of the ID column in the data.
-            synthetic_data_size: Size of the synthetic data to be generated by each shadow model, defaults to 20,000.
+            number_of_points_to_synthesize: Size of the synthetic data to be generated by each shadow model,
+                defaults to 20,000.
             random_seed: Random seed used for reproducibility, defaults to None.
 
     Returns:
@@ -235,7 +243,8 @@ def train_shadow_on_half_challenge_data(
     selected_id_lists: list[list[int]] = [[] for _ in range(n_models)]
     # Assign each unique_id to half of the random lists
     for uid in unique_ids:
-        selected_lists = random.sample(range(n_models), half_models)  # Select 2 random list indices
+        # Select ``half_models`` distinct random list indices
+        selected_lists = random.sample(range(n_models), half_models)
         for idx in selected_lists:
             selected_id_lists[idx].append(uid)
 
@@ -279,10 +288,13 @@ def train_shadow_on_half_challenge_data(
             configs,
             save_dir,
             synthesize=True,
-            synthetic_data_size=synthetic_data_size,
+            number_of_points_to_synthesize=number_of_points_to_synthesize,
         )
         assert train_result.synthetic_data is not None, "Trained shadow model did not generate synthetic data."
-        log(INFO, f"Trained shadow model {model_id} generated {len(train_result.synthetic_data)} synthetic samples.")
+        log(
+            INFO,
+            f"Trained shadow model {model_id} generated {len(train_result.synthetic_data)} synthetic samples.",
+        )
 
         attack_data["trained_results"].append(train_result)
 
@@ -304,7 +316,7 @@ def train_three_sets_of_shadow_models(
     id_column_name: str,
     n_models_per_set: int = 4,
     n_reps: int = 12,
-    synthetic_data_size: int = 20000,
+    number_of_points_to_synthesize: int = 20000,
     random_seed: int | None = None,
 ) -> tuple[Path, Path, Path]:
     """
@@ -352,7 +364,8 @@ def train_three_sets_of_shadow_models(
         id_column_name: Name of the ID column in the data.
         n_models_per_set: Number of shadow models to train by each approach. Must be an even number. Defaults to 4.
         n_reps: Number of repetitions for each challenge point in the fine-tuning or training sets, defaults to 12.
-        synthetic_data_size: Size of the synthetic data to be generated by each shadow model, defaults to 20,000.
+        number_of_points_to_synthesize: Size of the synthetic data to be generated by each shadow model,
+            defaults to 20,000.
         random_seed: Random seed used for reproducibility, defaults to None.
 
     Returns:
@@ -376,7 +389,7 @@ def train_three_sets_of_shadow_models(
         table_name=table_name,
         id_column_name=id_column_name,
         pre_training_data_size=fine_tuning_config.pre_train_data_size,
-        synthetic_data_size=synthetic_data_size,
+        number_of_points_to_synthesize=number_of_points_to_synthesize,
         init_data_seed=random_seed,
         random_seed=random_seed,
     )
@@ -399,7 +412,7 @@ def train_three_sets_of_shadow_models(
         table_name=table_name,
         id_column_name=id_column_name,
         pre_training_data_size=fine_tuning_config.pre_train_data_size,
-        synthetic_data_size=synthetic_data_size,
+        number_of_points_to_synthesize=number_of_points_to_synthesize,
         # Setting a different seed for the second train set
         init_data_seed=random_seed + 1 if random_seed is not None else None,
         random_seed=random_seed,
@@ -418,7 +431,7 @@ def train_three_sets_of_shadow_models(
         training_json_config_paths=training_json_config_paths,
         table_name=table_name,
         id_column_name=id_column_name,
-        synthetic_data_size=synthetic_data_size,
+        number_of_points_to_synthesize=number_of_points_to_synthesize,
         random_seed=random_seed,
     )
     log(
diff --git a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py
@@ -82,7 +82,7 @@ def train_tabddpm_and_synthesize(
     configs: TrainingConfig,
     save_dir: Path,
     synthesize: bool = True,
-    synthetic_data_size: int = 20000,
+    number_of_points_to_synthesize: int = 20000,
 ) -> TrainingResult:
     """
     Train a TabDDPM model on the provided training set and optionally synthesize data using the trained models.
@@ -92,7 +92,7 @@ def train_tabddpm_and_synthesize(
         configs: Configuration dictionary for TabDDPM.
         save_dir: Directory path where models and results will be saved.
         synthesize: Flag indicating whether to generate synthetic data after training. Defaults to True.
-        synthetic_data_size: Number of synthetic data samples to be generated. Defaults to 20000.
+        number_of_points_to_synthesize: Number of synthetic data samples to be generated. Defaults to 20000.
 
     Returns:
         A dataclass TrainingResult object containing:
@@ -130,14 +130,14 @@ def train_tabddpm_and_synthesize(
     )
 
     if synthesize:
-        # By default, we want the length of the final synthetic data to be ``len(provided_synth_data) = 20,000``
-        # But with a smaller scale, we can generate less synthetic data for debugging purposes.
+        # By default, the Ensemble attack generates synthetic data of length ``20,000``.
         # Attack's default sample_scale is set to ``20000 / len(tables["trans"]["df"])`` to
-        # generate 20,000 samples regardless
-        # of the training data size.
-        # Sample scale is later multiplied by the size of training data (no id) to determine
+        # generate 20,000 samples regardless of the training data size. But we control the
+        # synthetic data size directly here with ``number_of_points_to_synthesize``.
+        # ``sample_scale`` is later multiplied by the size of training data (no id) to determine
         # the size of synthetic data.
-        sample_scale = synthetic_data_size / len(tables["trans"].data)
+        assert len(tables["trans"].data) > 0, "Cannot synthesize: training data is empty"
+        sample_scale = number_of_points_to_synthesize / len(tables["trans"].data)
         cleaned_tables, _, _ = clava_synthesizing(
             tables,
             relation_order,
@@ -163,7 +163,7 @@ def fine_tune_tabddpm_and_synthesize(
     fine_tuning_diffusion_iterations: int = 100,
     fine_tuning_classifier_iterations: int = 10,
     synthesize: bool = True,
-    synthetic_data_size: int = 20000,
+    number_of_points_to_synthesize: int = 20000,
 ) -> TrainingResult:
     """
     Given the trained models and a new training set, fine-tune the TabDDPM models.
@@ -179,7 +179,7 @@ def fine_tune_tabddpm_and_synthesize(
         fine_tuning_classifier_iterations: Number of training iterations for the new classifier model.
             Defaults to 10.
         synthesize: Flag indicating whether to generate synthetic data after training. Defaults to True.
-        synthetic_data_size: Number of synthetic data samples to be generated. Defaults to 20000.
+        number_of_points_to_synthesize: Number of synthetic data samples to be generated. Defaults to 20000.
 
 
     Returns:
@@ -223,12 +223,14 @@ def fine_tune_tabddpm_and_synthesize(
     )
 
     if synthesize:
-        # By default, we want the length of the final synthetic data to be ``len(provided_synth_data) = 20,000``
-        # But with a smaller scale, we can generate less synthetic data for debugging purposes.
-        # Ensemble Attack's default sample_scale is ``20000 / len(tables["trans"]["df"])`` to generate 20,000 samples
-        # regardless of the train data size.
-        # Sample scale is later multiplied by the size of training data to determine the size of synthetic data.
-        sample_scale = synthetic_data_size / len(new_tables["trans"].data)
+        # By default, the Ensemble attack generates synthetic data of length ``20,000``.
+        # Attack's default sample_scale is set to ``20000 / len(tables["trans"]["df"])`` to
+        # generate 20,000 samples regardless of the training data size. But we control the
+        # synthetic data size directly here with ``number_of_points_to_synthesize``.
+        # ``sample_scale`` is later multiplied by the size of training data (no id) to determine
+        # the size of synthetic data.
+        assert len(new_tables["trans"].data) > 0, "Cannot synthesize: training data is empty"
+        sample_scale = number_of_points_to_synthesize / len(new_tables["trans"].data)
         cleaned_tables, _, _ = clava_synthesizing(
             new_tables,
             relation_order,
diff --git a/tests/integration/attacks/ensemble/test_shadow_model_training.py b/tests/integration/attacks/ensemble/test_shadow_model_training.py
@@ -55,7 +55,7 @@ def test_train_fine_tuned_shadow_models(cfg: DictConfig, tmp_path: Path) -> None
         table_name="trans",
         id_column_name="trans_id",
         pre_training_data_size=cfg.shadow_training.fine_tuning_config.pre_train_data_size,
-        synthetic_data_size=5,
+        number_of_points_to_synthesize=5,
         random_seed=cfg.random_seed,
     )
     # Expected saved models and synthesized data:
@@ -101,7 +101,7 @@ def test_train_shadow_on_half_challenge_data(cfg: DictConfig, tmp_path: Path) ->
         training_json_config_paths=cfg.shadow_training.training_json_config_paths,
         table_name="trans",
         id_column_name="trans_id",
-        synthetic_data_size=5,
+        number_of_points_to_synthesize=5,
         random_seed=cfg.random_seed,
     )
     # Expected saved models and synthesized data:
@@ -159,9 +159,9 @@ def test_train_and_fine_tune_tabddpm(cfg: DictConfig, tmp_path: Path) -> None:
         workspace_name="test_workspace",
     )
 
-    train_result = train_tabddpm_and_synthesize(train_set, configs, save_dir, synthesize=True, synthetic_data_size=99)
-    # By default, with a sampling scale of 1, the size of the synthesized data is equal
-    # to the size of the training data.
+    train_result = train_tabddpm_and_synthesize(
+        train_set, configs, save_dir, synthesize=True, number_of_points_to_synthesize=99
+    )
     assert train_result.synthetic_data is not None
     assert type(train_result.synthetic_data) is pd.DataFrame
     assert len(train_result.synthetic_data) == 99