
Commit 258ff33

fatemetkl and emersodb authored
Fixed ensemble attack bugs (#92)
* Fixed 2 bugs: shadow synth data size, and var name

* David's comments

---------

Co-authored-by: David Emerson <43939939+emersodb@users.noreply.github.com>
1 parent dc2ae2d commit 258ff33

6 files changed: +65 −29 lines changed


examples/ensemble_attack/config.yaml (6 additions, 3 deletions)

@@ -43,6 +43,7 @@ data_processing_config:
   challenge_data_file_name: "challenge_with_id.csv"
   population_sample_size: 11956 #Population size the total data that your attack has access to.
   #The size of the master challenge dataset is half of the population size based on the attack design.
+  # Original code: 40000

 # Training and data settings for shadow models (temporary, numbers subject to change)
 shadow_training:
@@ -66,9 +67,11 @@ shadow_training:
   final_target_model_path: ${shadow_training.target_model_output_path}/target_model/shadow_workspace/trained_target_model/target_model.pkl
   # Path to final target model (relative to target_model_output_path)
   fine_tuning_config:
-    fine_tune_diffusion_iterations: 2
-    fine_tune_classifier_iterations: 2
-    pre_train_data_size: 10 #10 for test run. Original code: 60000
+    fine_tune_diffusion_iterations: 2 # Original code: 200000
+    fine_tune_classifier_iterations: 2 # Original code: 20000
+    pre_train_data_size: 10 # 10 for test run. Original code: 60000
+  number_of_points_to_synthesize: 200 # Number of synthetic data samples to be generated by shadow models.
+  # 200 for test run. Original code: 20000


 # Metaclassifier settings
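For orientation, the new knob lives under `shadow_training` in the Hydra config, which is why the pipeline code below reads it as `config.shadow_training.number_of_points_to_synthesize`. A minimal sketch of loading and reading it with OmegaConf; the load path and the print are illustrative, since in the real pipeline Hydra injects the config:

```python
# Hedged sketch: reading the new knob from the example config with OmegaConf.
from omegaconf import OmegaConf

config = OmegaConf.load("examples/ensemble_attack/config.yaml")

# 200 for the test run; the original attack code uses 20,000.
n_synth = config.shadow_training.number_of_points_to_synthesize
print(f"Each shadow model will synthesize {n_synth} points.")
```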

examples/ensemble_attack/run_attack.py (2 additions, 2 deletions)

@@ -59,8 +59,8 @@ def main(config: DictConfig) -> None:
     # TODO: Investigate the source of error.
     if config.pipeline.run_shadow_model_training:
         shadow_pipeline = importlib.import_module("examples.ensemble_attack.run_shadow_model_training")
-        attack_data_paths = shadow_pipeline.run_shadow_model_training(config)
-        attack_data_paths = [Path(path) for path in attack_data_paths]
+        shadow_data_paths = shadow_pipeline.run_shadow_model_training(config)
+        shadow_data_paths = [Path(path) for path in shadow_data_paths]

         target_data_path = shadow_pipeline.run_target_model_training(config)
         target_data_path = Path(target_data_path)

examples/ensemble_attack/run_shadow_model_training.py (4 additions, 0 deletions)

@@ -67,6 +67,7 @@ def run_target_model_training(config: DictConfig) -> Path:
         configs=configs,
         save_dir=save_dir,
         synthesize=True,
+        number_of_points_to_synthesize=config.shadow_training.number_of_points_to_synthesize,
     )

     # TODO: Check: Selected_id_lists should be of form [[]]
@@ -84,6 +85,8 @@ def run_target_model_training(config: DictConfig) -> Path:
     with open(result_path, "wb") as file:
         pickle.dump(attack_data, file)

+    log(INFO, f"Target model training finished and saved at {result_path}")
+
     return result_path


@@ -133,6 +136,7 @@ def run_shadow_model_training(config: DictConfig) -> list[Path]:
         # ``4 * n_models_per_set`` total shadow models.
         n_models_per_set=4,  # 4 based on the original code, must be even
         n_reps=12,  # Number of repetitions of challenge points in each shadow model training set. `12` based on the original code
+        number_of_points_to_synthesize=config.shadow_training.number_of_points_to_synthesize,
         random_seed=config.random_seed,
     )
     log(
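The added `log(INFO, ...)` call gives the pipeline a visible completion marker after the target-model results are pickled. A standalone approximation of that call, assuming the toolkit's `log` helper wraps Python's stdlib logging; its actual import path is not shown in this diff, and the result path below is illustrative:

```python
# Hedged stand-in for the toolkit's log() helper used above.
import logging
from logging import INFO

logging.basicConfig(level=INFO)

def log(level: int, msg: str) -> None:
    # Forward to a named stdlib logger, as the toolkit's helper presumably does.
    logging.getLogger("ensemble_attack").log(level, msg)

result_path = "shadow_workspace/trained_target_model/attack_data.pkl"  # illustrative
log(INFO, f"Target model training finished and saved at {result_path}")
```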

src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py (29 additions, 2 deletions)

@@ -29,6 +29,7 @@ def train_fine_tuned_shadow_models(
     table_name: str,
     id_column_name: str,
     pre_training_data_size: int = 60000,
+    number_of_points_to_synthesize: int = 20000,
     init_data_seed: int | None = None,
     random_seed: int | None = None,
 ) -> Path:
@@ -71,6 +72,8 @@ def train_fine_tuned_shadow_models(
         table_name: Name of the main table to be used for training the TabDDPM model.
         id_column_name: Name of the ID column in the data.
         pre_training_data_size: Size of the initial training set, defaults to 60,000.
+        number_of_points_to_synthesize: Size of the synthetic data to be generated by each shadow model,
+            defaults to 20,000.
         init_data_seed: Random seed for the initial training set.
         random_seed: Random seed used for reproducibility, defaults to None.

@@ -134,7 +137,10 @@ def train_fine_tuned_shadow_models(
             f"Initial model with ID {init_model_id} trained and saved at {initial_model_path}.",
         )
     else:
-        log(INFO, f"Initial model with ID {init_model_id} already exists, loading it from disk.")
+        log(
+            INFO,
+            f"Initial model with ID {init_model_id} already exists, loading it from disk.",
+        )
     with open(initial_model_path, "rb") as f:
         initial_model_training_results = pickle.load(f)

@@ -171,8 +177,13 @@ def train_fine_tuned_shadow_models(
             fine_tuning_diffusion_iterations=fine_tuning_config.fine_tune_diffusion_iterations,
             fine_tuning_classifier_iterations=fine_tuning_config.fine_tune_classifier_iterations,
             synthesize=True,
+            number_of_points_to_synthesize=number_of_points_to_synthesize,
         )
         assert train_result.synthetic_data is not None, "Fine-tuned models should generate synthetic data."
+        log(
+            INFO,
+            f"Fine-tuned model {model_id} generated {len(train_result.synthetic_data)} synthetic samples.",
+        )
         attack_data["fine_tuned_results"].append(train_result)

     # Pickle dump the results
@@ -191,6 +202,7 @@ def train_shadow_on_half_challenge_data(
     training_json_config_paths: DictConfig,
     table_name: str,
     id_column_name: str,
+    number_of_points_to_synthesize: int = 20000,
     random_seed: int | None = None,
 ) -> Path:
     """
@@ -214,6 +226,8 @@ def train_shadow_on_half_challenge_data(
             - tabddpm_training_config_path (str): Path to table's training config json file.
         table_name: Name of the main table to be used for training the TabDDPM model.
         id_column_name: Name of the ID column in the data.
+        number_of_points_to_synthesize: Size of the synthetic data to be generated by each shadow model,
+            defaults to 20,000.
         random_seed: Random seed used for reproducibility, defaults to None.

     Returns:
@@ -229,7 +243,8 @@ def train_shadow_on_half_challenge_data(
     selected_id_lists: list[list[int]] = [[] for _ in range(n_models)]
     # Assign each unique_id to half of the random lists
     for uid in unique_ids:
-        selected_lists = random.sample(range(n_models), half_models)  # Select 2 random list indices
+        # Select 2 random list indices
+        selected_lists = random.sample(range(n_models), half_models)
         for idx in selected_lists:
             selected_id_lists[idx].append(uid)

@@ -273,6 +288,12 @@ def train_shadow_on_half_challenge_data(
             configs,
             save_dir,
             synthesize=True,
+            number_of_points_to_synthesize=number_of_points_to_synthesize,
+        )
+        assert train_result.synthetic_data is not None, "Trained shadow model did not generate synthetic data."
+        log(
+            INFO,
+            f"Trained shadow model {model_id} generated {len(train_result.synthetic_data)} synthetic samples.",
         )

         attack_data["trained_results"].append(train_result)
@@ -295,6 +316,7 @@ def train_three_sets_of_shadow_models(
     id_column_name: str,
     n_models_per_set: int = 4,
     n_reps: int = 12,
+    number_of_points_to_synthesize: int = 20000,
     random_seed: int | None = None,
 ) -> tuple[Path, Path, Path]:
     """
@@ -342,6 +364,8 @@ def train_three_sets_of_shadow_models(
         id_column_name: Name of the ID column in the data.
         n_models_per_set: Number of shadow models to train by each approach. Must be an even number. Defaults to 4.
         n_reps: Number of repetitions for each challenge point in the fine-tuning or training sets, defaults to 12.
+        number_of_points_to_synthesize: Size of the synthetic data to be generated by each shadow model,
+            defaults to 20,000.
         random_seed: Random seed used for reproducibility, defaults to None.

     Returns:
@@ -365,6 +389,7 @@ def train_three_sets_of_shadow_models(
         table_name=table_name,
         id_column_name=id_column_name,
         pre_training_data_size=fine_tuning_config.pre_train_data_size,
+        number_of_points_to_synthesize=number_of_points_to_synthesize,
         init_data_seed=random_seed,
         random_seed=random_seed,
     )
@@ -387,6 +412,7 @@ def train_three_sets_of_shadow_models(
         table_name=table_name,
         id_column_name=id_column_name,
         pre_training_data_size=fine_tuning_config.pre_train_data_size,
+        number_of_points_to_synthesize=number_of_points_to_synthesize,
         # Setting a different seed for the second train set
         init_data_seed=random_seed + 1 if random_seed is not None else None,
         random_seed=random_seed,
@@ -405,6 +431,7 @@ def train_three_sets_of_shadow_models(
         training_json_config_paths=training_json_config_paths,
         table_name=table_name,
         id_column_name=id_column_name,
+        number_of_points_to_synthesize=number_of_points_to_synthesize,
         random_seed=random_seed,
     )
     log(
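Taken together, these hunks thread one knob through every layer of the shadow-model pipeline instead of letting each layer pick its own synthetic data size. A minimal sketch of that forwarding pattern, with hypothetical stripped-down signatures; the real functions take many more arguments and return paths, not counts:

```python
# Hypothetical, simplified signatures illustrating the forwarding pattern.
def train_tabddpm_and_synthesize(number_of_points_to_synthesize: int = 20000) -> int:
    # Stand-in for model training + synthesis; returns the synthetic data size.
    return number_of_points_to_synthesize

def train_fine_tuned_shadow_models(number_of_points_to_synthesize: int = 20000) -> int:
    # Forward the knob unchanged rather than re-deriving it locally.
    return train_tabddpm_and_synthesize(
        number_of_points_to_synthesize=number_of_points_to_synthesize
    )

def train_three_sets_of_shadow_models(number_of_points_to_synthesize: int = 20000) -> int:
    return train_fine_tuned_shadow_models(
        number_of_points_to_synthesize=number_of_points_to_synthesize
    )

# Callers that omit the argument still get the original attack's 20,000 samples.
assert train_three_sets_of_shadow_models() == 20000
assert train_three_sets_of_shadow_models(number_of_points_to_synthesize=200) == 200
```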

src/midst_toolkit/attacks/ensemble/shadow_model_utils.py (19 additions, 16 deletions)

@@ -82,7 +82,7 @@ def train_tabddpm_and_synthesize(
     configs: TrainingConfig,
     save_dir: Path,
     synthesize: bool = True,
-    sample_scale: float = 1.0,
+    number_of_points_to_synthesize: int = 20000,
 ) -> TrainingResult:
     """
     Train a TabDDPM model on the provided training set and optionally synthesize data using the trained models.
@@ -92,8 +92,7 @@ def train_tabddpm_and_synthesize(
         configs: Configuration dictionary for TabDDPM.
         save_dir: Directory path where models and results will be saved.
         synthesize: Flag indicating whether to generate synthetic data after training. Defaults to True.
-        sample_scale: Factor to scale the number of synthesized samples relative to the training set size.
-            Defaults to 1.0.
+        number_of_points_to_synthesize: Number of synthetic data samples to be generated. Defaults to 20000.

     Returns:
         A dataclass TrainingResult object containing:
@@ -131,13 +130,14 @@ def train_tabddpm_and_synthesize(
     )

     if synthesize:
-        # By default, we want the length of the final synthetic data to be ``len(provided_synth_data) = 20,000``
-        # But with a smaller scale, we can generate less synthetic data for debugging purposes.
+        # By default, Ensemble attack generates a synthetic data of length ``20,000``.
         # Attack's default sample_scale is set to ``20000 / len(tables["trans"]["df"])`` to
-        # generate 20,000 samples regardless
-        # of the training data size.
-        # Sample scale is later multiplied by the size of training data (no id) to determine
+        # generate 20,000 samples regardless of the training data size. But we control the
+        # synthetic data size directly here with ``number_of_points_to_synthesize``.
+        # ``sample_scale`` is later multiplied by the size of training data (no id) to determine
         # the size of synthetic data.
+        assert len(tables["trans"].data) > 0, "Cannot synthesize: training data is empty"
+        sample_scale = number_of_points_to_synthesize / len(tables["trans"].data)
         cleaned_tables, _, _ = clava_synthesizing(
             tables,
             relation_order,
@@ -163,7 +163,7 @@ def fine_tune_tabddpm_and_synthesize(
     fine_tuning_diffusion_iterations: int = 100,
     fine_tuning_classifier_iterations: int = 10,
     synthesize: bool = True,
-    sample_scale: float = 1.0,
+    number_of_points_to_synthesize: int = 20000,
 ) -> TrainingResult:
     """
     Given the trained models and a new training set, fine-tune the TabDDPM models.
@@ -179,8 +179,8 @@ def fine_tune_tabddpm_and_synthesize(
         fine_tuning_classifier_iterations: Number of training iterations for the new classifier model.
             Defaults to 10.
         synthesize: Flag indicating whether to generate synthetic data after training. Defaults to True.
-        sample_scale: Factor to scale the number of synthesized samples relative to the training set size.
-            Defaults to 1.0.
+        number_of_points_to_synthesize: Number of synthetic data samples to be generated. Defaults to 20000.
+

     Returns:
         A dataclass TrainingResult object containing:
@@ -223,11 +223,14 @@ def fine_tune_tabddpm_and_synthesize(
     )

     if synthesize:
-        # By default, we want the length of the final synthetic data to be ``len(provided_synth_data) = 20,000``
-        # But with a smaller scale, we can generate less synthetic data for debugging purposes.
-        # Ensemble Attack's default sample_scale is ``20000 / len(tables["trans"]["df"])`` to generate 20,000 samples
-        # regardless of the train data size.
-        # Sample scale is later multiplied by the size of training data to determine the size of synthetic data.
+        # By default, Ensemble attack generates a synthetic data of length ``20,000``.
+        # Attack's default sample_scale is set to ``20000 / len(tables["trans"]["df"])`` to
+        # generate 20,000 samples regardless of the training data size. But we control the
+        # synthetic data size directly here with ``number_of_points_to_synthesize``.
+        # ``sample_scale`` is later multiplied by the size of training data (no id) to determine
+        # the size of synthetic data.
+        assert len(new_tables["trans"].data) > 0, "Cannot synthesize: training data is empty"
+        sample_scale = number_of_points_to_synthesize / len(new_tables["trans"].data)
         cleaned_tables, _, _ = clava_synthesizing(
             new_tables,
             relation_order,
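The heart of the fix is the two added lines that derive `sample_scale` from the requested sample count instead of taking it as an argument: since `clava_synthesizing` later multiplies `sample_scale` by the training-set size, setting `sample_scale = K / N` yields K synthetic rows for any training size N. A self-contained check of that arithmetic; `compute_sample_scale` and `synthesized_count` are hypothetical stand-ins, not midst_toolkit functions:

```python
# Illustrative check of the sample_scale arithmetic introduced above.
def compute_sample_scale(number_of_points_to_synthesize: int, train_size: int) -> float:
    assert train_size > 0, "Cannot synthesize: training data is empty"
    return number_of_points_to_synthesize / train_size

def synthesized_count(sample_scale: float, train_size: int) -> int:
    # The synthesizer multiplies sample_scale by the training-set size.
    return round(sample_scale * train_size)

# The old default (sample_scale=1.0) tied output size to training size;
# the new parameter pins it regardless of training size.
for train_size in (10, 1237, 60000):
    scale = compute_sample_scale(99, train_size)
    assert synthesized_count(scale, train_size) == 99
```

The integration tests in the next file pin exactly this behaviour, requesting 99 points and asserting 99 synthetic rows.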

tests/integration/attacks/ensemble/test_shadow_model_training.py (5 additions, 6 deletions)

@@ -55,6 +55,7 @@ def test_train_fine_tuned_shadow_models(cfg: DictConfig, tmp_path: Path) -> None
         table_name="trans",
         id_column_name="trans_id",
         pre_training_data_size=cfg.shadow_training.fine_tuning_config.pre_train_data_size,
+        number_of_points_to_synthesize=5,
         random_seed=cfg.random_seed,
     )
     # Expected saved models and synthesized data:
@@ -75,6 +76,7 @@ def test_train_fine_tuned_shadow_models(cfg: DictConfig, tmp_path: Path) -> None
         assert result.relation_order is not None
         assert result.all_group_lengths_probabilities is not None
         assert type(result.synthetic_data) is pd.DataFrame
+        assert len(result.synthetic_data) == 5

     # Fine tuning sets should be disjoint
     assert set(shadow_data["fine_tuning_sets"][0]).isdisjoint(set(shadow_data["fine_tuning_sets"][1]))
@@ -99,6 +101,7 @@ def test_train_shadow_on_half_challenge_data(cfg: DictConfig, tmp_path: Path) ->
         training_json_config_paths=cfg.shadow_training.training_json_config_paths,
         table_name="trans",
         id_column_name="trans_id",
+        number_of_points_to_synthesize=5,
         random_seed=cfg.random_seed,
     )
     # Expected saved models and synthesized data:
@@ -119,6 +122,7 @@ def test_train_shadow_on_half_challenge_data(cfg: DictConfig, tmp_path: Path) ->
         assert result.relation_order is not None
         assert result.all_group_lengths_probabilities is not None
         assert type(result.synthetic_data) is pd.DataFrame
+        assert len(result.synthetic_data) == 5

     # Training sets should be disjoint
     assert set(shadow_data["selected_sets"][0]).isdisjoint(set(shadow_data["selected_sets"][1]))
@@ -156,13 +160,8 @@ def test_train_and_fine_tune_tabddpm(cfg: DictConfig, tmp_path: Path) -> None:
     )

     train_result = train_tabddpm_and_synthesize(
-        train_set,
-        configs,
-        save_dir,
-        synthesize=True,
+        train_set, configs, save_dir, synthesize=True, number_of_points_to_synthesize=99
     )
-    # By default, with a sampling scale of 1, the size of the synthesized data is equal
-    # to the size of the training data.
     assert train_result.synthetic_data is not None
     assert type(train_result.synthetic_data) is pd.DataFrame
     assert len(train_result.synthetic_data) == 99
