@@ -29,6 +29,7 @@ def train_fine_tuned_shadow_models(
     table_name: str,
     id_column_name: str,
     pre_training_data_size: int = 60000,
+    number_of_points_to_synthesize: int = 20000,
     init_data_seed: int | None = None,
     random_seed: int | None = None,
 ) -> Path:
@@ -71,6 +72,8 @@ def train_fine_tuned_shadow_models(
         table_name: Name of the main table to be used for training the TabDDPM model.
         id_column_name: Name of the ID column in the data.
         pre_training_data_size: Size of the initial training set, defaults to 60,000.
+        number_of_points_to_synthesize: Number of synthetic data points to be generated by each shadow model,
+            defaults to 20,000.
         init_data_seed: Random seed for the initial training set.
         random_seed: Random seed used for reproducibility, defaults to None.
 
@@ -134,7 +137,10 @@ def train_fine_tuned_shadow_models(
             f"Initial model with ID {init_model_id} trained and saved at {initial_model_path}.",
         )
     else:
-        log(INFO, f"Initial model with ID {init_model_id} already exists, loading it from disk.")
+        log(
+            INFO,
+            f"Initial model with ID {init_model_id} already exists, loading it from disk.",
+        )
         with open(initial_model_path, "rb") as f:
             initial_model_training_results = pickle.load(f)
 
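Aside: the `else` branch above is one half of a cache-or-train pattern, reloading pickled results when the initial model already exists on disk. A minimal, self-contained sketch of that pattern, where the helper name and the `train_fn` callback are hypothetical and not part of this codebase:

```python
import pickle
from pathlib import Path
from typing import Any, Callable

def load_or_train(model_path: Path, train_fn: Callable[[], Any]) -> Any:
    """Hypothetical helper: reuse pickled results if present, else train and cache."""
    if model_path.exists():
        # Cached results exist; skip retraining and load them from disk.
        with open(model_path, "rb") as f:
            return pickle.load(f)
    results = train_fn()
    with open(model_path, "wb") as f:
        pickle.dump(results, f)
    return results
```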
@@ -171,8 +177,13 @@ def train_fine_tuned_shadow_models(
             fine_tuning_diffusion_iterations=fine_tuning_config.fine_tune_diffusion_iterations,
             fine_tuning_classifier_iterations=fine_tuning_config.fine_tune_classifier_iterations,
             synthesize=True,
+            number_of_points_to_synthesize=number_of_points_to_synthesize,
         )
         assert train_result.synthetic_data is not None, "Fine-tuned models should generate synthetic data."
+        log(
+            INFO,
+            f"Fine-tuned model {model_id} generated {len(train_result.synthetic_data)} synthetic samples.",
+        )
         attack_data["fine_tuned_results"].append(train_result)
 
     # Pickle dump the results
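Note on the `assert`: it only makes sense if `synthetic_data` is optional on the result object and is populated when `synthesize=True`; the assert narrows that optional away before `len(...)` is taken in the new log line. A sketch of the shape this code appears to assume, where the class and field names are illustrative guesses rather than the project's actual types:

```python
from dataclasses import dataclass

import pandas as pd

@dataclass
class ShadowTrainResult:
    """Hypothetical stand-in for the object returned by the training call."""
    model_id: int
    synthetic_data: pd.DataFrame | None = None  # filled only when synthesize=True
```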
@@ -191,6 +202,7 @@ def train_shadow_on_half_challenge_data(
     training_json_config_paths: DictConfig,
     table_name: str,
     id_column_name: str,
+    number_of_points_to_synthesize: int = 20000,
     random_seed: int | None = None,
 ) -> Path:
     """
@@ -214,6 +226,8 @@ def train_shadow_on_half_challenge_data(
             - tabddpm_training_config_path (str): Path to table's training config json file.
         table_name: Name of the main table to be used for training the TabDDPM model.
         id_column_name: Name of the ID column in the data.
+        number_of_points_to_synthesize: Number of synthetic data points to be generated by each shadow model,
+            defaults to 20,000.
         random_seed: Random seed used for reproducibility, defaults to None.
 
     Returns:
@@ -229,7 +243,8 @@ def train_shadow_on_half_challenge_data(
     selected_id_lists: list[list[int]] = [[] for _ in range(n_models)]
     # Assign each unique_id to half of the random lists
     for uid in unique_ids:
-        selected_lists = random.sample(range(n_models), half_models)  # Select 2 random list indices
+        # Select half_models random list indices (2 with the default of 4 models)
+        selected_lists = random.sample(range(n_models), half_models)
         for idx in selected_lists:
             selected_id_lists[idx].append(uid)
 
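The loop above gives a balanced membership design: each unique ID lands in the training set of exactly `half_models` of the `n_models` shadow models, so every challenge point has both IN and OUT shadow models. A standalone sketch (with toy IDs) that demonstrates the invariant:

```python
import random

n_models, half_models = 4, 2  # matches the defaults in this file
unique_ids = list(range(10))  # toy IDs for illustration

selected_id_lists: list[list[int]] = [[] for _ in range(n_models)]
for uid in unique_ids:
    # Assign each ID to exactly half of the shadow models.
    for idx in random.sample(range(n_models), half_models):
        selected_id_lists[idx].append(uid)

# Every ID appears in exactly half of the model training sets.
assert all(
    sum(uid in ids for ids in selected_id_lists) == half_models for uid in unique_ids
)
```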
@@ -273,6 +288,12 @@ def train_shadow_on_half_challenge_data(
             configs,
             save_dir,
             synthesize=True,
+            number_of_points_to_synthesize=number_of_points_to_synthesize,
+        )
+        assert train_result.synthetic_data is not None, "Trained shadow model did not generate synthetic data."
+        log(
+            INFO,
+            f"Trained shadow model {model_id} generated {len(train_result.synthetic_data)} synthetic samples.",
         )
 
         attack_data["trained_results"].append(train_result)
@@ -295,6 +316,7 @@ def train_three_sets_of_shadow_models(
     id_column_name: str,
     n_models_per_set: int = 4,
     n_reps: int = 12,
+    number_of_points_to_synthesize: int = 20000,
     random_seed: int | None = None,
 ) -> tuple[Path, Path, Path]:
     """
@@ -342,6 +364,8 @@ def train_three_sets_of_shadow_models(
         id_column_name: Name of the ID column in the data.
         n_models_per_set: Number of shadow models to train by each approach. Must be an even number. Defaults to 4.
         n_reps: Number of repetitions for each challenge point in the fine-tuning or training sets, defaults to 12.
+        number_of_points_to_synthesize: Number of synthetic data points to be generated by each shadow model,
+            defaults to 20,000.
         random_seed: Random seed used for reproducibility, defaults to None.
 
     Returns:
@@ -365,6 +389,7 @@ def train_three_sets_of_shadow_models(
         table_name=table_name,
         id_column_name=id_column_name,
         pre_training_data_size=fine_tuning_config.pre_train_data_size,
+        number_of_points_to_synthesize=number_of_points_to_synthesize,
         init_data_seed=random_seed,
         random_seed=random_seed,
     )
@@ -387,6 +412,7 @@ def train_three_sets_of_shadow_models(
         table_name=table_name,
         id_column_name=id_column_name,
         pre_training_data_size=fine_tuning_config.pre_train_data_size,
+        number_of_points_to_synthesize=number_of_points_to_synthesize,
         # Setting a different seed for the second train set
         init_data_seed=random_seed + 1 if random_seed is not None else None,
         random_seed=random_seed,
@@ -405,6 +431,7 @@ def train_three_sets_of_shadow_models(
         training_json_config_paths=training_json_config_paths,
         table_name=table_name,
         id_column_name=id_column_name,
+        number_of_points_to_synthesize=number_of_points_to_synthesize,
         random_seed=random_seed,
     )
     log(
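Taken together, the new `number_of_points_to_synthesize` argument threads from `train_three_sets_of_shadow_models` into all three shadow-model builders. A hedged call sketch: only the parameters visible in this diff are shown, the remaining config arguments are elided, and all values are placeholders.

```python
fine_tuned_path, second_set_path, half_challenge_path = train_three_sets_of_shadow_models(
    ...,  # config arguments elided (e.g. fine_tuning_config, training_json_config_paths)
    table_name="my_table",
    id_column_name="my_id",
    n_models_per_set=4,   # must be an even number
    n_reps=12,
    number_of_points_to_synthesize=20000,  # each shadow model synthesizes 20k points
    random_seed=42,
)
```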