-
Notifications
You must be signed in to change notification settings - Fork 1
Ft/ensemble experiments test script #96
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 14 commits
27fa9c8
f64b650
0d4a3b6
9c1217d
0a12084
81c0bb3
a19a595
e93b3c1
70cb1af
942475d
de1cb95
819826c
59e52d4
cb7e65c
c58db99
1df7528
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,108 @@ | ||
| # Ensemble experiment configuration | ||
| # This config can be used to run both the Ensemble attack training (``run_attack.py``) and testing phases (``test_attack_model.py``). | ||
| base_experiment_dir: examples/ensemble_attack/tabddpm_20k_experiment_data # Processed data, and experiment artifacts will be stored here | ||
| base_data_config_dir: examples/ensemble_attack/data_configs # Training and data type configs are saved under this directory | ||
|
|
||
| # Pipeline control | ||
| pipeline: | ||
| run_data_processing: true # Set this to false if you have already saved the processed data | ||
| run_shadow_model_training: true # Set this to false if shadow models are already trained and saved | ||
| run_metaclassifier_training: true | ||
|
|
||
| target_model: # This is only used for testing the attack on a real target model. | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We are attacking the target model |
||
| # This is for models trained on 20k data and generating 20k synthetic data | ||
| target_model_directory: /projects/midst-experiments/all_tabddpms/tabddpm_trained_with_20k/train/ | ||
| target_model_id: 21 # Will be overridden per SLURM array task | ||
| target_model_name: tabddpm_${target_model.target_model_id} | ||
| target_synthetic_data_path: ${target_model.target_model_directory}/${target_model.target_model_name}/synthetic_data/20k/20k.csv | ||
| challenge_data_path: ${target_model.target_model_directory}/${target_model.target_model_name}/challenge_with_id.csv | ||
| challenge_label_path: ${target_model.target_model_directory}/${target_model.target_model_name}/challenge_label.csv | ||
|
|
||
| target_attack_artifact_dir: ${base_experiment_dir}/target_${target_model.target_model_id}_attack_artifacts/ | ||
| attack_probabilities_result_path: ${target_model.target_attack_artifact_dir}/attack_model_${target_model.target_model_id}_proba | ||
| target_shadow_models_output_path: ${target_model.target_attack_artifact_dir}/tabddpm_${target_model.target_model_id}_shadows_dir | ||
|
|
||
|
|
||
| # Data paths | ||
| data_paths: | ||
| midst_data_path: /projects/midst-experiments/all_tabddpms # Used to collect the data | ||
| population_path: ${base_experiment_dir}/population_data # Path where the collected population data will be stored | ||
| processed_attack_data_path: ${base_experiment_dir}/attack_data # Path where the processed attack real train and evaluation data is stored | ||
| attack_evaluation_result_path: ${base_experiment_dir}/evaluation_results # Path where the attack evaluation results will be stored | ||
|
|
||
| model_paths: | ||
| metaclassifier_model_path: ${base_experiment_dir}/trained_models # Path where the trained metaclassifier model will be saved | ||
|
|
||
|
|
||
| # Dataset specific information used for processing in this example | ||
| data_processing_config: | ||
| population_attack_data_types_to_collect: | ||
| [ | ||
| "tabddpm_trained_with_20k", | ||
| ] | ||
| challenge_attack_data_types_to_collect: | ||
| [ | ||
| "tabddpm_trained_with_20k", | ||
| ] | ||
| population_splits: ["train"] # Data splits to be collected for population data | ||
| challenge_splits: ["train"] # Data splits to be collected for challenge points | ||
| # The column name in the data to be used for stratified splitting. | ||
| column_to_stratify: "trans_type" # Attention: This value is not documented in the original codebase. | ||
| folder_ranges: # Specify folder ranges for any of the mentioned splits. | ||
| train: [[1, 20]] # Folders to be used for train data collection in the experiments | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Model IDs used for training the metaclassifier (attack model).
fatemetkl marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| # File names in MIDST data directories. | ||
| single_table_train_data_file_name: "train_with_id.csv" | ||
| multi_table_train_data_file_name: "trans.csv" | ||
| challenge_data_file_name: "challenge_with_id.csv" | ||
| population_sample_size: 40000 # Population size is the total data that your attack has access to. | ||
| # In experiments, this is sampled out of all the collected training data in case the available data | ||
| # is more than this number. Note that, half of this data is actually used for training, the other half | ||
| # is used for evaluation. For example, with 40k population size, only 20k is used for training the attack model. | ||
| # TODO: make sure to consider this in experiments. | ||
|
|
||
| # Training and data settings for shadow models (temporary, numbers subject to change) | ||
| shadow_training: | ||
| # Data Config files path used for training a TabDDPM model | ||
| training_json_config_paths: # Config json files used for tabddpm training on the trans table | ||
| table_domain_file_path: ${base_data_config_dir}/trans_domain.json | ||
| dataset_meta_file_path: ${base_data_config_dir}/dataset_meta.json | ||
| tabddpm_training_config_path: ${base_data_config_dir}/trans.json | ||
| # Model training artifacts are saved under shadow_models_data_path/workspace_name/exp_name | ||
| # Also, training configs for each shadow model are created under shadow_models_data_path. | ||
| shadow_models_output_path: ${base_experiment_dir}/shadow_models_and_data | ||
| target_model_output_path: ${base_experiment_dir}/shadow_target_model_and_data | ||
| # Paths to final shadow models used for metaclassifier training (relative to shadow_models_output_path) | ||
| # These paths are a result of running the shadow model training pipeline, specifically the | ||
| # train_three_sets_of_shadow_models in shadow_model_training.py | ||
| # Each .pkl file contains the training data, trained model and training results for all shadow models in a list. | ||
| final_shadow_models_path: [ | ||
| "${shadow_training.shadow_models_output_path}/initial_model_rmia_1/shadow_workspace/pre_trained_model/rmia_shadows.pkl", | ||
| "${shadow_training.shadow_models_output_path}/initial_model_rmia_2/shadow_workspace/pre_trained_model/rmia_shadows.pkl", | ||
| "${shadow_training.shadow_models_output_path}/shadow_model_rmia_third_set/shadow_workspace/trained_model/rmia_shadows_third_set.pkl", | ||
| ] | ||
| target_synthetic_data_path: ${shadow_training.target_model_output_path}/target_synthetic_data.csv | ||
| # Path to final shadow target's synthetic data (relative to target_model_output_path) | ||
| fine_tuning_config: | ||
| fine_tune_diffusion_iterations: 200000 # Original code: 200000 | ||
| fine_tune_classifier_iterations: 20000 # Original code: 20000 | ||
| pre_train_data_size: 60000 # Original code: 60000 | ||
lotif marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| number_of_points_to_synthesize: 20000 # Number of synthetic data samples to be generated by shadow models. | ||
| # Original code: 20000 | ||
|
|
||
|
|
||
| # Metaclassifier settings | ||
| metaclassifier: | ||
| # Data types json file is used for xgboost model training. | ||
| data_types_file_path: ${base_data_config_dir}/data_types.json | ||
| model_type: "xgb" | ||
| # Model training parameters | ||
| num_optuna_trials: 100 # Original code: 100 | ||
| num_kfolds: 5 | ||
| use_gpu: false | ||
| # Temporary. Might remove having an epoch parameter. | ||
| epochs: 1 | ||
| meta_classifier_model_name: ${metaclassifier.model_type}_metaclassifier_model | ||
|
|
||
|
|
||
| # General settings | ||
| random_seed: 42 # Set to null for no seed, or an integer for a fixed seed | ||
This file was deleted.
Uh oh!
There was an error while loading. Please reload this page.