diff --git a/.gitignore b/.gitignore index 611e0bc1..9c5092f2 100644 --- a/.gitignore +++ b/.gitignore @@ -47,3 +47,7 @@ examples/training/single_table/data/** examples/training/single_table/results/** examples/training/multi_table/data/** examples/training/multi_table/results/** +examples/synthesizing/single_table/data/** +examples/synthesizing/single_table/results/** +examples/synthesizing/multi_table/data/** +examples/synthesizing/multi_table/results/** diff --git a/examples/synthesizing/multi_table/README.md b/examples/synthesizing/multi_table/README.md new file mode 100644 index 00000000..737b49ec --- /dev/null +++ b/examples/synthesizing/multi_table/README.md @@ -0,0 +1,60 @@ +# Multi-Table Synthesizing Example + +This example will go over synthesizing data for a multi-table dataset from the ground +up using the code in this toolkit. + + +## Downloading data + +First, we need the data. Download it from this +[Google Drive link](https://drive.google.com/file/d/1Ao222l4AJjG54-HDEGCWkIfzRbl9_IKa/view?usp=drive_link), +extract the files and place them in a `/data` folder within this folder +(`examples/synthesizing/multi_table`). + +> [!NOTE] +> If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute +> of the [`config.yaml`](config.yaml) file. + +It will contain data for 8 tables: `account`, `card`, `client`, `disp`, `district`, `loan`, `order`, +and `trans`. For each table there will be two files: +- `{table_name}.csv`: The table's data. +- `{table_name}_domain.json`: Metadata about the columns in the table's data, such as data types and sizes. + +Additionally, you will find one more file: +- `dataset_meta.json`: Metadata about the relationship between the tables. It will describe which tables +are associated with which other tables. + + +## Kicking off synthesizing + +If there is a `/results` folder within this folder (`examples/synthesizing/multi_table`) +from a previous training run, we will use that data to kick off synthesizing. +For example, you can copy the results from another run (e.g. `examples.training.multi_table.run_training`) +and paste them here and it will be picked up by this example. + +The [`config.yaml`](config.yaml) file contains the parameters for the synthesizing and also +for training, in case there is a need to run that. Please take a look at them before kicking +off the synthesizing process and edit them as necessary. + +To kick off synthesizing, simply run the command below from the project's root folder: + +```bash +python -m examples.synthesizing.multi_table.run_synthesizing +``` + +## Results + +It will save the result files inside a `/results` folder within this folder +(`examples/synthesizing/multi_table`). + +> [!NOTE] +> If you wish to change the save folder, you can do so by editing the `results_dir` attribute +> of the [`config.yaml`](config.yaml) file. + +In the `/results/before_matching/` folder, there will be a file called `synthetic_tables.pkl`, +which is a pickle file containing the synthetic data before the matching process, in case +it's needed. + +The `/results/multi_table_synthesizing` folder will contain the final synthesized +data, organized per table, in the form of `.csv` files with the following naming pattern: +`/results/multi_table_synthesizing/{table_name}/_final/{table_name}_synthetic.csv`. 
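If you need to inspect these outputs programmatically, a minimal sketch along the following lines should work. It assumes `pandas` is available, the default `results_dir`, and that the pickle simply holds the pre-matching synthetic tables; none of these names are enforced beyond what the README above describes, so adjust as needed:

```python
import pickle
from pathlib import Path

import pandas as pd

results_dir = Path("examples/synthesizing/multi_table/results")

# Synthetic data saved before the matching step; the exact structure of the
# pickled object may vary by toolkit version, so inspect it interactively.
with open(results_dir / "before_matching" / "synthetic_tables.pkl", "rb") as f:
    synthetic_tables = pickle.load(f)

# Final synthesized data for one of the tables, e.g. `trans`.
trans_synthetic = pd.read_csv(
    results_dir / "multi_table_synthesizing" / "trans" / "_final" / "trans_synthetic.csv"
)
print(trans_synthetic.head())
```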
diff --git a/examples/synthesizing/multi_table/config.yaml b/examples/synthesizing/multi_table/config.yaml new file mode 100644 index 00000000..bbfcff30 --- /dev/null +++ b/examples/synthesizing/multi_table/config.yaml @@ -0,0 +1,49 @@ +# Training example configuration +# Base data directory (can be overridden from command line) +base_data_dir: examples/synthesizing/multi_table/data +results_dir: examples/synthesizing/multi_table/results + +# diffusion_config, clustering_config, and classifier_config are only required +# when training a new model from scratch +diffusion_config: + d_layers: [512, 1024, 1024, 1024, 1024, 512] + dropout: 0.0 + num_timesteps: 2000 + model_type: mlp + iterations: 20000 + batch_size: 4096 + lr: 0.0006 + gaussian_loss_type: mse + weight_decay: 1e-05 + scheduler: cosine + data_split_ratios: [0.99, 0.005, 0.005] + +clustering_config: + parent_scale: 1.0 + num_clusters: 50 + clustering_method: kmeans_and_gmm + +classifier_config: + d_layers: [128, 256, 512, 1024, 512, 256, 128] + lr: 0.0001 + dim_t: 128 + batch_size: 4096 + iterations: 20000 + +# Synthesizing configuration +general_config: + data_dir: examples/synthesizing/multi_table/data + test_data_dir: examples/synthesizing/multi_table/data + exp_name: multi_table_synthesizing + workspace_dir: examples/synthesizing/multi_table/results + sample_prefix: "" + +sampling_config: + batch_size: 20000 + classifier_scale: 1.0 + +matching_config: + num_matching_clusters: 1 + matching_batch_size: 1000 + unique_matching: True + no_matching: False diff --git a/examples/synthesizing/multi_table/run_synthesizing.py b/examples/synthesizing/multi_table/run_synthesizing.py new file mode 100644 index 00000000..9d845e2e --- /dev/null +++ b/examples/synthesizing/multi_table/run_synthesizing.py @@ -0,0 +1,88 @@ +import pickle +from logging import INFO +from pathlib import Path +from typing import Any + +import hydra +from omegaconf import DictConfig + +from examples.training.multi_table import run_training +from midst_toolkit.common.config import GeneralConfig, MatchingConfig, SamplingConfig +from midst_toolkit.common.logger import TOOLKIT_LOGGER, log +from midst_toolkit.models.clavaddpm.data_loaders import load_tables +from midst_toolkit.models.clavaddpm.enumerations import Relation +from midst_toolkit.models.clavaddpm.synthesizer import clava_synthesizing + + +# Preventing some excessive logging +TOOLKIT_LOGGER.setLevel(INFO) + + +@hydra.main(config_path=".", config_name="config", version_base=None) +def main(config: DictConfig) -> None: + """ + Run the synthesizing pipeline for a multi-table diffusion model. + + It will load the config and then data from the `config.base_data_dir` folder, + train the model, synthesize the data and save the results in the + `config.results_dir` folder. + + It will first look for a pre-trained model in the `config.results_dir` folder. + If it doesn't find one, it will train a new model from scratch. + + Args: + config: Training and synthesizing configuration as an OmegaConf DictConfig object. 
+ """ + log(INFO, f"Checking for a pre-trained model in {config.results_dir}...") + + _, relation_order, _ = load_tables(Path(config.base_data_dir)) + + model_file_paths: dict[Relation, dict[str, Any]] = {} + for relation in relation_order: + model_file_path = Path(config.results_dir) / "models" / f"{relation[0]}_{relation[1]}_ckpt.pkl" + model_file_paths[relation] = { + "file_path": model_file_path, + "exists": model_file_path.exists(), + } + + clustering_results_file = Path(config.results_dir) / "cluster_ckpt.pkl" + + if all(result["exists"] for result in model_file_paths.values()) and clustering_results_file.exists(): + log(INFO, f"Found previous results in {config.results_dir}. Skipping training.") + else: + log(INFO, "Not all previous results found. Training a new model from scratch.") + log(INFO, f"Summary of results: {model_file_paths}") + log(INFO, f"Clustering results file: {clustering_results_file} exists? {clustering_results_file.exists()}") + run_training.main(config) + + log(INFO, "Loading models...") + + models = {} + for relation in relation_order: + with open(model_file_paths[relation]["file_path"], "rb") as f: + models[relation] = pickle.load(f) + + with open(clustering_results_file, "rb") as f: + clustering_result = pickle.load(f) + + tables = clustering_result["tables"] + all_group_lengths_prob_dicts = clustering_result["all_group_lengths_prob_dicts"] + + log(INFO, "Synthesizing data...") + + clava_synthesizing( + tables, + relation_order, + Path(config.results_dir), + models, + GeneralConfig(**config.general_config), + SamplingConfig(**config.sampling_config), + MatchingConfig(**config.matching_config), + all_group_lengths_prob_dicts, + ) + + log(INFO, "Data synthesized successfully.") + + +if __name__ == "__main__": + main() diff --git a/examples/synthesizing/single_table/README.md b/examples/synthesizing/single_table/README.md new file mode 100644 index 00000000..5f6f1f51 --- /dev/null +++ b/examples/synthesizing/single_table/README.md @@ -0,0 +1,58 @@ +# Single-Table Synthesizing Example + +This example will go over synthesizing data for a single-table dataset from the ground +up using the code in this toolkit. + + +## Downloading data + +First, we need the data. Download it from this +[Google Drive link](https://drive.google.com/file/d/1J5qDuMHHg4dm9c3ISmb41tcTHSu1SVUC/view?usp=drive_link), +extract the files and place them in a `/data` folder within this folder +(`examples/synthesizing/single_table`). + +> [!NOTE] +> If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute +> of the [`config.yaml`](config.yaml) file. + +Here is a description of the files that have been extracted: +- `trans.csv`: The training data. It consists of information about bank transactions and it +contains 20,000 data points. +- `trans_domain.json`: Metadata about the columns in `trans.csv`, such as data types and sizes. +- `dataset_meta.json`: Metadata about the relationship between the tables. Since this is a +single-table example, it will only contain information about the `trans` table. + + +## Kicking off synthesizing + +If there is a `/results` folder within this folder (`examples/synthesizing/single_table`) +from a previous training run, we will use that data to kick off synthesizing. +For example, you can copy the results from another run (e.g. `examples.training.single_table.run_training`) +and paste them here and it will be picked up by this example. 
+ +The [`config.yaml`](config.yaml) file contains the parameters for the synthesizing and also +for training, in case there is a need to run that. Please take a look at them before kicking +off the synthesizing process and edit them as necessary. + +To kick off synthesizing, simply run the command below from the project's root folder: + +```bash +python -m examples.synthesizing.single_table.run_synthesizing +``` + +## Results + +It will save the result files inside a `/results` folder within this folder +(`examples/synthesizing/single_table`). + +> [!NOTE] +> If you wish to change the save folder, you can do so by editing the `results_dir` attribute +> of the [`config.yaml`](config.yaml) file. + +In the `/results/before_matching/` folder, there will be a file called `synthetic_tables.pkl`, +which is a pickle file containing the synthetic data before the matching process, in case +it's needed. + +The `/results/single_table_synthesizing` folder will contain the final synthesized +data, organized per table. In this single-table example, there is only going to be one +synthesized table under `/results/single_table_synthesizing/trans/_final/trans_synthetic.csv`. diff --git a/examples/synthesizing/single_table/config.yaml b/examples/synthesizing/single_table/config.yaml new file mode 100644 index 00000000..b3cbb0e2 --- /dev/null +++ b/examples/synthesizing/single_table/config.yaml @@ -0,0 +1,36 @@ +# Training example configuration +# Base data directory (can be overridden from command line) +base_data_dir: examples/synthesizing/single_table/data +results_dir: examples/synthesizing/single_table/results + +# diffusion_config is only required when training a new model from scratch +diffusion_config: + d_layers: [512, 1024, 1024, 1024, 1024, 512] + dropout: 0.0 + num_timesteps: 2000 + model_type: mlp + iterations: 20000 + batch_size: 4096 + lr: 0.0006 + gaussian_loss_type: mse + weight_decay: 1e-05 + scheduler: cosine + data_split_ratios: [0.99, 0.005, 0.005] + +# Synthesizing configuration +general_config: + data_dir: examples/synthesizing/single_table/data + test_data_dir: examples/synthesizing/single_table/data + exp_name: single_table_synthesizing + workspace_dir: examples/synthesizing/single_table/results + sample_prefix: "" + +sampling_config: + batch_size: 20000 + classifier_scale: 1.0 + +matching_config: + num_matching_clusters: 1 + matching_batch_size: 1000 + unique_matching: True + no_matching: False diff --git a/examples/synthesizing/single_table/run_synthesizing.py b/examples/synthesizing/single_table/run_synthesizing.py new file mode 100644 index 00000000..b9f6a649 --- /dev/null +++ b/examples/synthesizing/single_table/run_synthesizing.py @@ -0,0 +1,84 @@ +import pickle +from logging import INFO +from pathlib import Path +from typing import Any + +import hydra +from omegaconf import DictConfig + +from examples.training.single_table import run_training +from midst_toolkit.common.config import GeneralConfig, MatchingConfig, SamplingConfig +from midst_toolkit.common.logger import TOOLKIT_LOGGER, log +from midst_toolkit.models.clavaddpm.data_loaders import load_tables +from midst_toolkit.models.clavaddpm.enumerations import Relation +from midst_toolkit.models.clavaddpm.synthesizer import clava_synthesizing + + +# Preventing some excessive logging +TOOLKIT_LOGGER.setLevel(INFO) + + +@hydra.main(config_path=".", config_name="config", version_base=None) +def main(config: DictConfig) -> None: + """ + Run the synthesizing pipeline for a single-table diffusion model. 
+ + It will load the config and then data from the `config.base_data_dir` folder, + train the model, synthesize the data and save the results in the + `config.results_dir` folder. + + It will first look for a pre-trained model in the `config.results_dir` folder. + If it doesn't find one, it will train a new model from scratch. + + Args: + config: Training and synthesizing configuration as an OmegaConf DictConfig object. + """ + log(INFO, f"Checking for a pre-trained model in {config.results_dir}...") + + tables, relation_order, _ = load_tables(Path(config.base_data_dir)) + + assert len(relation_order) == 1 and relation_order[0][0] is None, ( + "Relation order is not configured for single-table. " + "For multi-table synthesizing, please use the `examples.synthesizing.multi_table.run_synthesizing` example. " + f"Relation order: {relation_order}" + ) + + model_file_paths: dict[Relation, dict[str, Any]] = {} + for relation in relation_order: + model_file_path = Path(config.results_dir) / "models" / f"{relation[0]}_{relation[1]}_ckpt.pkl" + model_file_paths[relation] = { + "file_path": model_file_path, + "exists": model_file_path.exists(), + } + + if all(result["exists"] for result in model_file_paths.values()): + log(INFO, f"Found previous results in {config.results_dir}. Skipping training.") + else: + log(INFO, "Not all previous results found. Training a new model from scratch.") + log(INFO, f"Summary of results: {model_file_paths}") + run_training.main(config) + + log(INFO, "Loading models...") + + models = {} + for relation in relation_order: + with open(model_file_paths[relation]["file_path"], "rb") as f: + models[relation] = pickle.load(f) + + log(INFO, "Synthesizing data...") + + clava_synthesizing( + tables, + relation_order, + Path(config.results_dir), + models, + GeneralConfig(**config.general_config), + SamplingConfig(**config.sampling_config), + MatchingConfig(**config.matching_config), + ) + + log(INFO, "Data synthesized successfully.") + + +if __name__ == "__main__": + main() diff --git a/examples/training/multi_table/README.md b/examples/training/multi_table/README.md index 31791112..31e2584a 100644 --- a/examples/training/multi_table/README.md +++ b/examples/training/multi_table/README.md @@ -13,7 +13,7 @@ extract the files and place them in a `/data` folder in within this folder > [!NOTE] > If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute -> of the (`config.yaml`)[config.yaml] file. +> of the [`config.yaml`](config.yaml) file. It will contain data for 8 tables: `account`, `card`, `client`, `disp`, `district`, `loan`, `order`, and `trans`. For each table there will be two files: @@ -44,7 +44,7 @@ The result files will be saved inside a `/results` folder within this folder > [!NOTE] > If you wish to change the save folder, you can do so by editing the `results_dir` attribute -> of the (`config.yaml`)[config.yaml] file. +> of the [`config.yaml`](config.yaml) file. One of the results file is `/results/cluster_ckpt.pkl`, which will contain the results of the clustering step. diff --git a/examples/training/single_table/README.md b/examples/training/single_table/README.md index ac6fa12b..683e7431 100644 --- a/examples/training/single_table/README.md +++ b/examples/training/single_table/README.md @@ -13,7 +13,7 @@ extract the files and place them in a `/data` folder in within this folder > [!NOTE] > If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute -> of the (`config.yaml`)[config.yaml] file. 
+> of the [`config.yaml`](config.yaml) file. Here is a description of the files that have been extracted: - `trans.csv`: The training data. It consists of information about bank transactions and it contains 20,000 data points. - `trans_domain.json`: Metadata about the columns in `trans.csv`, such as data types and sizes. - `dataset_meta.json`: Metadata about the relationship between the tables. Since this is a single-table example, it will only contain information about the `trans` table. @@ -42,7 +42,7 @@ The result files will be saved inside a `/results` folder within this folder > [!NOTE] > If you wish to change the save folder, you can do so by editing the `results_dir` attribute -> of the (`config.yaml`)[config.yaml] file. +> of the [`config.yaml`](config.yaml) file. In the `/results/models/` folder, there will be a file called `None_trans_ckpt.pkl`, which is a pickle file containing the training results. You can load it using Python's diff --git a/examples/training/single_table/config.yaml b/examples/training/single_table/config.yaml index 7fc54556..719be647 100644 --- a/examples/training/single_table/config.yaml +++ b/examples/training/single_table/config.yaml @@ -8,7 +8,7 @@ diffusion_config: dropout: 0.0 num_timesteps: 2000 model_type: mlp - iterations: 200000 + iterations: 20000 batch_size: 4096 lr: 0.0006 gaussian_loss_type: mse diff --git a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py index 1678a3cd..f1693c8e 100644 --- a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py +++ b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py @@ -142,11 +142,11 @@ def train_tabddpm_and_synthesize( tables, relation_order, save_dir, - all_group_lengths_prob_dicts, models, configs.general, configs.sampling, configs.matching, + all_group_lengths_prob_dicts, sample_scale=sample_scale, ) @@ -235,11 +235,11 @@ def fine_tune_tabddpm_and_synthesize( new_tables, relation_order, save_dir, - all_group_lengths_prob_dicts, new_models, configs.general, configs.sampling, configs.matching, + all_group_lengths_prob_dicts, sample_scale=sample_scale, ) diff --git a/src/midst_toolkit/models/clavaddpm/clustering.py b/src/midst_toolkit/models/clavaddpm/clustering.py index 1299d85b..b841ec6d 100644 --- a/src/midst_toolkit/models/clavaddpm/clustering.py +++ b/src/midst_toolkit/models/clavaddpm/clustering.py @@ -59,6 +59,8 @@ def clava_clustering( "tables": tables, "all_group_lengths_prob_dicts": all_group_lengths_prob_dicts, } + + save_dir.mkdir(parents=True, exist_ok=True) with open(save_dir / "cluster_ckpt.pkl", "wb") as f: pickle.dump(cluster_ckpt, f) diff --git a/src/midst_toolkit/models/clavaddpm/data_loaders.py b/src/midst_toolkit/models/clavaddpm/data_loaders.py index 008e0d6c..2cb10746 100644 --- a/src/midst_toolkit/models/clavaddpm/data_loaders.py +++ b/src/midst_toolkit/models/clavaddpm/data_loaders.py @@ -96,7 +96,7 @@ def load_tables( with open(data_dir / "dataset_meta.json", "r") as f: dataset_meta = json.load(f) - relation_order = dataset_meta["relation_order"] + relation_order = [tuple(relation) for relation in dataset_meta["relation_order"]] tables = {} diff --git a/src/midst_toolkit/models/clavaddpm/synthesizer.py b/src/midst_toolkit/models/clavaddpm/synthesizer.py index 2204ec52..44741bcd 100644 --- a/src/midst_toolkit/models/clavaddpm/synthesizer.py +++ b/src/midst_toolkit/models/clavaddpm/synthesizer.py @@ -17,7 +17,7 @@ from midst_toolkit.common.config import GeneralConfig, MatchingConfig, SamplingConfig from midst_toolkit.common.enumerations import DataSplit from midst_toolkit.common.logger import log -from midst_toolkit.models.clavaddpm.data_loaders import Tables +from midst_toolkit.models.clavaddpm.data_loaders import NO_PARENT_COLUMN_NAME, Tables from midst_toolkit.models.clavaddpm.dataset import 
Dataset, TableMetadata, Transformations from midst_toolkit.models.clavaddpm.enumerations import ( CategoricalEncoding, @@ -626,9 +626,9 @@ def sample_from_dict(probabilities: dict[int, float]) -> int: Returns: The sampled key. """ - assert sum(probabilities.values()) == 1.0, "The sum of all probabilities must be 1.0." + assert np.isclose(sum(probabilities.values()), 1), "The sum of all probabilities must be 1." - # Generate a random number between 0 and 1 + # Generate a random number between [0, 1) random_number = random.random() # Initialize cumulative sum and the selected key @@ -711,11 +711,11 @@ def clava_synthesizing( tables: Tables, relation_order: RelationOrder, save_dir: Path, - all_group_lengths_prob_dicts: GroupLengthsProbDicts, models: dict[Relation, ModelArtifacts], general_config: GeneralConfig, sampling_config: SamplingConfig, matching_config: MatchingConfig, + all_group_lengths_prob_dicts: GroupLengthsProbDicts | None = None, sample_scale: float = 1.0, ) -> tuple[dict[str, pd.DataFrame], float, float]: """ @@ -726,12 +726,13 @@ def clava_synthesizing( tables: Tables containing dataframes and clustering information. relation_order: List of parent-child table relationships. save_dir: Directory to save intermediate and final results. - all_group_lengths_prob_dicts: Dictionary containing group length probabilities for each - parent-child relationship. models: Trained models for each parent-child relationship. general_config: General configuration settings. sampling_config: Configuration settings for sampling. matching_config: Configuration settings for matching. + all_group_lengths_prob_dicts: Dictionary containing group length probabilities for each + parent-child relationship. Optional for single-table synthesizing, required for + multi-table synthesizing. Defaults to None. sample_scale: Scale factor for the number of samples to generate based on the train data size. Defaults to 1.0. @@ -754,6 +755,10 @@ def clava_synthesizing( log(INFO, "Sample size: {}".format(int(sample_scale * len(df_without_id)))) if parent is None: + # Adding the no parent placeholder column in case it doesn't have it + if NO_PARENT_COLUMN_NAME not in df_without_id.columns: + df_without_id[NO_PARENT_COLUMN_NAME] = list(range(len(df_without_id))) + # synthesize data for single table or tables with no parent synthesized_df, table_keys = _synthesize_single_table( child, @@ -762,7 +767,12 @@ def clava_synthesizing( sample_scale, sampling_config.batch_size, ) + else: + assert all_group_lengths_prob_dicts is not None, ( + "all_group_lengths_prob_dicts is required for multi-table synthesizing." 
+ ) + # Finding previously synthesized data and training results for the parent parent_synthetic_data = None parent_training_results = None diff --git a/tests/integration/models/clavaddpm/test_model.py b/tests/integration/models/clavaddpm/test_model.py index 4bb3b965..43689ff0 100644 --- a/tests/integration/models/clavaddpm/test_model.py +++ b/tests/integration/models/clavaddpm/test_model.py @@ -122,7 +122,7 @@ def test_load_single_table(): }, ) - assert relation_order == [[None, "trans"]] + assert relation_order == [(None, "trans")] assert dataset_meta["relation_order"] == [[None, "trans"]] assert dataset_meta["tables"] == {"trans": {"children": [], "parents": []}} @@ -225,7 +225,7 @@ def test_load_tables(): }, ) - assert relation_order == [[None, "account"], ["account", "trans"]] + assert relation_order == [(None, "account"), ("account", "trans")] assert dataset_meta["relation_order"] == [[None, "account"], ["account", "trans"]] assert dataset_meta["tables"] == { "account": {"children": ["trans"], "parents": []}, diff --git a/tests/integration/models/clavaddpm/test_synthesizer.py b/tests/integration/models/clavaddpm/test_synthesizer.py index 267160df..ae88477e 100644 --- a/tests/integration/models/clavaddpm/test_synthesizer.py +++ b/tests/integration/models/clavaddpm/test_synthesizer.py @@ -92,11 +92,11 @@ def test_clava_synthesize_multi_table(tmp_path: Path): tables, relation_order, tmp_path, - all_group_lengths_prob_dicts, models[1], synthesizing_config, SAMPLING_CONFIG, MATCHING_CONFIG, + all_group_lengths_prob_dicts, ) # Assert