From 2550c85c8f233f56780498c89c0ae40c2e07294a Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Tue, 11 Nov 2025 16:27:41 -0500 Subject: [PATCH 01/23] WIP needs to save results --- .gitignore | 6 +++ examples/training/single_table/README.md | 38 +++++++++++++++++ examples/training/single_table/config.yaml | 17 ++++++++ .../training/single_table/run_training.py | 42 +++++++++++++++++++ 4 files changed, 103 insertions(+) create mode 100644 examples/training/single_table/README.md create mode 100644 examples/training/single_table/config.yaml create mode 100644 examples/training/single_table/run_training.py diff --git a/.gitignore b/.gitignore index dde72bbc..611e0bc1 100644 --- a/.gitignore +++ b/.gitignore @@ -41,3 +41,9 @@ outputs/ # mkdocs site site/ + +# Training examples +examples/training/single_table/data/** +examples/training/single_table/results/** +examples/training/multi_table/data/** +examples/training/multi_table/results/** diff --git a/examples/training/single_table/README.md b/examples/training/single_table/README.md new file mode 100644 index 00000000..450078b8 --- /dev/null +++ b/examples/training/single_table/README.md @@ -0,0 +1,38 @@ +# Single-Table Training Example + +This example will go over traning a single-table diffusion model from the ground up using the +code in this toolkit. + +## Downloading data + +First, we need the data. Download it from this +[Google Drive link](https://drive.google.com/file/d/1J5qDuMHHg4dm9c3ISmb41tcTHSu1SVUC/view?usp=drive_link), +extract the files and place them `/data` folder in within this folder (`examples/training/single_table`). + +> [!NOTE] +> If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute +> of the [`config.yaml`](config.yaml) file. + +Here is a description of the files: +- `trans.csv`: The training data. It consists of information about bank transactions and it +contains 20,000 data points. 
+- `trans_domain.json`: Metadata about the columns in `trans.csv`, such as data types and sizes. +- `dataset_meta.json`: Metadata about the relationship between the tables. Since this is a +single-table example, it will only contain information about the `trans` table. + +## Kicking off traning + +The [`config.yaml`] file contains the parameters for the training. Please take a look a them +beforee kicking off the training and edit them as necessary. + +To kick off training, simply run the command below from the project's root folder: + +```bash +python -m examples.training.single_table.run_training +``` + +It will save the files inside a `/results` folder within this folder (`examples/training/single_table`). + +> [!NOTE] +> If you wish to change the save folder, you can do so by editing the `results_dir` attribute +> of the [`config.yaml`](config.yaml) file. diff --git a/examples/training/single_table/config.yaml b/examples/training/single_table/config.yaml new file mode 100644 index 00000000..a0710316 --- /dev/null +++ b/examples/training/single_table/config.yaml @@ -0,0 +1,17 @@ +# Training example configuration +# Base data directory (can be overridden from command line) +base_data_dir: examples/training/data +results_dir: examples/training/results + +diffusion_config: + d_layers: [512, 1024, 1024, 1024, 1024, 512] + dropout: 0.0 + num_timesteps: 2000 + model_type: mlp + iterations: 200000 + batch_size: 4096 + lr: 0.0006 + gaussian_loss_type: mse + weight_decay: 1e-05 + scheduler: cosine + data_split_ratios: [0.99, 0.005, 0.005] diff --git a/examples/training/single_table/run_training.py b/examples/training/single_table/run_training.py new file mode 100644 index 00000000..f89cfeb1 --- /dev/null +++ b/examples/training/single_table/run_training.py @@ -0,0 +1,42 @@ +from logging import INFO +from pathlib import Path + +import hydra +from omegaconf import DictConfig + +from midst_toolkit.common.config import DiffusionConfig +from midst_toolkit.common.logger import 
log +from midst_toolkit.common.variables import DEVICE +from midst_toolkit.models.clavaddpm.data_loaders import load_tables +from midst_toolkit.models.clavaddpm.train import clava_training + + +@hydra.main(config_path=".", config_name="config", version_base=None) +def main(config: DictConfig) -> None: + """ + Run the training pipeline. + + It will load the config and then data from the `config.base_data_dir` folder, + train the model and save the results in the `config.results_dir` folder. + + Args: + config: Training configuration as an OmegaConf DictConfig object. + """ + log(INFO, f"Loading data from {config.base_data_dir}...") + tables, relation_order, _ = load_tables(Path(config.base_data_dir)) + + log(INFO, "Training model...") + diffusion_config = DiffusionConfig(**config.diffusion_config) + + tables, _ = clava_training( + tables, + relation_order, + Path(config.results_dir), + diffusion_config, + device=DEVICE, + ) + log(INFO, "Model trained successfully.") + + +if __name__ == "__main__": + main() From 8146b8a54b91cf159ce47dd01cf65242a91832c4 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Tue, 11 Nov 2025 16:58:25 -0500 Subject: [PATCH 02/23] Done the single table --- examples/training/single_table/README.md | 28 +++++++++++++++++-- .../training/single_table/run_training.py | 14 +++++++++- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/examples/training/single_table/README.md b/examples/training/single_table/README.md index 450078b8..f521106f 100644 --- a/examples/training/single_table/README.md +++ b/examples/training/single_table/README.md @@ -3,23 +3,26 @@ This example will go over traning a single-table diffusion model from the ground up using the code in this toolkit. + ## Downloading data First, we need the data. 
Download it from this [Google Drive link](https://drive.google.com/file/d/1J5qDuMHHg4dm9c3ISmb41tcTHSu1SVUC/view?usp=drive_link), -extract the files and place them `/data` folder in within this folder (`examples/training/single_table`). +extract the files and place them in a `/data` folder in within this folder +(`examples/training/single_table`). > [!NOTE] > If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute > of the [`config.yaml`](config.yaml) file. -Here is a description of the files: +Here is a description of the files that have been extracted: - `trans.csv`: The training data. It consists of information about bank transactions and it contains 20,000 data points. - `trans_domain.json`: Metadata about the columns in `trans.csv`, such as data types and sizes. - `dataset_meta.json`: Metadata about the relationship between the tables. Since this is a single-table example, it will only contain information about the `trans` table. + ## Kicking off traning The [`config.yaml`] file contains the parameters for the training. Please take a look a them @@ -31,8 +34,27 @@ To kick off training, simply run the command below from the project's root folde python -m examples.training.single_table.run_training ``` -It will save the files inside a `/results` folder within this folder (`examples/training/single_table`). +It will save the result files inside a `/results` folder within this folder +(`examples/training/single_table`). > [!NOTE] > If you wish to change the save folder, you can do so by editing the `results_dir` attribute > of the [`config.yaml`](config.yaml) file. + +In the `/results/models/` folder, there will be a file called `None_trans_ckpt.pkl`, +which is a pickle file containing the training results. 
You can load it using Python's +`pickle` and it will yield an instance of +`midst_toolkit.models.clavaddpm.train.ModelArtifacts`, which contains the trained +diffusion model along with some additional metadata about the training process: + +```python +import pickle +from midst_toolkit.models.clavaddpm.train import ModelArtifacts + +results_file = Path("examples/single_table/results/models/None_trans_ckpt.pkl") + + with open(results_file, "rb") as f: + result = pickle.load(f) + +assert isinstance(result, ModelArtifacts) +``` diff --git a/examples/training/single_table/run_training.py b/examples/training/single_table/run_training.py index f89cfeb1..ae7c4ed0 100644 --- a/examples/training/single_table/run_training.py +++ b/examples/training/single_table/run_training.py @@ -1,3 +1,4 @@ +import pickle from logging import INFO from pathlib import Path @@ -8,7 +9,7 @@ from midst_toolkit.common.logger import log from midst_toolkit.common.variables import DEVICE from midst_toolkit.models.clavaddpm.data_loaders import load_tables -from midst_toolkit.models.clavaddpm.train import clava_training +from midst_toolkit.models.clavaddpm.train import ModelArtifacts, clava_training @hydra.main(config_path=".", config_name="config", version_base=None) @@ -37,6 +38,17 @@ def main(config: DictConfig) -> None: ) log(INFO, "Model trained successfully.") + results_file = Path(config.results_dir) / "models" / "None_trans_ckpt.pkl" + log(INFO, f"Checking the results from {results_file}...") + + with open(results_file, "rb") as f: + result = pickle.load(f) + + # Asserting the results are the correct type + assert isinstance(result, ModelArtifacts) + + log(INFO, f"Result size (in bytes): {results_file.stat().st_size}") + if __name__ == "__main__": main() From 00d879a85512d50213353f26740a09295095ed45 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Wed, 12 Nov 2025 14:55:11 -0500 Subject: [PATCH 03/23] Finished adding the multi table example --- examples/training/multi_table/README.md | 66 
+++++++++++++++++ examples/training/multi_table/config.yaml | 29 ++++++++ examples/training/multi_table/run_training.py | 74 +++++++++++++++++++ examples/training/single_table/config.yaml | 4 +- .../training/single_table/run_training.py | 6 +- .../models/clavaddpm/clustering.py | 4 + 6 files changed, 180 insertions(+), 3 deletions(-) create mode 100644 examples/training/multi_table/README.md create mode 100644 examples/training/multi_table/config.yaml create mode 100644 examples/training/multi_table/run_training.py diff --git a/examples/training/multi_table/README.md b/examples/training/multi_table/README.md new file mode 100644 index 00000000..380b5c86 --- /dev/null +++ b/examples/training/multi_table/README.md @@ -0,0 +1,66 @@ +# Multi-Table Training Example + +This example will go over traning a multi-table diffusion model from the ground up using the +code in this toolkit. + + +## Downloading data + +First, we need the data. Download it from this +[Google Drive link](https://drive.google.com/file/d/1Ao222l4AJjG54-HDEGCWkIfzRbl9_IKa/view?usp=drive_link), +extract the files and place them in a `/data` folder in within this folder +(`examples/training/multi_table`). + +> [!NOTE] +> If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute +> of the [`config.yaml`](config.yaml) file. + +It will contain data for 8 tables: `account`, `card`, `client`, `disp`, `district`, `loan`, `order`, +and `trans`. For each table there will be two files: +- `{table_name}.csv`: The table's data. +- `{table_name}_domain.json`: Metadata about the columns in the table's data, such as data types and sizes. + +Additionally, you will find one more file: +- `dataset_meta.json`: Metadata about the relationship between the tables. It will describe which tables +are associated with which other tables. + + +## Kicking off traning + +The [`config.yaml`] file contains the parameters for the training. 
Please take a look a them +beforee kicking off the training and edit them as necessary. + +To kick off training, simply run the command below from the project's root folder: + +```bash +python -m examples.training.multi_table.run_training +``` + +It will save the result files inside a `/results` folder within this folder +(`examples/training/multi_table`). + +> [!NOTE] +> If you wish to change the save folder, you can do so by editing the `results_dir` attribute +> of the [`config.yaml`](config.yaml) file. + +One of the results file is `/results/cluster_ckpt.pkl`, which will contain the results +of the clustering step. + +The other result files are in the `/results/models/` folder. They will be named after the +table relations defined in `dataset_meta.json`. For eexample: for the `("client", "account")` +relation, there will be a file called `client_account_ckpt.pkl`, which is a pickle file +containing the training results. You can load it using Python's `pickle` and it will yield +an instance of `midst_toolkit.models.clavaddpm.train.ModelArtifacts`, which contains the +trained diffusion model along with some additional metadata about the training process: + +```python +import pickle +from midst_toolkit.models.clavaddpm.train import ModelArtifacts + +results_file = Path("examples/multi_table/results/models/client_account_ckpt.pkl") + + with open(results_file, "rb") as f: + result = pickle.load(f) + +assert isinstance(result, ModelArtifacts) +``` diff --git a/examples/training/multi_table/config.yaml b/examples/training/multi_table/config.yaml new file mode 100644 index 00000000..6d5e701d --- /dev/null +++ b/examples/training/multi_table/config.yaml @@ -0,0 +1,29 @@ +# Multi-table training example configuration +# Base data directory (can be overridden from command line) +base_data_dir: examples/training/multi_table/data +results_dir: examples/training/multi_table/results + +diffusion_config: + d_layers: [512, 1024, 1024, 1024, 1024, 512] + dropout: 0.0 + 
num_timesteps: 2000 + model_type: mlp + iterations: 20000 + batch_size: 4096 + lr: 0.0006 + gaussian_loss_type: mse + weight_decay: 1e-05 + scheduler: cosine + data_split_ratios: [0.99, 0.005, 0.005] + +clustering_config: + parent_scale: 1.0 + num_clusters: 50 + clustering_method: kmeans_and_gmm + +classifier_config: + d_layers: [128, 256, 512, 1024, 512, 256, 128] + lr: 0.0001 + dim_t: 128 + batch_size: 4096 + iterations: 20000 diff --git a/examples/training/multi_table/run_training.py b/examples/training/multi_table/run_training.py new file mode 100644 index 00000000..b0d1e2f8 --- /dev/null +++ b/examples/training/multi_table/run_training.py @@ -0,0 +1,74 @@ +import pickle +from logging import INFO +from pathlib import Path + +import hydra +from omegaconf import DictConfig + +from midst_toolkit.common.config import ClassifierConfig, ClusteringConfig, DiffusionConfig +from midst_toolkit.common.logger import TOOLKIT_LOGGER, log +from midst_toolkit.common.variables import DEVICE +from midst_toolkit.models.clavaddpm.clustering import clava_clustering +from midst_toolkit.models.clavaddpm.data_loaders import Table, load_tables +from midst_toolkit.models.clavaddpm.train import ModelArtifacts, clava_training + + +# Preventing some excessive logging +TOOLKIT_LOGGER.setLevel(INFO) + + +@hydra.main(config_path=".", config_name="config", version_base=None) +def main(config: DictConfig) -> None: + """ + Run the training pipeline. + + It will load the config and then data from the `config.base_data_dir` folder, + train the model and save the results in the `config.results_dir` folder. + + Args: + config: Training configuration as an OmegaConf DictConfig object. 
+ """ + log(INFO, f"Loading data from {config.base_data_dir}...") + tables, relation_order, _ = load_tables(Path(config.base_data_dir)) + + log(INFO, "Clustering data...") + clustering_config = ClusteringConfig(**config.clustering_config) + tables, _ = clava_clustering(tables, relation_order, Path(config.results_dir), clustering_config) + + log(INFO, "Training model...") + diffusion_config = DiffusionConfig(**config.diffusion_config) + classifier_config = ClassifierConfig(**config.classifier_config) + + tables, _ = clava_training( + tables, + relation_order, + Path(config.results_dir), + diffusion_config, + classifier_config, + device=DEVICE, + ) + log(INFO, "Model trained successfully.") + + log(INFO, "Checking the clustering results...") + clustering_results_file = Path(config.results_dir) / "cluster_ckpt.pkl" + with open(clustering_results_file, "rb") as f: + clustering_result = pickle.load(f) + + assert all(isinstance(table, Table) for table in clustering_result["tables"].values()) + assert isinstance(clustering_result["all_group_lengths_prob_dicts"], dict) + + for relation in relation_order: + results_file = Path(config.results_dir) / "models" / f"{relation[0]}_{relation[1]}_ckpt.pkl" + log(INFO, f"Checking the results from {results_file}...") + + with open(results_file, "rb") as f: + result = pickle.load(f) + + # Asserting the results are the correct type + assert isinstance(result, ModelArtifacts) + + log(INFO, f"Result size (in bytes): {results_file.stat().st_size}") + + +if __name__ == "__main__": + main() diff --git a/examples/training/single_table/config.yaml b/examples/training/single_table/config.yaml index a0710316..7fc54556 100644 --- a/examples/training/single_table/config.yaml +++ b/examples/training/single_table/config.yaml @@ -1,7 +1,7 @@ # Training example configuration # Base data directory (can be overridden from command line) -base_data_dir: examples/training/data -results_dir: examples/training/results +base_data_dir: 
examples/training/single_table/data +results_dir: examples/training/single_table/results diffusion_config: d_layers: [512, 1024, 1024, 1024, 1024, 512] diff --git a/examples/training/single_table/run_training.py b/examples/training/single_table/run_training.py index ae7c4ed0..e5dd7202 100644 --- a/examples/training/single_table/run_training.py +++ b/examples/training/single_table/run_training.py @@ -6,12 +6,16 @@ from omegaconf import DictConfig from midst_toolkit.common.config import DiffusionConfig -from midst_toolkit.common.logger import log +from midst_toolkit.common.logger import TOOLKIT_LOGGER, log from midst_toolkit.common.variables import DEVICE from midst_toolkit.models.clavaddpm.data_loaders import load_tables from midst_toolkit.models.clavaddpm.train import ModelArtifacts, clava_training +# Preventing some excessive logging +TOOLKIT_LOGGER.setLevel(INFO) + + @hydra.main(config_path=".", config_name="config", version_base=None) def main(config: DictConfig) -> None: """ diff --git a/src/midst_toolkit/models/clavaddpm/clustering.py b/src/midst_toolkit/models/clavaddpm/clustering.py index 646b6de3..5c383122 100644 --- a/src/midst_toolkit/models/clavaddpm/clustering.py +++ b/src/midst_toolkit/models/clavaddpm/clustering.py @@ -753,6 +753,10 @@ def _min_max_normalize_sklearn(matrix: np.ndarray) -> np.ndarray: Returns: Numpy array of the normalized data. 
""" + if matrix.shape[1] == 0: + # If theree are no features to normalize, then no-op + return matrix + scaler = MinMaxScaler(feature_range=(-1, 1)) return scaler.fit_transform(matrix) From 414892241c75790568160b4bed4c158854f5adb7 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Wed, 12 Nov 2025 14:59:30 -0500 Subject: [PATCH 04/23] Adding test for the bug fix --- tests/unit/models/clavaddpm/test_clustering.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/unit/models/clavaddpm/test_clustering.py b/tests/unit/models/clavaddpm/test_clustering.py index 8d684796..c4a8a77c 100644 --- a/tests/unit/models/clavaddpm/test_clustering.py +++ b/tests/unit/models/clavaddpm/test_clustering.py @@ -51,6 +51,16 @@ def test_min_max_normalize_sklearn() -> None: unset_all_random_seeds() +def test_min_max_normalize_sklearn_empty_matrix() -> None: + set_all_random_seeds(42) + + data_to_normalize = np.random.randint(0, 3, (5, 0)) + normalized_data = _min_max_normalize_sklearn(data_to_normalize) + assert data_to_normalize is normalized_data + + unset_all_random_seeds() + + def test_get_normalized_numerical_columns() -> None: set_all_random_seeds(42) child_data = np.random.randint(0, 3, (3, 3)) From f075e46dfffaa8f69beb4e10e5b97493d04fdc97 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Wed, 12 Nov 2025 15:03:55 -0500 Subject: [PATCH 05/23] Better docstrings --- examples/training/multi_table/run_training.py | 2 +- examples/training/single_table/run_training.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/training/multi_table/run_training.py b/examples/training/multi_table/run_training.py index b0d1e2f8..6d5548a5 100644 --- a/examples/training/multi_table/run_training.py +++ b/examples/training/multi_table/run_training.py @@ -20,7 +20,7 @@ @hydra.main(config_path=".", config_name="config", version_base=None) def main(config: DictConfig) -> None: """ - Run the training pipeline. 
+ Run the training pipeline for a multi-table diffusion model. It will load the config and then data from the `config.base_data_dir` folder, train the model and save the results in the `config.results_dir` folder. diff --git a/examples/training/single_table/run_training.py b/examples/training/single_table/run_training.py index e5dd7202..74897db7 100644 --- a/examples/training/single_table/run_training.py +++ b/examples/training/single_table/run_training.py @@ -19,7 +19,7 @@ @hydra.main(config_path=".", config_name="config", version_base=None) def main(config: DictConfig) -> None: """ - Run the training pipeline. + Run the training pipeline for a single-table diffusion model. It will load the config and then data from the `config.base_data_dir` folder, train the model and save the results in the `config.results_dir` folder. From 7d890f90abe3920c60b9d85b4263872b79211c1c Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Wed, 12 Nov 2025 15:04:20 -0500 Subject: [PATCH 06/23] Fixing typo --- src/midst_toolkit/models/clavaddpm/clustering.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/midst_toolkit/models/clavaddpm/clustering.py b/src/midst_toolkit/models/clavaddpm/clustering.py index 5c383122..1299d85b 100644 --- a/src/midst_toolkit/models/clavaddpm/clustering.py +++ b/src/midst_toolkit/models/clavaddpm/clustering.py @@ -754,7 +754,7 @@ def _min_max_normalize_sklearn(matrix: np.ndarray) -> np.ndarray: Numpy array of the normalized data. 
""" if matrix.shape[1] == 0: - # If theree are no features to normalize, then no-op + # If there are no features to normalize, then no-op return matrix scaler = MinMaxScaler(feature_range=(-1, 1)) From 402e35d68af18cf489eabf0649df68c3a72e2c00 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Wed, 12 Nov 2025 15:09:03 -0500 Subject: [PATCH 07/23] Fixing the config yaml link --- examples/training/multi_table/README.md | 6 +++--- examples/training/single_table/README.md | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/training/multi_table/README.md b/examples/training/multi_table/README.md index 380b5c86..ddbec0ee 100644 --- a/examples/training/multi_table/README.md +++ b/examples/training/multi_table/README.md @@ -13,7 +13,7 @@ extract the files and place them in a `/data` folder in within this folder > [!NOTE] > If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute -> of the [`config.yaml`](config.yaml) file. +> of the (`config.yaml`)[config.yaml] file. It will contain data for 8 tables: `account`, `card`, `client`, `disp`, `district`, `loan`, `order`, and `trans`. For each table there will be two files: @@ -27,7 +27,7 @@ are associated with which other tables. ## Kicking off traning -The [`config.yaml`] file contains the parameters for the training. Please take a look a them +The (`config.yaml`)[config.yaml] file contains the parameters for the training. Please take a look a them beforee kicking off the training and edit them as necessary. To kick off training, simply run the command below from the project's root folder: @@ -41,7 +41,7 @@ It will save the result files inside a `/results` folder within this folder > [!NOTE] > If you wish to change the save folder, you can do so by editing the `results_dir` attribute -> of the [`config.yaml`](config.yaml) file. +> of the (`config.yaml`)[config.yaml] file. 
One of the results file is `/results/cluster_ckpt.pkl`, which will contain the results of the clustering step. diff --git a/examples/training/single_table/README.md b/examples/training/single_table/README.md index f521106f..4e9b6f9e 100644 --- a/examples/training/single_table/README.md +++ b/examples/training/single_table/README.md @@ -13,7 +13,7 @@ extract the files and place them in a `/data` folder in within this folder > [!NOTE] > If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute -> of the [`config.yaml`](config.yaml) file. +> of the (`config.yaml`)[config.yaml] file. Here is a description of the files that have been extracted: - `trans.csv`: The training data. It consists of information about bank transactions and it @@ -25,7 +25,7 @@ single-table example, it will only contain information about the `trans` table. ## Kicking off traning -The [`config.yaml`] file contains the parameters for the training. Please take a look a them +The (`config.yaml`)[config.yaml] file contains the parameters for the training. Please take a look a them beforee kicking off the training and edit them as necessary. To kick off training, simply run the command below from the project's root folder: @@ -39,7 +39,7 @@ It will save the result files inside a `/results` folder within this folder > [!NOTE] > If you wish to change the save folder, you can do so by editing the `results_dir` attribute -> of the [`config.yaml`](config.yaml) file. +> of the (`config.yaml`)[config.yaml] file. In the `/results/models/` folder, there will be a file called `None_trans_ckpt.pkl`, which is a pickle file containing the training results. 
You can load it using Python's From 5fd0cc88f1d455ea2f7a09784a2047f79f27adca Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Wed, 12 Nov 2025 15:12:06 -0500 Subject: [PATCH 08/23] CR by coderabbit --- examples/training/multi_table/README.md | 12 ++++++------ examples/training/single_table/README.md | 10 +++++----- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/examples/training/multi_table/README.md b/examples/training/multi_table/README.md index ddbec0ee..a6c47238 100644 --- a/examples/training/multi_table/README.md +++ b/examples/training/multi_table/README.md @@ -1,6 +1,6 @@ # Multi-Table Training Example -This example will go over traning a multi-table diffusion model from the ground up using the +This example will go over training a multi-table diffusion model from the ground up using the code in this toolkit. @@ -25,10 +25,10 @@ Additionally, you will find one more file: are associated with which other tables. -## Kicking off traning +## Kicking off training -The (`config.yaml`)[config.yaml] file contains the parameters for the training. Please take a look a them -beforee kicking off the training and edit them as necessary. +The (`config.yaml`)[config.yaml] file contains the parameters for the training. Please take a +look at them before kicking off the training and edit them as necessary. To kick off training, simply run the command below from the project's root folder: @@ -47,7 +47,7 @@ One of the results file is `/results/cluster_ckpt.pkl`, which will contain the r of the clustering step. The other result files are in the `/results/models/` folder. They will be named after the -table relations defined in `dataset_meta.json`. For eexample: for the `("client", "account")` +table relations defined in `dataset_meta.json`. For example: for the `("client", "account")` relation, there will be a file called `client_account_ckpt.pkl`, which is a pickle file containing the training results. 
You can load it using Python's `pickle` and it will yield an instance of `midst_toolkit.models.clavaddpm.train.ModelArtifacts`, which contains the @@ -57,7 +57,7 @@ trained diffusion model along with some additional metadata about the training p import pickle from midst_toolkit.models.clavaddpm.train import ModelArtifacts -results_file = Path("examples/multi_table/results/models/client_account_ckpt.pkl") +results_file = Path("examples/training/multi_table/results/models/client_account_ckpt.pkl") with open(results_file, "rb") as f: result = pickle.load(f) diff --git a/examples/training/single_table/README.md b/examples/training/single_table/README.md index 4e9b6f9e..ea78d325 100644 --- a/examples/training/single_table/README.md +++ b/examples/training/single_table/README.md @@ -1,6 +1,6 @@ # Single-Table Training Example -This example will go over traning a single-table diffusion model from the ground up using the +This example will go over training a single-table diffusion model from the ground up using the code in this toolkit. @@ -23,10 +23,10 @@ contains 20,000 data points. single-table example, it will only contain information about the `trans` table. -## Kicking off traning +## Kicking off training -The (`config.yaml`)[config.yaml] file contains the parameters for the training. Please take a look a them -beforee kicking off the training and edit them as necessary. +The (`config.yaml`)[config.yaml] file contains the parameters for the training. Please take a +look at them before kicking off the training and edit them as necessary. 
To kick off training, simply run the command below from the project's root folder: @@ -51,7 +51,7 @@ diffusion model along with some additional metadata about the training process: import pickle from midst_toolkit.models.clavaddpm.train import ModelArtifacts -results_file = Path("examples/single_table/results/models/None_trans_ckpt.pkl") +results_file = Path("examples/training/single_table/results/models/None_trans_ckpt.pkl") with open(results_file, "rb") as f: result = pickle.load(f) From e9d385b4075b36fcd9784553123fd63b7b6140ee Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Wed, 12 Nov 2025 15:09:03 -0500 Subject: [PATCH 09/23] Fixing the config yaml link --- examples/training/multi_table/README.md | 6 +++--- examples/training/single_table/README.md | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/training/multi_table/README.md b/examples/training/multi_table/README.md index 380b5c86..ddbec0ee 100644 --- a/examples/training/multi_table/README.md +++ b/examples/training/multi_table/README.md @@ -13,7 +13,7 @@ extract the files and place them in a `/data` folder in within this folder > [!NOTE] > If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute -> of the [`config.yaml`](config.yaml) file. +> of the (`config.yaml`)[config.yaml] file. It will contain data for 8 tables: `account`, `card`, `client`, `disp`, `district`, `loan`, `order`, and `trans`. For each table there will be two files: @@ -27,7 +27,7 @@ are associated with which other tables. ## Kicking off traning -The [`config.yaml`] file contains the parameters for the training. Please take a look a them +The (`config.yaml`)[config.yaml] file contains the parameters for the training. Please take a look a them beforee kicking off the training and edit them as necessary. 
To kick off training, simply run the command below from the project's root folder: @@ -41,7 +41,7 @@ It will save the result files inside a `/results` folder within this folder > [!NOTE] > If you wish to change the save folder, you can do so by editing the `results_dir` attribute -> of the [`config.yaml`](config.yaml) file. +> of the (`config.yaml`)[config.yaml] file. One of the results file is `/results/cluster_ckpt.pkl`, which will contain the results of the clustering step. diff --git a/examples/training/single_table/README.md b/examples/training/single_table/README.md index f521106f..4e9b6f9e 100644 --- a/examples/training/single_table/README.md +++ b/examples/training/single_table/README.md @@ -13,7 +13,7 @@ extract the files and place them in a `/data` folder in within this folder > [!NOTE] > If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute -> of the [`config.yaml`](config.yaml) file. +> of the (`config.yaml`)[config.yaml] file. Here is a description of the files that have been extracted: - `trans.csv`: The training data. It consists of information about bank transactions and it @@ -25,7 +25,7 @@ single-table example, it will only contain information about the `trans` table. ## Kicking off traning -The [`config.yaml`] file contains the parameters for the training. Please take a look a them +The (`config.yaml`)[config.yaml] file contains the parameters for the training. Please take a look a them beforee kicking off the training and edit them as necessary. To kick off training, simply run the command below from the project's root folder: @@ -39,7 +39,7 @@ It will save the result files inside a `/results` folder within this folder > [!NOTE] > If you wish to change the save folder, you can do so by editing the `results_dir` attribute -> of the [`config.yaml`](config.yaml) file. +> of the (`config.yaml`)[config.yaml] file. 
In the `/results/models/` folder, there will be a file called `None_trans_ckpt.pkl`, which is a pickle file containing the training results. You can load it using Python's From ac1a885d9f765fd160a96097bf3becf56e385318 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Wed, 12 Nov 2025 15:12:06 -0500 Subject: [PATCH 10/23] CR by coderabbit --- examples/training/multi_table/README.md | 12 ++++++------ examples/training/single_table/README.md | 10 +++++----- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/examples/training/multi_table/README.md b/examples/training/multi_table/README.md index ddbec0ee..a6c47238 100644 --- a/examples/training/multi_table/README.md +++ b/examples/training/multi_table/README.md @@ -1,6 +1,6 @@ # Multi-Table Training Example -This example will go over traning a multi-table diffusion model from the ground up using the +This example will go over training a multi-table diffusion model from the ground up using the code in this toolkit. @@ -25,10 +25,10 @@ Additionally, you will find one more file: are associated with which other tables. -## Kicking off traning +## Kicking off training -The (`config.yaml`)[config.yaml] file contains the parameters for the training. Please take a look a them -beforee kicking off the training and edit them as necessary. +The (`config.yaml`)[config.yaml] file contains the parameters for the training. Please take a +look at them before kicking off the training and edit them as necessary. To kick off training, simply run the command below from the project's root folder: @@ -47,7 +47,7 @@ One of the results file is `/results/cluster_ckpt.pkl`, which will contain the r of the clustering step. The other result files are in the `/results/models/` folder. They will be named after the -table relations defined in `dataset_meta.json`. For eexample: for the `("client", "account")` +table relations defined in `dataset_meta.json`. 
For example: for the `("client", "account")` relation, there will be a file called `client_account_ckpt.pkl`, which is a pickle file containing the training results. You can load it using Python's `pickle` and it will yield an instance of `midst_toolkit.models.clavaddpm.train.ModelArtifacts`, which contains the @@ -57,7 +57,7 @@ trained diffusion model along with some additional metadata about the training p import pickle from midst_toolkit.models.clavaddpm.train import ModelArtifacts -results_file = Path("examples/multi_table/results/models/client_account_ckpt.pkl") +results_file = Path("examples/training/multi_table/results/models/client_account_ckpt.pkl") with open(results_file, "rb") as f: result = pickle.load(f) diff --git a/examples/training/single_table/README.md b/examples/training/single_table/README.md index 4e9b6f9e..ea78d325 100644 --- a/examples/training/single_table/README.md +++ b/examples/training/single_table/README.md @@ -1,6 +1,6 @@ # Single-Table Training Example -This example will go over traning a single-table diffusion model from the ground up using the +This example will go over training a single-table diffusion model from the ground up using the code in this toolkit. @@ -23,10 +23,10 @@ contains 20,000 data points. single-table example, it will only contain information about the `trans` table. -## Kicking off traning +## Kicking off training -The (`config.yaml`)[config.yaml] file contains the parameters for the training. Please take a look a them -beforee kicking off the training and edit them as necessary. +The (`config.yaml`)[config.yaml] file contains the parameters for the training. Please take a +look at them before kicking off the training and edit them as necessary. 
To kick off training, simply run the command below from the project's root folder: @@ -51,7 +51,7 @@ diffusion model along with some additional metadata about the training process: import pickle from midst_toolkit.models.clavaddpm.train import ModelArtifacts -results_file = Path("examples/single_table/results/models/None_trans_ckpt.pkl") +results_file = Path("examples/training/single_table/results/models/None_trans_ckpt.pkl") with open(results_file, "rb") as f: result = pickle.load(f) From c904e9bf401333d21db120c541c3739d4dcc8aeb Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Wed, 12 Nov 2025 15:17:57 -0500 Subject: [PATCH 11/23] Actually fixing the config file links --- examples/training/multi_table/README.md | 6 +++--- examples/training/single_table/README.md | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/training/multi_table/README.md b/examples/training/multi_table/README.md index a6c47238..c2fd173d 100644 --- a/examples/training/multi_table/README.md +++ b/examples/training/multi_table/README.md @@ -13,7 +13,7 @@ extract the files and place them in a `/data` folder in within this folder > [!NOTE] > If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute -> of the (`config.yaml`)[config.yaml] file. +> of the [`config.yaml`](config.yaml) file. It will contain data for 8 tables: `account`, `card`, `client`, `disp`, `district`, `loan`, `order`, and `trans`. For each table there will be two files: @@ -27,7 +27,7 @@ are associated with which other tables. ## Kicking off training -The (`config.yaml`)[config.yaml] file contains the parameters for the training. Please take a +The [`config.yaml`](config.yaml) file contains the parameters for the training. Please take a look at them before kicking off the training and edit them as necessary. 
To kick off training, simply run the command below from the project's root folder: @@ -41,7 +41,7 @@ It will save the result files inside a `/results` folder within this folder > [!NOTE] > If you wish to change the save folder, you can do so by editing the `results_dir` attribute -> of the (`config.yaml`)[config.yaml] file. +> of the [`config.yaml`](config.yaml) file. One of the results file is `/results/cluster_ckpt.pkl`, which will contain the results of the clustering step. diff --git a/examples/training/single_table/README.md b/examples/training/single_table/README.md index ea78d325..50b162c7 100644 --- a/examples/training/single_table/README.md +++ b/examples/training/single_table/README.md @@ -13,7 +13,7 @@ extract the files and place them in a `/data` folder in within this folder > [!NOTE] > If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute -> of the (`config.yaml`)[config.yaml] file. +> of the [`config.yaml`](config.yaml) file. Here is a description of the files that have been extracted: - `trans.csv`: The training data. It consists of information about bank transactions and it @@ -25,7 +25,7 @@ single-table example, it will only contain information about the `trans` table. ## Kicking off training -The (`config.yaml`)[config.yaml] file contains the parameters for the training. Please take a +The [`config.yaml`](config.yaml) file contains the parameters for the training. Please take a look at them before kicking off the training and edit them as necessary. To kick off training, simply run the command below from the project's root folder: @@ -39,7 +39,7 @@ It will save the result files inside a `/results` folder within this folder > [!NOTE] > If you wish to change the save folder, you can do so by editing the `results_dir` attribute -> of the (`config.yaml`)[config.yaml] file. +> of the [`config.yaml`](config.yaml) file. 
In the `/results/models/` folder, there will be a file called `None_trans_ckpt.pkl`, which is a pickle file containing the training results. You can load it using Python's From 3d0b9c6c8afcf629dcedf4a5ccb5a7260a26e38c Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 13 Nov 2025 11:44:24 -0500 Subject: [PATCH 12/23] Synthesizing single table first files --- .../synthesizing/single_table/config.yaml | 34 +++++++++++ .../single_table/run_synthesizing.py | 58 +++++++++++++++++++ examples/training/multi_table/config.yaml | 8 +-- 3 files changed, 96 insertions(+), 4 deletions(-) create mode 100644 examples/synthesizing/single_table/config.yaml create mode 100644 examples/synthesizing/single_table/run_synthesizing.py diff --git a/examples/synthesizing/single_table/config.yaml b/examples/synthesizing/single_table/config.yaml new file mode 100644 index 00000000..492c43c6 --- /dev/null +++ b/examples/synthesizing/single_table/config.yaml @@ -0,0 +1,34 @@ +# Training example configuration +# Base data directory (can be overridden from command line) +base_data_dir: examples/training/single_table/data +results_dir: examples/training/single_table/results + +diffusion_config: + d_layers: [512, 1024, 1024, 1024, 1024, 512] + dropout: 0.0 + num_timesteps: 2000 + model_type: mlp + iterations: 200000 + batch_size: 4096 + lr: 0.0006 + gaussian_loss_type: mse + weight_decay: 1e-05 + scheduler: cosine + data_split_ratios: [0.99, 0.005, 0.005] + +general_config: + data_dir: examples/training/single_table/data + test_data_dir: examples/training/single_table/data + exp_name: single_table_synthesizing + workspace_dir: examples/training/single_table/results + sample_prefix: "" + +sampling_config: + batch_size: 20000 + classifier_scale: 1.0 + +matching_config: + num_matching_clusters: 1 + matching_batch_size: 1000 + unique_matching: True + no_matching: False diff --git a/examples/synthesizing/single_table/run_synthesizing.py b/examples/synthesizing/single_table/run_synthesizing.py new 
file mode 100644 index 00000000..6a7f1a8e --- /dev/null +++ b/examples/synthesizing/single_table/run_synthesizing.py @@ -0,0 +1,58 @@ +import pickle +from logging import INFO +from pathlib import Path + +import hydra +from omegaconf import DictConfig + +from midst_toolkit.common.config import DiffusionConfig +from midst_toolkit.common.logger import TOOLKIT_LOGGER, log +from midst_toolkit.common.variables import DEVICE +from midst_toolkit.models.clavaddpm.data_loaders import load_tables +from midst_toolkit.models.clavaddpm.train import ModelArtifacts, clava_training + + +# Preventing some excessive logging +TOOLKIT_LOGGER.setLevel(INFO) + + +@hydra.main(config_path=".", config_name="config", version_base=None) +def main(config: DictConfig) -> None: + """ + Run the synthesizing pipeline for a single-table diffusion model. + + It will load the config and then data from the `config.base_data_dir` folder, + train the model and save the results in the `config.results_dir` folder. + + Args: + config: Training configuration as an OmegaConf DictConfig object. 
+ """ + log(INFO, f"Loading data from {config.base_data_dir}...") + tables, relation_order, _ = load_tables(Path(config.base_data_dir)) + + log(INFO, "Training model...") + diffusion_config = DiffusionConfig(**config.diffusion_config) + + tables, _ = clava_training( + tables, + relation_order, + Path(config.results_dir), + diffusion_config, + device=DEVICE, + ) + log(INFO, "Model trained successfully.") + + results_file = Path(config.results_dir) / "models" / "None_trans_ckpt.pkl" + log(INFO, f"Checking the results from {results_file}...") + + with open(results_file, "rb") as f: + result = pickle.load(f) + + # Asserting the results are the correct type + assert isinstance(result, ModelArtifacts) + + log(INFO, f"Result size (in bytes): {results_file.stat().st_size}") + + +if __name__ == "__main__": + main() diff --git a/examples/training/multi_table/config.yaml b/examples/training/multi_table/config.yaml index 6d5e701d..c63e7aaa 100644 --- a/examples/training/multi_table/config.yaml +++ b/examples/training/multi_table/config.yaml @@ -6,10 +6,10 @@ results_dir: examples/training/multi_table/results diffusion_config: d_layers: [512, 1024, 1024, 1024, 1024, 512] dropout: 0.0 - num_timesteps: 2000 + num_timesteps: 100 model_type: mlp - iterations: 20000 - batch_size: 4096 + iterations: 1000 + batch_size: 24 lr: 0.0006 gaussian_loss_type: mse weight_decay: 1e-05 @@ -26,4 +26,4 @@ classifier_config: lr: 0.0001 dim_t: 128 batch_size: 4096 - iterations: 20000 + iterations: 1000 From baa9824b5135ad30585dcaf2e4dc3ae3663ab6f4 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 13 Nov 2025 13:50:13 -0500 Subject: [PATCH 13/23] Finishing the sythesizer single table example --- .gitignore | 4 ++ examples/synthesizing/single_table/README.md | 58 +++++++++++++++++++ .../synthesizing/single_table/config.yaml | 10 ++-- .../single_table/run_synthesizing.py | 54 ++++++++++------- .../attacks/ensemble/shadow_model_utils.py | 4 +- .../models/clavaddpm/data_loaders.py | 7 ++- 
.../models/clavaddpm/synthesizer.py | 12 +++- .../models/clavaddpm/test_synthesizer.py | 2 +- 8 files changed, 118 insertions(+), 33 deletions(-) create mode 100644 examples/synthesizing/single_table/README.md diff --git a/.gitignore b/.gitignore index 611e0bc1..9c5092f2 100644 --- a/.gitignore +++ b/.gitignore @@ -47,3 +47,7 @@ examples/training/single_table/data/** examples/training/single_table/results/** examples/training/multi_table/data/** examples/training/multi_table/results/** +examples/synthesizing/single_table/data/** +examples/synthesizing/single_table/results/** +examples/synthesizing/multi_table/data/** +examples/synthesizing/multi_table/results/** diff --git a/examples/synthesizing/single_table/README.md b/examples/synthesizing/single_table/README.md new file mode 100644 index 00000000..15c1bca8 --- /dev/null +++ b/examples/synthesizing/single_table/README.md @@ -0,0 +1,58 @@ +# Single-Table Synthesizing Example + +This example will go over synthesizing data for a single-table dataset from the ground +up using the code in this toolkit. + + +## Downloading data + +First, we need the data. Download it from this +[Google Drive link](https://drive.google.com/file/d/1J5qDuMHHg4dm9c3ISmb41tcTHSu1SVUC/view?usp=drive_link), +extract the files and place them in a `/data` folder within this folder +(`examples/synthesizing/single_table`). + +> [!NOTE] +> If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute +> of the [`config.yaml`](config.yaml) file. + +Here is a description of the files that have been extracted: +- `trans.csv`: The training data. It consists of information about bank transactions and it +contains 20,000 data points. +- `trans_domain.json`: Metadata about the columns in `trans.csv`, such as data types and sizes. +- `dataset_meta.json`: Metadata about the relationship between the tables. Since this is a +single-table example, it will only contain information about the `trans` table.
+ + +## Kicking off synthesizing + +If there is a `/results` folder within this folder (`examples/synthesizing/single_table`) +from a previous training run, we will use that data to kick off synthesizing. +For example, you can copy the results from another run (e.g. `examples.training.single_table.run_training`) +and paste them here and it will be picked up by this example. + +The [`config.yaml`](config.yaml) file contains the parameters for the synthesizing and also +for training, in case there is a need to run that. Please take a look at them before kicking +off the synthesizing process and edit them as necessary. + +To kick off synthesizing, simply run the command below from the project's root folder: + +```bash +python -m examples.synthesizing.single_table.run_synthesizing +``` + +## Results + +It will save the result files inside a `/results` folder within this folder +(`examples/synthesizing/single_table`). + +> [!NOTE] +> If you wish to change the save folder, you can do so by editing the `results_dir` attribute +> of the [`config.yaml`](config.yaml) file. + +In the `/results/before_matching/` folder, there will be a file called `synthetic_tables.pkl`, +which is a pickle file containing the synthetic data before the matching process, in case +it's needed. + +The `/results/single_table_synthesizing` folder will contain the final synthesized +data, organized per table. In this single-table example, there is only going to be one +synthesized table under `/results/single_table_synthesizing/trans/_final/trans_synthetic.csv`.
diff --git a/examples/synthesizing/single_table/config.yaml b/examples/synthesizing/single_table/config.yaml index 492c43c6..662a4fe1 100644 --- a/examples/synthesizing/single_table/config.yaml +++ b/examples/synthesizing/single_table/config.yaml @@ -1,7 +1,7 @@ # Training example configuration # Base data directory (can be overridden from command line) -base_data_dir: examples/training/single_table/data -results_dir: examples/training/single_table/results +base_data_dir: examples/synthesizing/single_table/data +results_dir: examples/synthesizing/single_table/results diffusion_config: d_layers: [512, 1024, 1024, 1024, 1024, 512] @@ -17,10 +17,10 @@ diffusion_config: data_split_ratios: [0.99, 0.005, 0.005] general_config: - data_dir: examples/training/single_table/data - test_data_dir: examples/training/single_table/data + data_dir: examples/synthesizing/single_table/data + test_data_dir: examples/synthesizing/single_table/data exp_name: single_table_synthesizing - workspace_dir: examples/training/single_table/results + workspace_dir: examples/synthesizing/single_table/results sample_prefix: "" sampling_config: diff --git a/examples/synthesizing/single_table/run_synthesizing.py b/examples/synthesizing/single_table/run_synthesizing.py index 6a7f1a8e..4fb5ce4b 100644 --- a/examples/synthesizing/single_table/run_synthesizing.py +++ b/examples/synthesizing/single_table/run_synthesizing.py @@ -5,11 +5,11 @@ import hydra from omegaconf import DictConfig -from midst_toolkit.common.config import DiffusionConfig +from examples.training.single_table import run_training +from midst_toolkit.common.config import GeneralConfig, MatchingConfig, SamplingConfig from midst_toolkit.common.logger import TOOLKIT_LOGGER, log -from midst_toolkit.common.variables import DEVICE from midst_toolkit.models.clavaddpm.data_loaders import load_tables -from midst_toolkit.models.clavaddpm.train import ModelArtifacts, clava_training +from midst_toolkit.models.clavaddpm.synthesizer import 
clava_synthesizing # Preventing some excessive logging @@ -22,36 +22,48 @@ def main(config: DictConfig) -> None: Run the synthesizing pipeline for a single-table diffusion model. It will load the config and then data from the `config.base_data_dir` folder, - train the model and save the results in the `config.results_dir` folder. + train the model, synthesize the data and save the results in the + `config.results_dir` folder. + + It will first look for a pre-trained model in the `config.results_dir` folder. + If it doesn't find one, it will train a new model from scratch. Args: config: Training configuration as an OmegaConf DictConfig object. """ - log(INFO, f"Loading data from {config.base_data_dir}...") + log(INFO, f"Checking for a pre-trained model in {config.results_dir}...") + tables, relation_order, _ = load_tables(Path(config.base_data_dir)) - log(INFO, "Training model...") - diffusion_config = DiffusionConfig(**config.diffusion_config) + model_file_paths = {} + for relation in relation_order: + model_file_path = Path(config.results_dir) / "models" / f"{relation[0]}_{relation[1]}_ckpt.pkl" + model_file_paths[relation] = model_file_path + + if all(model_file.exists() for model_file in model_file_paths.values()): + log(INFO, f"Found a pre-trained models in {config.results_dir}. 
Skipping training.") + else: + log(INFO, "No pre-trained models found, training a new model from scratch...") + run_training.main(config) + + log(INFO, "Synthesizing data...") - tables, _ = clava_training( + models = {} + for relation in relation_order: + with open(model_file_paths[relation], "rb") as f: + models[relation] = pickle.load(f) + + clava_synthesizing( tables, relation_order, Path(config.results_dir), - diffusion_config, - device=DEVICE, + models, + GeneralConfig(**config.general_config), + SamplingConfig(**config.sampling_config), + MatchingConfig(**config.matching_config), ) - log(INFO, "Model trained successfully.") - - results_file = Path(config.results_dir) / "models" / "None_trans_ckpt.pkl" - log(INFO, f"Checking the results from {results_file}...") - - with open(results_file, "rb") as f: - result = pickle.load(f) - - # Asserting the results are the correct type - assert isinstance(result, ModelArtifacts) - log(INFO, f"Result size (in bytes): {results_file.stat().st_size}") + log(INFO, "Data synthesized successfully.") if __name__ == "__main__": diff --git a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py index e8604e22..63b57729 100644 --- a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py +++ b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py @@ -142,11 +142,11 @@ def train_tabddpm_and_synthesize( tables, relation_order, save_dir, - all_group_lengths_prob_dicts, models, configs.general, configs.sampling, configs.matching, + all_group_lengths_prob_dicts, sample_scale=sample_scale, ) @@ -232,11 +232,11 @@ def fine_tune_tabddpm_and_synthesize( new_tables, relation_order, save_dir, - all_group_lengths_prob_dicts, new_models, configs.general, configs.sampling, configs.matching, + all_group_lengths_prob_dicts, sample_scale=sample_scale, ) diff --git a/src/midst_toolkit/models/clavaddpm/data_loaders.py b/src/midst_toolkit/models/clavaddpm/data_loaders.py index 
008e0d6c..c9e09cce 100644 --- a/src/midst_toolkit/models/clavaddpm/data_loaders.py +++ b/src/midst_toolkit/models/clavaddpm/data_loaders.py @@ -96,7 +96,7 @@ def load_tables( with open(data_dir / "dataset_meta.json", "r") as f: dataset_meta = json.load(f) - relation_order = dataset_meta["relation_order"] + relation_order = [tuple(relation) for relation in dataset_meta["relation_order"]] tables = {} @@ -125,6 +125,11 @@ def load_tables( info=info, ) + # Adding the no parent placeholder column in the tables with no parent + for parent, child in relation_order: + if parent is None: + tables[child].data[NO_PARENT_COLUMN_NAME] = list(range(len(tables[child].data))) + return tables, relation_order, dataset_meta diff --git a/src/midst_toolkit/models/clavaddpm/synthesizer.py b/src/midst_toolkit/models/clavaddpm/synthesizer.py index 2204ec52..cbbdddc1 100644 --- a/src/midst_toolkit/models/clavaddpm/synthesizer.py +++ b/src/midst_toolkit/models/clavaddpm/synthesizer.py @@ -711,11 +711,11 @@ def clava_synthesizing( tables: Tables, relation_order: RelationOrder, save_dir: Path, - all_group_lengths_prob_dicts: GroupLengthsProbDicts, models: dict[Relation, ModelArtifacts], general_config: GeneralConfig, sampling_config: SamplingConfig, matching_config: MatchingConfig, + all_group_lengths_prob_dicts: GroupLengthsProbDicts | None = None, sample_scale: float = 1.0, ) -> tuple[dict[str, pd.DataFrame], float, float]: """ @@ -726,12 +726,13 @@ def clava_synthesizing( tables: Tables containing dataframes and clustering information. relation_order: List of parent-child table relationships. save_dir: Directory to save intermediate and final results. - all_group_lengths_prob_dicts: Dictionary containing group length probabilities for each - parent-child relationship. models: Trained models for each parent-child relationship. general_config: General configuration settings. sampling_config: Configuration settings for sampling. matching_config: Configuration settings for matching. 
+ all_group_lengths_prob_dicts: Dictionary containing group length probabilities for each + parent-child relationship. Optional for single-table synthesizing, required for + multi-table synthesizing. Defaults to None. sample_scale: Scale factor for the number of samples to generate based on the train data size. Defaults to 1.0. @@ -762,7 +763,12 @@ def clava_synthesizing( sample_scale, sampling_config.batch_size, ) + else: + assert all_group_lengths_prob_dicts is not None, ( + "all_group_lengths_prob_dicts is required for multi-table synthesizing." + ) + # Finding previously synthesized data and training results for the parent parent_synthetic_data = None parent_training_results = None diff --git a/tests/integration/models/clavaddpm/test_synthesizer.py b/tests/integration/models/clavaddpm/test_synthesizer.py index 267160df..ae88477e 100644 --- a/tests/integration/models/clavaddpm/test_synthesizer.py +++ b/tests/integration/models/clavaddpm/test_synthesizer.py @@ -92,11 +92,11 @@ def test_clava_synthesize_multi_table(tmp_path: Path): tables, relation_order, tmp_path, - all_group_lengths_prob_dicts, models[1], synthesizing_config, SAMPLING_CONFIG, MATCHING_CONFIG, + all_group_lengths_prob_dicts, ) # Assert From d480dd9bc4dd88b891eda9ff49bd205f50cddda3 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 13 Nov 2025 13:52:25 -0500 Subject: [PATCH 14/23] Small tweak in the readmes --- examples/training/multi_table/README.md | 5 ++++- examples/training/single_table/README.md | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/examples/training/multi_table/README.md b/examples/training/multi_table/README.md index c2fd173d..31791112 100644 --- a/examples/training/multi_table/README.md +++ b/examples/training/multi_table/README.md @@ -36,7 +36,10 @@ To kick off training, simply run the command below from the project's root folde python -m examples.training.multi_table.run_training ``` -It will save the result files inside a `/results` folder within 
this folder + +## Results + +The result files will be saved inside a `/results` folder within this folder (`examples/training/multi_table`). > [!NOTE] diff --git a/examples/training/single_table/README.md b/examples/training/single_table/README.md index 50b162c7..ac6fa12b 100644 --- a/examples/training/single_table/README.md +++ b/examples/training/single_table/README.md @@ -34,7 +34,10 @@ To kick off training, simply run the command below from the project's root folde python -m examples.training.single_table.run_training ``` -It will save the result files inside a `/results` folder within this folder + +## Results + +The result files will be saved inside a `/results` folder within this folder (`examples/training/single_table`). > [!NOTE] From e81aa91e42512cfe40b4f8537858d7346b3e6662 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 13 Nov 2025 13:58:48 -0500 Subject: [PATCH 15/23] Final synthesizer config --- examples/synthesizing/multi_table/config.yaml | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 examples/synthesizing/multi_table/config.yaml diff --git a/examples/synthesizing/multi_table/config.yaml b/examples/synthesizing/multi_table/config.yaml new file mode 100644 index 00000000..f9d77bbc --- /dev/null +++ b/examples/synthesizing/multi_table/config.yaml @@ -0,0 +1,46 @@ +# Training example configuration +# Base data directory (can be overridden from command line) +base_data_dir: examples/synthesizing/single_table/data +results_dir: examples/synthesizing/single_table/results + +diffusion_config: + d_layers: [512, 1024, 1024, 1024, 1024, 512] + dropout: 0.0 + num_timesteps: 100 + model_type: mlp + iterations: 1000 + batch_size: 24 + lr: 0.0006 + gaussian_loss_type: mse + weight_decay: 1e-05 + scheduler: cosine + data_split_ratios: [0.99, 0.005, 0.005] + +clustering_config: + parent_scale: 1.0 + num_clusters: 50 + clustering_method: kmeans_and_gmm + +classifier_config: + d_layers: [128, 256, 512, 1024, 512, 256, 128] + lr: 
0.0001 + dim_t: 128 + batch_size: 4096 + iterations: 1000 + +general_config: + data_dir: examples/synthesizing/single_table/data + test_data_dir: examples/synthesizing/single_table/data + exp_name: single_table_synthesizing + workspace_dir: examples/synthesizing/single_table/results + sample_prefix: "" + +sampling_config: + batch_size: 20000 + classifier_scale: 1.0 + +matching_config: + num_matching_clusters: 1 + matching_batch_size: 1000 + unique_matching: True + no_matching: False From 1c55e91a508add28d975549ae9dde35c9181e0d3 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 13 Nov 2025 14:00:32 -0500 Subject: [PATCH 16/23] actual final configs --- examples/synthesizing/multi_table/config.yaml | 8 ++++---- examples/training/multi_table/config.yaml | 8 ++++---- examples/training/single_table/config.yaml | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/synthesizing/multi_table/config.yaml b/examples/synthesizing/multi_table/config.yaml index f9d77bbc..cc4cd2cc 100644 --- a/examples/synthesizing/multi_table/config.yaml +++ b/examples/synthesizing/multi_table/config.yaml @@ -6,10 +6,10 @@ results_dir: examples/synthesizing/single_table/results diffusion_config: d_layers: [512, 1024, 1024, 1024, 1024, 512] dropout: 0.0 - num_timesteps: 100 + num_timesteps: 2000 model_type: mlp - iterations: 1000 - batch_size: 24 + iterations: 20000 + batch_size: 4096 lr: 0.0006 gaussian_loss_type: mse weight_decay: 1e-05 @@ -26,7 +26,7 @@ classifier_config: lr: 0.0001 dim_t: 128 batch_size: 4096 - iterations: 1000 + iterations: 20000 general_config: data_dir: examples/synthesizing/single_table/data diff --git a/examples/training/multi_table/config.yaml b/examples/training/multi_table/config.yaml index c63e7aaa..6d5e701d 100644 --- a/examples/training/multi_table/config.yaml +++ b/examples/training/multi_table/config.yaml @@ -6,10 +6,10 @@ results_dir: examples/training/multi_table/results diffusion_config: d_layers: [512, 1024, 1024, 1024, 1024, 512] 
dropout: 0.0 - num_timesteps: 100 + num_timesteps: 2000 model_type: mlp - iterations: 1000 - batch_size: 24 + iterations: 20000 + batch_size: 4096 lr: 0.0006 gaussian_loss_type: mse weight_decay: 1e-05 @@ -26,4 +26,4 @@ classifier_config: lr: 0.0001 dim_t: 128 batch_size: 4096 - iterations: 1000 + iterations: 20000 diff --git a/examples/training/single_table/config.yaml b/examples/training/single_table/config.yaml index 7fc54556..719be647 100644 --- a/examples/training/single_table/config.yaml +++ b/examples/training/single_table/config.yaml @@ -8,7 +8,7 @@ diffusion_config: dropout: 0.0 num_timesteps: 2000 model_type: mlp - iterations: 200000 + iterations: 20000 batch_size: 4096 lr: 0.0006 gaussian_loss_type: mse From 37ebe7dfcccb31a7088b1fc65f1a5b41c40877fb Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 13 Nov 2025 14:01:44 -0500 Subject: [PATCH 17/23] removing one extra zero --- examples/synthesizing/single_table/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/synthesizing/single_table/config.yaml b/examples/synthesizing/single_table/config.yaml index 662a4fe1..de21a680 100644 --- a/examples/synthesizing/single_table/config.yaml +++ b/examples/synthesizing/single_table/config.yaml @@ -8,7 +8,7 @@ diffusion_config: dropout: 0.0 num_timesteps: 2000 model_type: mlp - iterations: 200000 + iterations: 20000 batch_size: 4096 lr: 0.0006 gaussian_loss_type: mse From 8974cfe8a1b7a7f4b5ab0f1a3877c32d09f0a6cd Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 13 Nov 2025 14:59:22 -0500 Subject: [PATCH 18/23] finishing the synthsizer example code --- examples/synthesizing/multi_table/README.md | 60 ++++++++++++++ examples/synthesizing/multi_table/config.yaml | 12 +-- .../multi_table/run_synthesizing.py | 81 +++++++++++++++++++ .../single_table/run_synthesizing.py | 6 +- .../models/clavaddpm/data_loaders.py | 5 -- .../models/clavaddpm/synthesizer.py | 10 ++- 6 files changed, 158 insertions(+), 16 deletions(-) create mode 
100644 examples/synthesizing/multi_table/README.md create mode 100644 examples/synthesizing/multi_table/run_synthesizing.py diff --git a/examples/synthesizing/multi_table/README.md b/examples/synthesizing/multi_table/README.md new file mode 100644 index 00000000..8e13a277 --- /dev/null +++ b/examples/synthesizing/multi_table/README.md @@ -0,0 +1,60 @@ +# Multi-Table Synthesizing Example + +This example will go over synthesizing data for a multi-table dataset from the ground +up using the code in this toolkit. + + +## Downloading data + +First, we need the data. Download it from this +[Google Drive link](https://drive.google.com/file/d/1Ao222l4AJjG54-HDEGCWkIfzRbl9_IKa/view?usp=drive_link), +extract the files and place them in a `/data` folder within this folder +(`examples/synthesizing/multi_table`). + +> [!NOTE] +> If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute +> of the [`config.yaml`](config.yaml) file. + +It will contain data for 8 tables: `account`, `card`, `client`, `disp`, `district`, `loan`, `order`, +and `trans`. For each table there will be two files: +- `{table_name}.csv`: The table's data. +- `{table_name}_domain.json`: Metadata about the columns in the table's data, such as data types and sizes. + +Additionally, you will find one more file: +- `dataset_meta.json`: Metadata about the relationship between the tables. It will describe which tables +are associated with which other tables. + + +## Kicking off synthesizing + +If there is a `/results` folder within this folder (`examples/synthesizing/multi_table`) +from a previous training run, we will use that data to kick off synthesizing. +For example, you can copy the results from another run (e.g. `examples.training.multi_table.run_training`) +and paste them here and it will be picked up by this example. + +The [`config.yaml`](config.yaml) file contains the parameters for the synthesizing and also +for training, in case there is a need to run that.
Please take a look at them before kicking +off the synthesizing process and edit them as necessary. + +To kick off synthesizing, simply run the command below from the project's root folder: + +```bash +python -m examples.synthesizing.multi_table.run_synthesizing +``` + +## Results + +It will save the result files inside a `/results` folder within this folder +(`examples/synthesizing/multi_table`). + +> [!NOTE] +> If you wish to change the save folder, you can do so by editing the `results_dir` attribute +> of the (`config.yaml`)[config.yaml] file. + +In the `/results/before_matching/` folder, there will be a file called `synthetic_tables.pkl`, +which is a pickle file containing the synthetic data before the matching process, in case +it's needed. + +The `/results/single_table_synthesizing` folder will contain the final synthesized +data, organized per table. In this single-table example, there is only going to be one +synthesized table under `/results/single_table_synthesizing/trans/_final/trans_synthetic.csv`. 
diff --git a/examples/synthesizing/multi_table/config.yaml b/examples/synthesizing/multi_table/config.yaml index cc4cd2cc..4eea7848 100644 --- a/examples/synthesizing/multi_table/config.yaml +++ b/examples/synthesizing/multi_table/config.yaml @@ -1,7 +1,7 @@ # Training example configuration # Base data directory (can be overridden from command line) -base_data_dir: examples/synthesizing/single_table/data -results_dir: examples/synthesizing/single_table/results +base_data_dir: examples/synthesizing/multi_table/data +results_dir: examples/synthesizing/multi_table/results diffusion_config: d_layers: [512, 1024, 1024, 1024, 1024, 512] @@ -29,10 +29,10 @@ classifier_config: iterations: 20000 general_config: - data_dir: examples/synthesizing/single_table/data - test_data_dir: examples/synthesizing/single_table/data - exp_name: single_table_synthesizing - workspace_dir: examples/synthesizing/single_table/results + data_dir: examples/synthesizing/multi_table/data + test_data_dir: examples/synthesizing/multi_table/data + exp_name: multi_table_synthesizing + workspace_dir: examples/synthesizing/multi_table/results sample_prefix: "" sampling_config: diff --git a/examples/synthesizing/multi_table/run_synthesizing.py b/examples/synthesizing/multi_table/run_synthesizing.py new file mode 100644 index 00000000..9d4dbe46 --- /dev/null +++ b/examples/synthesizing/multi_table/run_synthesizing.py @@ -0,0 +1,81 @@ +import pickle +from logging import INFO +from pathlib import Path + +import hydra +from omegaconf import DictConfig + +from examples.training.multi_table import run_training +from midst_toolkit.common.config import GeneralConfig, MatchingConfig, SamplingConfig +from midst_toolkit.common.logger import TOOLKIT_LOGGER, log +from midst_toolkit.models.clavaddpm.data_loaders import load_tables +from midst_toolkit.models.clavaddpm.synthesizer import clava_synthesizing + + +# Preventing some excessive logging +TOOLKIT_LOGGER.setLevel(INFO) + + +@hydra.main(config_path=".", 
config_name="config", version_base=None) +def main(config: DictConfig) -> None: + """ + Run the synthesizing pipeline for a multi-table diffusion model. + + It will load the config and then data from the `config.base_data_dir` folder, + train the model, synthesize the data and save the results in the + `config.results_dir` folder. + + It will first look for a pre-trained model in the `config.results_dir` folder. + If it doesn't find one, it will train a new model from scratch. + + Args: + config: Training and synthesizing configuration as an OmegaConf DictConfig object. + """ + log(INFO, f"Checking for a pre-trained model in {config.results_dir}...") + + _, relation_order, _ = load_tables(Path(config.base_data_dir)) + + model_file_paths = {} + for relation in relation_order: + model_file_path = Path(config.results_dir) / "models" / f"{relation[0]}_{relation[1]}_ckpt.pkl" + model_file_paths[relation] = model_file_path + + clustering_results_file = Path(config.results_dir) / "cluster_ckpt.pkl" + + if all(model_file.exists() for model_file in model_file_paths.values()) and clustering_results_file.exists(): + log(INFO, f"Found a pre-trained models in {config.results_dir}. 
Skipping training.") + else: + log(INFO, "No pre-trained models found, training a new model from scratch...") + run_training.main(config) + + log(INFO, "Loading models...") + + models = {} + for relation in relation_order: + with open(model_file_paths[relation], "rb") as f: + models[relation] = pickle.load(f) + + with open(clustering_results_file, "rb") as f: + clustering_result = pickle.load(f) + + tables = clustering_result["tables"] + all_group_lengths_prob_dicts = clustering_result["all_group_lengths_prob_dicts"] + + log(INFO, "Synthesizing data...") + + clava_synthesizing( + tables, + relation_order, + Path(config.results_dir), + models, + GeneralConfig(**config.general_config), + SamplingConfig(**config.sampling_config), + MatchingConfig(**config.matching_config), + all_group_lengths_prob_dicts, + ) + + log(INFO, "Data synthesized successfully.") + + +if __name__ == "__main__": + main() diff --git a/examples/synthesizing/single_table/run_synthesizing.py b/examples/synthesizing/single_table/run_synthesizing.py index 4fb5ce4b..44d46027 100644 --- a/examples/synthesizing/single_table/run_synthesizing.py +++ b/examples/synthesizing/single_table/run_synthesizing.py @@ -29,7 +29,7 @@ def main(config: DictConfig) -> None: If it doesn't find one, it will train a new model from scratch. Args: - config: Training configuration as an OmegaConf DictConfig object. + config: Training and synthesizing configuration as an OmegaConf DictConfig object. 
""" log(INFO, f"Checking for a pre-trained model in {config.results_dir}...") @@ -46,13 +46,15 @@ def main(config: DictConfig) -> None: log(INFO, "No pre-trained models found, training a new model from scratch...") run_training.main(config) - log(INFO, "Synthesizing data...") + log(INFO, "Loading models...") models = {} for relation in relation_order: with open(model_file_paths[relation], "rb") as f: models[relation] = pickle.load(f) + log(INFO, "Synthesizing data...") + clava_synthesizing( tables, relation_order, diff --git a/src/midst_toolkit/models/clavaddpm/data_loaders.py b/src/midst_toolkit/models/clavaddpm/data_loaders.py index c9e09cce..2cb10746 100644 --- a/src/midst_toolkit/models/clavaddpm/data_loaders.py +++ b/src/midst_toolkit/models/clavaddpm/data_loaders.py @@ -125,11 +125,6 @@ def load_tables( info=info, ) - # Adding the no parent placeholder column in the tables with no parent - for parent, child in relation_order: - if parent is None: - tables[child].data[NO_PARENT_COLUMN_NAME] = list(range(len(tables[child].data))) - return tables, relation_order, dataset_meta diff --git a/src/midst_toolkit/models/clavaddpm/synthesizer.py b/src/midst_toolkit/models/clavaddpm/synthesizer.py index cbbdddc1..44741bcd 100644 --- a/src/midst_toolkit/models/clavaddpm/synthesizer.py +++ b/src/midst_toolkit/models/clavaddpm/synthesizer.py @@ -17,7 +17,7 @@ from midst_toolkit.common.config import GeneralConfig, MatchingConfig, SamplingConfig from midst_toolkit.common.enumerations import DataSplit from midst_toolkit.common.logger import log -from midst_toolkit.models.clavaddpm.data_loaders import Tables +from midst_toolkit.models.clavaddpm.data_loaders import NO_PARENT_COLUMN_NAME, Tables from midst_toolkit.models.clavaddpm.dataset import Dataset, TableMetadata, Transformations from midst_toolkit.models.clavaddpm.enumerations import ( CategoricalEncoding, @@ -626,9 +626,9 @@ def sample_from_dict(probabilities: dict[int, float]) -> int: Returns: The sampled key. 
""" - assert sum(probabilities.values()) == 1.0, "The sum of all probabilities must be 1.0." + assert np.isclose(sum(probabilities.values()), 1), "The sum of all probabilities must be 1." - # Generate a random number between 0 and 1 + # Generate a random number between [0, 1) random_number = random.random() # Initialize cumulative sum and the selected key @@ -755,6 +755,10 @@ def clava_synthesizing( log(INFO, "Sample size: {}".format(int(sample_scale * len(df_without_id)))) if parent is None: + # Adding the no parent placeholder column in case it doesn't have it + if NO_PARENT_COLUMN_NAME not in df_without_id.columns: + df_without_id[NO_PARENT_COLUMN_NAME] = list(range(len(df_without_id))) + # synthesize data for single table or tables with no parent synthesized_df, table_keys = _synthesize_single_table( child, From 3a3814d271469aae7f25c46eace2cecc6872a04a Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 13 Nov 2025 15:28:38 -0500 Subject: [PATCH 19/23] making the save dir in case it doesn't exist --- src/midst_toolkit/models/clavaddpm/clustering.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/midst_toolkit/models/clavaddpm/clustering.py b/src/midst_toolkit/models/clavaddpm/clustering.py index 1299d85b..b841ec6d 100644 --- a/src/midst_toolkit/models/clavaddpm/clustering.py +++ b/src/midst_toolkit/models/clavaddpm/clustering.py @@ -59,6 +59,8 @@ def clava_clustering( "tables": tables, "all_group_lengths_prob_dicts": all_group_lengths_prob_dicts, } + + save_dir.mkdir(parents=True, exist_ok=True) with open(save_dir / "cluster_ckpt.pkl", "wb") as f: pickle.dump(cluster_ckpt, f) From 32e9ab33f2297911ae79233baeaff65451307b5e Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 13 Nov 2025 15:56:10 -0500 Subject: [PATCH 20/23] Fixing tests --- tests/integration/models/clavaddpm/test_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/models/clavaddpm/test_model.py 
b/tests/integration/models/clavaddpm/test_model.py index 4bb3b965..43689ff0 100644 --- a/tests/integration/models/clavaddpm/test_model.py +++ b/tests/integration/models/clavaddpm/test_model.py @@ -122,7 +122,7 @@ def test_load_single_table(): }, ) - assert relation_order == [[None, "trans"]] + assert relation_order == [(None, "trans")] assert dataset_meta["relation_order"] == [[None, "trans"]] assert dataset_meta["tables"] == {"trans": {"children": [], "parents": []}} @@ -225,7 +225,7 @@ def test_load_tables(): }, ) - assert relation_order == [[None, "account"], ["account", "trans"]] + assert relation_order == [(None, "account"), ("account", "trans")] assert dataset_meta["relation_order"] == [[None, "account"], ["account", "trans"]] assert dataset_meta["tables"] == { "account": {"children": ["trans"], "parents": []}, From 12f6491271e3123ab59541564132206c5e76c1b2 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 13 Nov 2025 16:03:38 -0500 Subject: [PATCH 21/23] Finishing the instructions for the multi table example --- examples/synthesizing/multi_table/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/synthesizing/multi_table/README.md b/examples/synthesizing/multi_table/README.md index 8e13a277..8124c68b 100644 --- a/examples/synthesizing/multi_table/README.md +++ b/examples/synthesizing/multi_table/README.md @@ -55,6 +55,6 @@ In the `/results/before_matching/` folder, there will be a file called `syntheti which is a pickle file containing the synthetic data before the matching process, in case it's needed. -The `/results/single_table_synthesizing` folder will contain the final synthesized -data, organized per table. In this single-table example, there is only going to be one -synthesized table under `/results/single_table_synthesizing/trans/_final/trans_synthetic.csv`. 
+The `/results/multi_table_synthesizing` folder will contain the final synthesized +data, organized per table, in the form of `.csv` files with the following naming pattern: +`/results/multi_table_synthesizing/{table_name}/_final/{table_name}_synthetic.csv`. From bc9af11df36656fa15ab3bd060b0885a2f545236 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Thu, 13 Nov 2025 16:08:26 -0500 Subject: [PATCH 22/23] CR by coderabbit --- examples/synthesizing/multi_table/README.md | 6 +++--- examples/synthesizing/multi_table/run_synthesizing.py | 2 +- examples/synthesizing/single_table/README.md | 6 +++--- examples/synthesizing/single_table/run_synthesizing.py | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/synthesizing/multi_table/README.md b/examples/synthesizing/multi_table/README.md index 8124c68b..737b49ec 100644 --- a/examples/synthesizing/multi_table/README.md +++ b/examples/synthesizing/multi_table/README.md @@ -8,12 +8,12 @@ up using the code in this toolkit. First, we need the data. Download it from this [Google Drive link](https://drive.google.com/file/d/1Ao222l4AJjG54-HDEGCWkIfzRbl9_IKa/view?usp=drive_link), -extract the files and place them in a `/data` folder in within this folder +extract the files and place them in a `/data` folder within this folder (`examples/synthesizing/multi_table`). > [!NOTE] > If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute -> of the (`config.yaml`)[config.yaml] file. +> of the [`config.yaml`](config.yaml) file. It will contain data for 8 tables: `account`, `card`, `client`, `disp`, `district`, `loan`, `order`, and `trans`. For each table there will be two files: @@ -49,7 +49,7 @@ It will save the result files inside a `/results` folder within this folder > [!NOTE] > If you wish to change the save folder, you can do so by editing the `results_dir` attribute -> of the (`config.yaml`)[config.yaml] file. +> of the [`config.yaml`](config.yaml) file. 
In the `/results/before_matching/` folder, there will be a file called `synthetic_tables.pkl`, which is a pickle file containing the synthetic data before the matching process, in case diff --git a/examples/synthesizing/multi_table/run_synthesizing.py b/examples/synthesizing/multi_table/run_synthesizing.py index 9d4dbe46..b19678f6 100644 --- a/examples/synthesizing/multi_table/run_synthesizing.py +++ b/examples/synthesizing/multi_table/run_synthesizing.py @@ -43,7 +43,7 @@ def main(config: DictConfig) -> None: clustering_results_file = Path(config.results_dir) / "cluster_ckpt.pkl" if all(model_file.exists() for model_file in model_file_paths.values()) and clustering_results_file.exists(): - log(INFO, f"Found a pre-trained models in {config.results_dir}. Skipping training.") + log(INFO, f"Found pre-trained models in {config.results_dir}. Skipping training.") else: log(INFO, "No pre-trained models found, training a new model from scratch...") run_training.main(config) diff --git a/examples/synthesizing/single_table/README.md b/examples/synthesizing/single_table/README.md index 15c1bca8..5f6f1f51 100644 --- a/examples/synthesizing/single_table/README.md +++ b/examples/synthesizing/single_table/README.md @@ -8,12 +8,12 @@ up using the code in this toolkit. First, we need the data. Download it from this [Google Drive link](https://drive.google.com/file/d/1J5qDuMHHg4dm9c3ISmb41tcTHSu1SVUC/view?usp=drive_link), -extract the files and place them in a `/data` folder in within this folder +extract the files and place them in a `/data` folder within this folder (`examples/synthesizing/single_table`). > [!NOTE] > If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute -> of the (`config.yaml`)[config.yaml] file. +> of the [`config.yaml`](config.yaml) file. Here is a description of the files that have been extracted: - `trans.csv`: The training data. 
It consists of information about bank transactions and it @@ -47,7 +47,7 @@ It will save the result files inside a `/results` folder within this folder > [!NOTE] > If you wish to change the save folder, you can do so by editing the `results_dir` attribute -> of the (`config.yaml`)[config.yaml] file. +> of the [`config.yaml`](config.yaml) file. In the `/results/before_matching/` folder, there will be a file called `synthetic_tables.pkl`, which is a pickle file containing the synthetic data before the matching process, in case diff --git a/examples/synthesizing/single_table/run_synthesizing.py b/examples/synthesizing/single_table/run_synthesizing.py index 44d46027..72449103 100644 --- a/examples/synthesizing/single_table/run_synthesizing.py +++ b/examples/synthesizing/single_table/run_synthesizing.py @@ -41,7 +41,7 @@ def main(config: DictConfig) -> None: model_file_paths[relation] = model_file_path if all(model_file.exists() for model_file in model_file_paths.values()): - log(INFO, f"Found a pre-trained models in {config.results_dir}. Skipping training.") + log(INFO, f"Found pre-trained models in {config.results_dir}. 
Skipping training.") else: log(INFO, "No pre-trained models found, training a new model from scratch...") run_training.main(config) From 02bcf422ee07be56836d299bc9da5b09559fd402 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Mon, 17 Nov 2025 15:31:47 -0500 Subject: [PATCH 23/23] David's CR --- examples/synthesizing/multi_table/config.yaml | 3 +++ .../multi_table/run_synthesizing.py | 19 ++++++++++----- .../synthesizing/single_table/config.yaml | 2 ++ .../single_table/run_synthesizing.py | 24 ++++++++++++++----- 4 files changed, 36 insertions(+), 12 deletions(-) diff --git a/examples/synthesizing/multi_table/config.yaml b/examples/synthesizing/multi_table/config.yaml index 4eea7848..bbfcff30 100644 --- a/examples/synthesizing/multi_table/config.yaml +++ b/examples/synthesizing/multi_table/config.yaml @@ -3,6 +3,8 @@ base_data_dir: examples/synthesizing/multi_table/data results_dir: examples/synthesizing/multi_table/results +# diffusion_config, clustering_config, and classifier_config are only required +# when training a new model from scratch diffusion_config: d_layers: [512, 1024, 1024, 1024, 1024, 512] dropout: 0.0 @@ -28,6 +30,7 @@ classifier_config: batch_size: 4096 iterations: 20000 +# Synthesizing configuration general_config: data_dir: examples/synthesizing/multi_table/data test_data_dir: examples/synthesizing/multi_table/data diff --git a/examples/synthesizing/multi_table/run_synthesizing.py b/examples/synthesizing/multi_table/run_synthesizing.py index b19678f6..9d845e2e 100644 --- a/examples/synthesizing/multi_table/run_synthesizing.py +++ b/examples/synthesizing/multi_table/run_synthesizing.py @@ -1,6 +1,7 @@ import pickle from logging import INFO from pathlib import Path +from typing import Any import hydra from omegaconf import DictConfig @@ -9,6 +10,7 @@ from midst_toolkit.common.config import GeneralConfig, MatchingConfig, SamplingConfig from midst_toolkit.common.logger import TOOLKIT_LOGGER, log from midst_toolkit.models.clavaddpm.data_loaders 
import load_tables +from midst_toolkit.models.clavaddpm.enumerations import Relation from midst_toolkit.models.clavaddpm.synthesizer import clava_synthesizing @@ -35,24 +37,29 @@ def main(config: DictConfig) -> None: _, relation_order, _ = load_tables(Path(config.base_data_dir)) - model_file_paths = {} + model_file_paths: dict[Relation, dict[str, Any]] = {} for relation in relation_order: model_file_path = Path(config.results_dir) / "models" / f"{relation[0]}_{relation[1]}_ckpt.pkl" - model_file_paths[relation] = model_file_path + model_file_paths[relation] = { + "file_path": model_file_path, + "exists": model_file_path.exists(), + } clustering_results_file = Path(config.results_dir) / "cluster_ckpt.pkl" - if all(model_file.exists() for model_file in model_file_paths.values()) and clustering_results_file.exists(): - log(INFO, f"Found pre-trained models in {config.results_dir}. Skipping training.") + if all(result["exists"] for result in model_file_paths.values()) and clustering_results_file.exists(): + log(INFO, f"Found previous results in {config.results_dir}. Skipping training.") else: - log(INFO, "No pre-trained models found, training a new model from scratch...") + log(INFO, "Not all previous results found. Training a new model from scratch.") + log(INFO, f"Summary of results: {model_file_paths}") + log(INFO, f"Clustering results file: {clustering_results_file} exists? 
{clustering_results_file.exists()}") run_training.main(config) log(INFO, "Loading models...") models = {} for relation in relation_order: - with open(model_file_paths[relation], "rb") as f: + with open(model_file_paths[relation]["file_path"], "rb") as f: models[relation] = pickle.load(f) with open(clustering_results_file, "rb") as f: diff --git a/examples/synthesizing/single_table/config.yaml b/examples/synthesizing/single_table/config.yaml index de21a680..b3cbb0e2 100644 --- a/examples/synthesizing/single_table/config.yaml +++ b/examples/synthesizing/single_table/config.yaml @@ -3,6 +3,7 @@ base_data_dir: examples/synthesizing/single_table/data results_dir: examples/synthesizing/single_table/results +# diffusion_config is only required when training a new model from scratch diffusion_config: d_layers: [512, 1024, 1024, 1024, 1024, 512] dropout: 0.0 @@ -16,6 +17,7 @@ diffusion_config: scheduler: cosine data_split_ratios: [0.99, 0.005, 0.005] +# Synthesizing configuration general_config: data_dir: examples/synthesizing/single_table/data test_data_dir: examples/synthesizing/single_table/data diff --git a/examples/synthesizing/single_table/run_synthesizing.py b/examples/synthesizing/single_table/run_synthesizing.py index 72449103..b9f6a649 100644 --- a/examples/synthesizing/single_table/run_synthesizing.py +++ b/examples/synthesizing/single_table/run_synthesizing.py @@ -1,6 +1,7 @@ import pickle from logging import INFO from pathlib import Path +from typing import Any import hydra from omegaconf import DictConfig @@ -9,6 +10,7 @@ from midst_toolkit.common.config import GeneralConfig, MatchingConfig, SamplingConfig from midst_toolkit.common.logger import TOOLKIT_LOGGER, log from midst_toolkit.models.clavaddpm.data_loaders import load_tables +from midst_toolkit.models.clavaddpm.enumerations import Relation from midst_toolkit.models.clavaddpm.synthesizer import clava_synthesizing @@ -35,22 +37,32 @@ def main(config: DictConfig) -> None: tables, relation_order, _ = 
load_tables(Path(config.base_data_dir)) - model_file_paths = {} + assert len(relation_order) == 1 and relation_order[0][0] is None, ( + "Relation order is not configured for single-table. " + "For multi-table synthesizing, please use the `examples.synthesizing.multi_table.run_synthesizing` example. " + f"Relation order: {relation_order}" + ) + + model_file_paths: dict[Relation, dict[str, Any]] = {} for relation in relation_order: model_file_path = Path(config.results_dir) / "models" / f"{relation[0]}_{relation[1]}_ckpt.pkl" - model_file_paths[relation] = model_file_path + model_file_paths[relation] = { + "file_path": model_file_path, + "exists": model_file_path.exists(), + } - if all(model_file.exists() for model_file in model_file_paths.values()): - log(INFO, f"Found pre-trained models in {config.results_dir}. Skipping training.") + if all(result["exists"] for result in model_file_paths.values()): + log(INFO, f"Found previous results in {config.results_dir}. Skipping training.") else: - log(INFO, "No pre-trained models found, training a new model from scratch...") + log(INFO, "Not all previous results found. Training a new model from scratch.") + log(INFO, f"Summary of results: {model_file_paths}") run_training.main(config) log(INFO, "Loading models...") models = {} for relation in relation_order: - with open(model_file_paths[relation], "rb") as f: + with open(model_file_paths[relation]["file_path"], "rb") as f: models[relation] = pickle.load(f) log(INFO, "Synthesizing data...")