From 2550c85c8f233f56780498c89c0ae40c2e07294a Mon Sep 17 00:00:00 2001
From: Marcelo Lotif <marcelo.lotif@vectorinstitute.ai>
Date: Tue, 11 Nov 2025 16:27:41 -0500
Subject: [PATCH 01/23] WIP needs to save results

---
 .gitignore                                    |  6 +++
 examples/training/single_table/README.md      | 38 +++++++++++++++++
 examples/training/single_table/config.yaml    | 17 ++++++++
 .../training/single_table/run_training.py     | 42 +++++++++++++++++++
 4 files changed, 103 insertions(+)
 create mode 100644 examples/training/single_table/README.md
 create mode 100644 examples/training/single_table/config.yaml
 create mode 100644 examples/training/single_table/run_training.py

diff --git a/.gitignore b/.gitignore
index dde72bbc..611e0bc1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -41,3 +41,9 @@ outputs/
 
 # mkdocs site
 site/
+
+# Training examples
+examples/training/single_table/data/**
+examples/training/single_table/results/**
+examples/training/multi_table/data/**
+examples/training/multi_table/results/**
diff --git a/examples/training/single_table/README.md b/examples/training/single_table/README.md
new file mode 100644
index 00000000..450078b8
--- /dev/null
+++ b/examples/training/single_table/README.md
@@ -0,0 +1,38 @@
+# Single-Table Training Example
+
+This example will go over traning a single-table diffusion model from the ground up using the
+code in this toolkit.
+
+## Downloading data
+
+First, we need the data. Download it from this
+[Google Drive link](https://drive.google.com/file/d/1J5qDuMHHg4dm9c3ISmb41tcTHSu1SVUC/view?usp=drive_link),
+extract the files and place them `/data` folder in within this folder (`examples/training/single_table`).
+
+> [!NOTE]
+> If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute
+> of the [`config.yaml`](config.yaml) file.
+
+Here is a description of the files:
+- `trans.csv`: The training data. It consists of information about bank transactions and it
+contains 20,000 data points.
+- `trans_domain.json`: Metadata about the columns in `trans.csv`, such as data types and sizes.
+- `dataset_meta.json`: Metadata about the relationship between the tables. Since this is a
+single-table example, it will only contain information about the `trans` table.
+
+## Kicking off traning
+
+The [`config.yaml`] file contains the parameters for the training. Please take a look a them
+beforee kicking off the training and edit them as necessary.
+
+To kick off training, simply run the command below from the project's root folder:
+
+```bash
+python -m examples.training.single_table.run_training
+```
+
+It will save the files inside a `/results` folder within this folder (`examples/training/single_table`).
+
+> [!NOTE]
+> If you wish to change the save folder, you can do so by editing the `results_dir` attribute
+> of the [`config.yaml`](config.yaml) file.
diff --git a/examples/training/single_table/config.yaml b/examples/training/single_table/config.yaml
new file mode 100644
index 00000000..a0710316
--- /dev/null
+++ b/examples/training/single_table/config.yaml
@@ -0,0 +1,17 @@
+# Training example configuration
+# Base data directory (can be overridden from command line)
+base_data_dir: examples/training/data
+results_dir: examples/training/results
+
+diffusion_config:
+  d_layers: [512, 1024, 1024, 1024, 1024, 512]
+  dropout: 0.0
+  num_timesteps: 2000
+  model_type: mlp
+  iterations: 200000
+  batch_size: 4096
+  lr: 0.0006
+  gaussian_loss_type: mse
+  weight_decay: 1e-05
+  scheduler: cosine
+  data_split_ratios: [0.99, 0.005, 0.005]
diff --git a/examples/training/single_table/run_training.py b/examples/training/single_table/run_training.py
new file mode 100644
index 00000000..f89cfeb1
--- /dev/null
+++ b/examples/training/single_table/run_training.py
@@ -0,0 +1,42 @@
+from logging import INFO
+from pathlib import Path
+
+import hydra
+from omegaconf import DictConfig
+
+from midst_toolkit.common.config import DiffusionConfig
+from midst_toolkit.common.logger import log
+from midst_toolkit.common.variables import DEVICE
+from midst_toolkit.models.clavaddpm.data_loaders import load_tables
+from midst_toolkit.models.clavaddpm.train import clava_training
+
+
+@hydra.main(config_path=".", config_name="config", version_base=None)
+def main(config: DictConfig) -> None:
+    """
+    Run the training pipeline.
+
+    It will load the config and then data from the `config.base_data_dir` folder,
+    train the model and save the results in the `config.results_dir` folder.
+
+    Args:
+        config: Training configuration as an OmegaConf DictConfig object.
+    """
+    log(INFO, f"Loading data from {config.base_data_dir}...")
+    tables, relation_order, _ = load_tables(Path(config.base_data_dir))
+
+    log(INFO, "Training model...")
+    diffusion_config = DiffusionConfig(**config.diffusion_config)
+
+    tables, _ = clava_training(
+        tables,
+        relation_order,
+        Path(config.results_dir),
+        diffusion_config,
+        device=DEVICE,
+    )
+    log(INFO, "Model trained successfully.")
+
+
+if __name__ == "__main__":
+    main()

From 8146b8a54b91cf159ce47dd01cf65242a91832c4 Mon Sep 17 00:00:00 2001
From: Marcelo Lotif <marcelo.lotif@vectorinstitute.ai>
Date: Tue, 11 Nov 2025 16:58:25 -0500
Subject: [PATCH 02/23] Done the single table

---
 examples/training/single_table/README.md      | 28 +++++++++++++++++--
 .../training/single_table/run_training.py     | 14 +++++++++-
 2 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/examples/training/single_table/README.md b/examples/training/single_table/README.md
index 450078b8..f521106f 100644
--- a/examples/training/single_table/README.md
+++ b/examples/training/single_table/README.md
@@ -3,23 +3,26 @@
 This example will go over traning a single-table diffusion model from the ground up using the
 code in this toolkit.
 
+
 ## Downloading data
 
 First, we need the data. Download it from this
 [Google Drive link](https://drive.google.com/file/d/1J5qDuMHHg4dm9c3ISmb41tcTHSu1SVUC/view?usp=drive_link),
-extract the files and place them `/data` folder in within this folder (`examples/training/single_table`).
+extract the files and place them in a `/data` folder within this folder
+(`examples/training/single_table`).
 
 > [!NOTE]
 > If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute
 > of the [`config.yaml`](config.yaml) file.
 
-Here is a description of the files:
+Here is a description of the files that have been extracted:
 - `trans.csv`: The training data. It consists of information about bank transactions and it
 contains 20,000 data points.
 - `trans_domain.json`: Metadata about the columns in `trans.csv`, such as data types and sizes.
 - `dataset_meta.json`: Metadata about the relationship between the tables. Since this is a
 single-table example, it will only contain information about the `trans` table.
 
+
 ## Kicking off traning
 
 The [`config.yaml`] file contains the parameters for the training. Please take a look a them
@@ -31,8 +34,27 @@ To kick off training, simply run the command below from the project's root folde
 python -m examples.training.single_table.run_training
 ```
 
-It will save the files inside a `/results` folder within this folder (`examples/training/single_table`).
+It will save the result files inside a `/results` folder within this folder
+(`examples/training/single_table`).
 
 > [!NOTE]
 > If you wish to change the save folder, you can do so by editing the `results_dir` attribute
 > of the [`config.yaml`](config.yaml) file.
+
+In the `/results/models/` folder, there will be a file called `None_trans_ckpt.pkl`,
+which is a pickle file containing the training results. You can load it using Python's
+`pickle` and it will yield an instance of
+`midst_toolkit.models.clavaddpm.train.ModelArtifacts`, which contains the trained
+diffusion model along with some additional metadata about the training process:
+
+```python
+import pickle
+from midst_toolkit.models.clavaddpm.train import ModelArtifacts
+
+results_file = Path("examples/single_table/results/models/None_trans_ckpt.pkl")
+
+ with open(results_file, "rb") as f:
+    result = pickle.load(f)
+
+assert isinstance(result, ModelArtifacts)
+```
diff --git a/examples/training/single_table/run_training.py b/examples/training/single_table/run_training.py
index f89cfeb1..ae7c4ed0 100644
--- a/examples/training/single_table/run_training.py
+++ b/examples/training/single_table/run_training.py
@@ -1,3 +1,4 @@
+import pickle
 from logging import INFO
 from pathlib import Path
 
@@ -8,7 +9,7 @@
 from midst_toolkit.common.logger import log
 from midst_toolkit.common.variables import DEVICE
 from midst_toolkit.models.clavaddpm.data_loaders import load_tables
-from midst_toolkit.models.clavaddpm.train import clava_training
+from midst_toolkit.models.clavaddpm.train import ModelArtifacts, clava_training
 
 
 @hydra.main(config_path=".", config_name="config", version_base=None)
@@ -37,6 +38,17 @@ def main(config: DictConfig) -> None:
     )
     log(INFO, "Model trained successfully.")
 
+    results_file = Path(config.results_dir) / "models" / "None_trans_ckpt.pkl"
+    log(INFO, f"Checking the results from {results_file}...")
+
+    with open(results_file, "rb") as f:
+        result = pickle.load(f)
+
+    # Asserting the results are the correct type
+    assert isinstance(result, ModelArtifacts)
+
+    log(INFO, f"Result size (in bytes): {results_file.stat().st_size}")
+
 
 if __name__ == "__main__":
     main()

From 00d879a85512d50213353f26740a09295095ed45 Mon Sep 17 00:00:00 2001
From: Marcelo Lotif <marcelo.lotif@vectorinstitute.ai>
Date: Wed, 12 Nov 2025 14:55:11 -0500
Subject: [PATCH 03/23] Finished adding the multi table example

---
 examples/training/multi_table/README.md       | 66 +++++++++++++++++
 examples/training/multi_table/config.yaml     | 29 ++++++++
 examples/training/multi_table/run_training.py | 74 +++++++++++++++++++
 examples/training/single_table/config.yaml    |  4 +-
 .../training/single_table/run_training.py     |  6 +-
 .../models/clavaddpm/clustering.py            |  4 +
 6 files changed, 180 insertions(+), 3 deletions(-)
 create mode 100644 examples/training/multi_table/README.md
 create mode 100644 examples/training/multi_table/config.yaml
 create mode 100644 examples/training/multi_table/run_training.py

diff --git a/examples/training/multi_table/README.md b/examples/training/multi_table/README.md
new file mode 100644
index 00000000..380b5c86
--- /dev/null
+++ b/examples/training/multi_table/README.md
@@ -0,0 +1,66 @@
+# Multi-Table Training Example
+
+This example will go over traning a multi-table diffusion model from the ground up using the
+code in this toolkit.
+
+
+## Downloading data
+
+First, we need the data. Download it from this
+[Google Drive link](https://drive.google.com/file/d/1Ao222l4AJjG54-HDEGCWkIfzRbl9_IKa/view?usp=drive_link),
+extract the files and place them in a `/data` folder within this folder
+(`examples/training/multi_table`).
+
+> [!NOTE]
+> If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute
+> of the [`config.yaml`](config.yaml) file.
+
+It will contain data for 8 tables: `account`, `card`, `client`, `disp`, `district`, `loan`, `order`,
+and `trans`. For each table there will be two files:
+- `{table_name}.csv`: The table's data.
+- `{table_name}_domain.json`: Metadata about the columns in the table's data, such as data types and sizes.
+
+Additionally, you will find one more file:
+- `dataset_meta.json`: Metadata about the relationship between the tables. It will describe which tables
+are associated with which other tables.
+
+
+## Kicking off traning
+
+The [`config.yaml`] file contains the parameters for the training. Please take a look a them
+beforee kicking off the training and edit them as necessary.
+
+To kick off training, simply run the command below from the project's root folder:
+
+```bash
+python -m examples.training.multi_table.run_training
+```
+
+It will save the result files inside a `/results` folder within this folder
+(`examples/training/multi_table`).
+
+> [!NOTE]
+> If you wish to change the save folder, you can do so by editing the `results_dir` attribute
+> of the [`config.yaml`](config.yaml) file.
+
+One of the results file is `/results/cluster_ckpt.pkl`, which will contain the results
+of the clustering step.
+
+The other result files are in the `/results/models/` folder. They will be named after the
+table relations defined in `dataset_meta.json`. For eexample: for the `("client", "account")`
+relation, there will be a file called `client_account_ckpt.pkl`, which is a pickle file
+containing the training results. You can load it using Python's `pickle` and it will yield
+an instance of `midst_toolkit.models.clavaddpm.train.ModelArtifacts`, which contains the
+trained diffusion model along with some additional metadata about the training process:
+
+```python
+import pickle
+from midst_toolkit.models.clavaddpm.train import ModelArtifacts
+
+results_file = Path("examples/multi_table/results/models/client_account_ckpt.pkl")
+
+ with open(results_file, "rb") as f:
+    result = pickle.load(f)
+
+assert isinstance(result, ModelArtifacts)
+```
diff --git a/examples/training/multi_table/config.yaml b/examples/training/multi_table/config.yaml
new file mode 100644
index 00000000..6d5e701d
--- /dev/null
+++ b/examples/training/multi_table/config.yaml
@@ -0,0 +1,29 @@
+# Multi-table training example configuration
+# Base data directory (can be overridden from command line)
+base_data_dir: examples/training/multi_table/data
+results_dir: examples/training/multi_table/results
+
+diffusion_config:
+  d_layers: [512, 1024, 1024, 1024, 1024, 512]
+  dropout: 0.0
+  num_timesteps: 2000
+  model_type: mlp
+  iterations: 20000
+  batch_size: 4096
+  lr: 0.0006
+  gaussian_loss_type: mse
+  weight_decay: 1e-05
+  scheduler: cosine
+  data_split_ratios: [0.99, 0.005, 0.005]
+
+clustering_config:
+  parent_scale: 1.0
+  num_clusters: 50
+  clustering_method: kmeans_and_gmm
+
+classifier_config:
+    d_layers: [128, 256, 512, 1024, 512, 256, 128]
+    lr: 0.0001
+    dim_t: 128
+    batch_size: 4096
+    iterations: 20000
diff --git a/examples/training/multi_table/run_training.py b/examples/training/multi_table/run_training.py
new file mode 100644
index 00000000..b0d1e2f8
--- /dev/null
+++ b/examples/training/multi_table/run_training.py
@@ -0,0 +1,74 @@
+import pickle
+from logging import INFO
+from pathlib import Path
+
+import hydra
+from omegaconf import DictConfig
+
+from midst_toolkit.common.config import ClassifierConfig, ClusteringConfig, DiffusionConfig
+from midst_toolkit.common.logger import TOOLKIT_LOGGER, log
+from midst_toolkit.common.variables import DEVICE
+from midst_toolkit.models.clavaddpm.clustering import clava_clustering
+from midst_toolkit.models.clavaddpm.data_loaders import Table, load_tables
+from midst_toolkit.models.clavaddpm.train import ModelArtifacts, clava_training
+
+
+# Preventing some excessive logging
+TOOLKIT_LOGGER.setLevel(INFO)
+
+
+@hydra.main(config_path=".", config_name="config", version_base=None)
+def main(config: DictConfig) -> None:
+    """
+    Run the training pipeline.
+
+    It will load the config and then data from the `config.base_data_dir` folder,
+    train the model and save the results in the `config.results_dir` folder.
+
+    Args:
+        config: Training configuration as an OmegaConf DictConfig object.
+    """
+    log(INFO, f"Loading data from {config.base_data_dir}...")
+    tables, relation_order, _ = load_tables(Path(config.base_data_dir))
+
+    log(INFO, "Clustering data...")
+    clustering_config = ClusteringConfig(**config.clustering_config)
+    tables, _ = clava_clustering(tables, relation_order, Path(config.results_dir), clustering_config)
+
+    log(INFO, "Training model...")
+    diffusion_config = DiffusionConfig(**config.diffusion_config)
+    classifier_config = ClassifierConfig(**config.classifier_config)
+
+    tables, _ = clava_training(
+        tables,
+        relation_order,
+        Path(config.results_dir),
+        diffusion_config,
+        classifier_config,
+        device=DEVICE,
+    )
+    log(INFO, "Model trained successfully.")
+
+    log(INFO, "Checking the clustering results...")
+    clustering_results_file = Path(config.results_dir) / "cluster_ckpt.pkl"
+    with open(clustering_results_file, "rb") as f:
+        clustering_result = pickle.load(f)
+
+    assert all(isinstance(table, Table) for table in clustering_result["tables"].values())
+    assert isinstance(clustering_result["all_group_lengths_prob_dicts"], dict)
+
+    for relation in relation_order:
+        results_file = Path(config.results_dir) / "models" / f"{relation[0]}_{relation[1]}_ckpt.pkl"
+        log(INFO, f"Checking the results from {results_file}...")
+
+        with open(results_file, "rb") as f:
+            result = pickle.load(f)
+
+        # Asserting the results are the correct type
+        assert isinstance(result, ModelArtifacts)
+
+        log(INFO, f"Result size (in bytes): {results_file.stat().st_size}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/training/single_table/config.yaml b/examples/training/single_table/config.yaml
index a0710316..7fc54556 100644
--- a/examples/training/single_table/config.yaml
+++ b/examples/training/single_table/config.yaml
@@ -1,7 +1,7 @@
 # Training example configuration
 # Base data directory (can be overridden from command line)
-base_data_dir: examples/training/data
-results_dir: examples/training/results
+base_data_dir: examples/training/single_table/data
+results_dir: examples/training/single_table/results
 
 diffusion_config:
   d_layers: [512, 1024, 1024, 1024, 1024, 512]
diff --git a/examples/training/single_table/run_training.py b/examples/training/single_table/run_training.py
index ae7c4ed0..e5dd7202 100644
--- a/examples/training/single_table/run_training.py
+++ b/examples/training/single_table/run_training.py
@@ -6,12 +6,16 @@
 from omegaconf import DictConfig
 
 from midst_toolkit.common.config import DiffusionConfig
-from midst_toolkit.common.logger import log
+from midst_toolkit.common.logger import TOOLKIT_LOGGER, log
 from midst_toolkit.common.variables import DEVICE
 from midst_toolkit.models.clavaddpm.data_loaders import load_tables
 from midst_toolkit.models.clavaddpm.train import ModelArtifacts, clava_training
 
 
+# Preventing some excessive logging
+TOOLKIT_LOGGER.setLevel(INFO)
+
+
 @hydra.main(config_path=".", config_name="config", version_base=None)
 def main(config: DictConfig) -> None:
     """
diff --git a/src/midst_toolkit/models/clavaddpm/clustering.py b/src/midst_toolkit/models/clavaddpm/clustering.py
index 646b6de3..5c383122 100644
--- a/src/midst_toolkit/models/clavaddpm/clustering.py
+++ b/src/midst_toolkit/models/clavaddpm/clustering.py
@@ -753,6 +753,10 @@ def _min_max_normalize_sklearn(matrix: np.ndarray) -> np.ndarray:
     Returns:
         Numpy array of the normalized data.
     """
+    if matrix.shape[1] == 0:
+        # If theree are no features to normalize, then no-op
+        return matrix
+
     scaler = MinMaxScaler(feature_range=(-1, 1))
     return scaler.fit_transform(matrix)
 

From 414892241c75790568160b4bed4c158854f5adb7 Mon Sep 17 00:00:00 2001
From: Marcelo Lotif <marcelo.lotif@vectorinstitute.ai>
Date: Wed, 12 Nov 2025 14:59:30 -0500
Subject: [PATCH 04/23] Adding test for the bug fix

---
 tests/unit/models/clavaddpm/test_clustering.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tests/unit/models/clavaddpm/test_clustering.py b/tests/unit/models/clavaddpm/test_clustering.py
index 8d684796..c4a8a77c 100644
--- a/tests/unit/models/clavaddpm/test_clustering.py
+++ b/tests/unit/models/clavaddpm/test_clustering.py
@@ -51,6 +51,16 @@ def test_min_max_normalize_sklearn() -> None:
     unset_all_random_seeds()
 
 
+def test_min_max_normalize_sklearn_empty_matrix() -> None:
+    set_all_random_seeds(42)
+
+    data_to_normalize = np.random.randint(0, 3, (5, 0))
+    normalized_data = _min_max_normalize_sklearn(data_to_normalize)
+    assert data_to_normalize is normalized_data
+
+    unset_all_random_seeds()
+
+
 def test_get_normalized_numerical_columns() -> None:
     set_all_random_seeds(42)
     child_data = np.random.randint(0, 3, (3, 3))

From f075e46dfffaa8f69beb4e10e5b97493d04fdc97 Mon Sep 17 00:00:00 2001
From: Marcelo Lotif <marcelo.lotif@vectorinstitute.ai>
Date: Wed, 12 Nov 2025 15:03:55 -0500
Subject: [PATCH 05/23] Better docstrings

---
 examples/training/multi_table/run_training.py  | 2 +-
 examples/training/single_table/run_training.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/training/multi_table/run_training.py b/examples/training/multi_table/run_training.py
index b0d1e2f8..6d5548a5 100644
--- a/examples/training/multi_table/run_training.py
+++ b/examples/training/multi_table/run_training.py
@@ -20,7 +20,7 @@
 @hydra.main(config_path=".", config_name="config", version_base=None)
 def main(config: DictConfig) -> None:
     """
-    Run the training pipeline.
+    Run the training pipeline for a multi-table diffusion model.
 
     It will load the config and then data from the `config.base_data_dir` folder,
     train the model and save the results in the `config.results_dir` folder.
diff --git a/examples/training/single_table/run_training.py b/examples/training/single_table/run_training.py
index e5dd7202..74897db7 100644
--- a/examples/training/single_table/run_training.py
+++ b/examples/training/single_table/run_training.py
@@ -19,7 +19,7 @@
 @hydra.main(config_path=".", config_name="config", version_base=None)
 def main(config: DictConfig) -> None:
     """
-    Run the training pipeline.
+    Run the training pipeline for a single-table diffusion model.
 
     It will load the config and then data from the `config.base_data_dir` folder,
     train the model and save the results in the `config.results_dir` folder.

From 7d890f90abe3920c60b9d85b4263872b79211c1c Mon Sep 17 00:00:00 2001
From: Marcelo Lotif <marcelo.lotif@vectorinstitute.ai>
Date: Wed, 12 Nov 2025 15:04:20 -0500
Subject: [PATCH 06/23] Fixing typo

---
 src/midst_toolkit/models/clavaddpm/clustering.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/midst_toolkit/models/clavaddpm/clustering.py b/src/midst_toolkit/models/clavaddpm/clustering.py
index 5c383122..1299d85b 100644
--- a/src/midst_toolkit/models/clavaddpm/clustering.py
+++ b/src/midst_toolkit/models/clavaddpm/clustering.py
@@ -754,7 +754,7 @@ def _min_max_normalize_sklearn(matrix: np.ndarray) -> np.ndarray:
         Numpy array of the normalized data.
     """
     if matrix.shape[1] == 0:
-        # If theree are no features to normalize, then no-op
+        # If there are no features to normalize, then no-op
         return matrix
 
     scaler = MinMaxScaler(feature_range=(-1, 1))

From 402e35d68af18cf489eabf0649df68c3a72e2c00 Mon Sep 17 00:00:00 2001
From: Marcelo Lotif <marcelo.lotif@vectorinstitute.ai>
Date: Wed, 12 Nov 2025 15:09:03 -0500
Subject: [PATCH 07/23] Fixing the config yaml link

---
 examples/training/multi_table/README.md  | 6 +++---
 examples/training/single_table/README.md | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/training/multi_table/README.md b/examples/training/multi_table/README.md
index 380b5c86..ddbec0ee 100644
--- a/examples/training/multi_table/README.md
+++ b/examples/training/multi_table/README.md
@@ -13,7 +13,7 @@ extract the files and place them in a `/data` folder in within this folder
 
 > [!NOTE]
 > If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute
-> of the [`config.yaml`](config.yaml) file.
+> of the [`config.yaml`](config.yaml) file.
 
 It will contain data for 8 tables: `account`, `card`, `client`, `disp`, `district`, `loan`, `order`,
 and `trans`. For each table there will be two files:
@@ -27,7 +27,7 @@ are associated with which other tables.
 
 ## Kicking off traning
 
-The [`config.yaml`] file contains the parameters for the training. Please take a look a them
+The (`config.yaml`)[config.yaml] file contains the parameters for the training. Please take a look a them
 beforee kicking off the training and edit them as necessary.
 
 To kick off training, simply run the command below from the project's root folder:
@@ -41,7 +41,7 @@ It will save the result files inside a `/results` folder within this folder
 
 > [!NOTE]
 > If you wish to change the save folder, you can do so by editing the `results_dir` attribute
-> of the [`config.yaml`](config.yaml) file.
+> of the [`config.yaml`](config.yaml) file.
 
 One of the results file is `/results/cluster_ckpt.pkl`, which will contain the results
 of the clustering step.
diff --git a/examples/training/single_table/README.md b/examples/training/single_table/README.md
index f521106f..4e9b6f9e 100644
--- a/examples/training/single_table/README.md
+++ b/examples/training/single_table/README.md
@@ -13,7 +13,7 @@ extract the files and place them in a `/data` folder in within this folder
 
 > [!NOTE]
 > If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute
-> of the [`config.yaml`](config.yaml) file.
+> of the [`config.yaml`](config.yaml) file.
 
 Here is a description of the files that have been extracted:
 - `trans.csv`: The training data. It consists of information about bank transactions and it
@@ -25,7 +25,7 @@ single-table example, it will only contain information about the `trans` table.
 
 ## Kicking off traning
 
-The [`config.yaml`] file contains the parameters for the training. Please take a look a them
+The (`config.yaml`)[config.yaml] file contains the parameters for the training. Please take a look a them
 beforee kicking off the training and edit them as necessary.
 
 To kick off training, simply run the command below from the project's root folder:
@@ -39,7 +39,7 @@ It will save the result files inside a `/results` folder within this folder
 
 > [!NOTE]
 > If you wish to change the save folder, you can do so by editing the `results_dir` attribute
-> of the [`config.yaml`](config.yaml) file.
+> of the [`config.yaml`](config.yaml) file.
 
 In the `/results/models/` folder, there will be a file called `None_trans_ckpt.pkl`,
 which is a pickle file containing the training results. You can load it using Python's

From 5fd0cc88f1d455ea2f7a09784a2047f79f27adca Mon Sep 17 00:00:00 2001
From: Marcelo Lotif <marcelo.lotif@vectorinstitute.ai>
Date: Wed, 12 Nov 2025 15:12:06 -0500
Subject: [PATCH 08/23] CR by coderabbit

---
 examples/training/multi_table/README.md  | 12 ++++++------
 examples/training/single_table/README.md | 10 +++++-----
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/examples/training/multi_table/README.md b/examples/training/multi_table/README.md
index ddbec0ee..a6c47238 100644
--- a/examples/training/multi_table/README.md
+++ b/examples/training/multi_table/README.md
@@ -1,6 +1,6 @@
 # Multi-Table Training Example
 
-This example will go over traning a multi-table diffusion model from the ground up using the
+This example will go over training a multi-table diffusion model from the ground up using the
 code in this toolkit.
 
 
@@ -25,10 +25,10 @@ Additionally, you will find one more file:
 are associated with which other tables.
 
 
-## Kicking off traning
+## Kicking off training
 
-The (`config.yaml`)[config.yaml] file contains the parameters for the training. Please take a look a them
-beforee kicking off the training and edit them as necessary.
+The [`config.yaml`](config.yaml) file contains the parameters for the training. Please take a
+look at them before kicking off the training and edit them as necessary.
 
 To kick off training, simply run the command below from the project's root folder:
 
@@ -47,7 +47,7 @@ One of the results file is `/results/cluster_ckpt.pkl`, which will contain the r
 of the clustering step.
 
 The other result files are in the `/results/models/` folder. They will be named after the
-table relations defined in `dataset_meta.json`. For eexample: for the `("client", "account")`
+table relations defined in `dataset_meta.json`. For example: for the `("client", "account")`
 relation, there will be a file called `client_account_ckpt.pkl`, which is a pickle file
 containing the training results. You can load it using Python's `pickle` and it will yield
 an instance of `midst_toolkit.models.clavaddpm.train.ModelArtifacts`, which contains the
@@ -57,7 +57,7 @@ trained diffusion model along with some additional metadata about the training p
 import pickle
 from midst_toolkit.models.clavaddpm.train import ModelArtifacts
 
-results_file = Path("examples/multi_table/results/models/client_account_ckpt.pkl")
+results_file = Path("examples/training/multi_table/results/models/client_account_ckpt.pkl")
 
  with open(results_file, "rb") as f:
     result = pickle.load(f)
diff --git a/examples/training/single_table/README.md b/examples/training/single_table/README.md
index 4e9b6f9e..ea78d325 100644
--- a/examples/training/single_table/README.md
+++ b/examples/training/single_table/README.md
@@ -1,6 +1,6 @@
 # Single-Table Training Example
 
-This example will go over traning a single-table diffusion model from the ground up using the
+This example will go over training a single-table diffusion model from the ground up using the
 code in this toolkit.
 
 
@@ -23,10 +23,10 @@ contains 20,000 data points.
 single-table example, it will only contain information about the `trans` table.
 
 
-## Kicking off traning
+## Kicking off training
 
-The (`config.yaml`)[config.yaml] file contains the parameters for the training. Please take a look a them
-beforee kicking off the training and edit them as necessary.
+The [`config.yaml`](config.yaml) file contains the parameters for the training. Please take a
+look at them before kicking off the training and edit them as necessary.
 
 To kick off training, simply run the command below from the project's root folder:
 
@@ -51,7 +51,7 @@ diffusion model along with some additional metadata about the training process:
 import pickle
 from midst_toolkit.models.clavaddpm.train import ModelArtifacts
 
-results_file = Path("examples/single_table/results/models/None_trans_ckpt.pkl")
+results_file = Path("examples/training/single_table/results/models/None_trans_ckpt.pkl")
 
  with open(results_file, "rb") as f:
     result = pickle.load(f)

From e9d385b4075b36fcd9784553123fd63b7b6140ee Mon Sep 17 00:00:00 2001
From: Marcelo Lotif <marcelo.lotif@vectorinstitute.ai>
Date: Wed, 12 Nov 2025 15:09:03 -0500
Subject: [PATCH 09/23] Fixing the config yaml link

---
 examples/training/multi_table/README.md  | 6 +++---
 examples/training/single_table/README.md | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/training/multi_table/README.md b/examples/training/multi_table/README.md
index 380b5c86..ddbec0ee 100644
--- a/examples/training/multi_table/README.md
+++ b/examples/training/multi_table/README.md
@@ -13,7 +13,7 @@ extract the files and place them in a `/data` folder in within this folder
 
 > [!NOTE]
 > If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute
-> of the [`config.yaml`](config.yaml) file.
+> of the [`config.yaml`](config.yaml) file.
 
 It will contain data for 8 tables: `account`, `card`, `client`, `disp`, `district`, `loan`, `order`,
 and `trans`. For each table there will be two files:
@@ -27,7 +27,7 @@ are associated with which other tables.
 
 ## Kicking off traning
 
-The [`config.yaml`] file contains the parameters for the training. Please take a look a them
+The [`config.yaml`](config.yaml) file contains the parameters for the training. Please take a look at them
 beforee kicking off the training and edit them as necessary.
 
 To kick off training, simply run the command below from the project's root folder:
@@ -41,7 +41,7 @@ It will save the result files inside a `/results` folder within this folder
 
 > [!NOTE]
 > If you wish to change the save folder, you can do so by editing the `results_dir` attribute
-> of the [`config.yaml`](config.yaml) file.
+> of the (`config.yaml`)[config.yaml] file.
 
 One of the results file is `/results/cluster_ckpt.pkl`, which will contain the results
 of the clustering step.
diff --git a/examples/training/single_table/README.md b/examples/training/single_table/README.md
index f521106f..4e9b6f9e 100644
--- a/examples/training/single_table/README.md
+++ b/examples/training/single_table/README.md
@@ -13,7 +13,7 @@ extract the files and place them in a `/data` folder in within this folder
 
 > [!NOTE]
 > If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute
-> of the [`config.yaml`](config.yaml) file.
+> of the (`config.yaml`)[config.yaml] file.
 
 Here is a description of the files that have been extracted:
 - `trans.csv`: The training data. It consists of information about bank transactions and it
@@ -25,7 +25,7 @@ single-table example, it will only contain information about the `trans` table.
 
 ## Kicking off traning
 
-The [`config.yaml`] file contains the parameters for the training. Please take a look a them
+The (`config.yaml`)[config.yaml] file contains the parameters for the training. Please take a look a them
 beforee kicking off the training and edit them as necessary.
 
 To kick off training, simply run the command below from the project's root folder:
@@ -39,7 +39,7 @@ It will save the result files inside a `/results` folder within this folder
 
 > [!NOTE]
 > If you wish to change the save folder, you can do so by editing the `results_dir` attribute
-> of the [`config.yaml`](config.yaml) file.
+> of the (`config.yaml`)[config.yaml] file.
 
 In the `/results/models/` folder, there will be a file called `None_trans_ckpt.pkl`,
 which is a pickle file containing the training results. You can load it using Python's

From ac1a885d9f765fd160a96097bf3becf56e385318 Mon Sep 17 00:00:00 2001
From: Marcelo Lotif <marcelo.lotif@vectorinstitute.ai>
Date: Wed, 12 Nov 2025 15:12:06 -0500
Subject: [PATCH 10/23] CR by coderabbit

---
 examples/training/multi_table/README.md  | 12 ++++++------
 examples/training/single_table/README.md | 10 +++++-----
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/examples/training/multi_table/README.md b/examples/training/multi_table/README.md
index ddbec0ee..a6c47238 100644
--- a/examples/training/multi_table/README.md
+++ b/examples/training/multi_table/README.md
@@ -1,6 +1,6 @@
 # Multi-Table Training Example
 
-This example will go over traning a multi-table diffusion model from the ground up using the
+This example will go over training a multi-table diffusion model from the ground up using the
 code in this toolkit.
 
 
@@ -25,10 +25,10 @@ Additionally, you will find one more file:
 are associated with which other tables.
 
 
-## Kicking off traning
+## Kicking off training
 
-The (`config.yaml`)[config.yaml] file contains the parameters for the training. Please take a look a them
-beforee kicking off the training and edit them as necessary.
+The (`config.yaml`)[config.yaml] file contains the parameters for the training. Please take a
+look at them before kicking off the training and edit them as necessary.
 
 To kick off training, simply run the command below from the project's root folder:
 
@@ -47,7 +47,7 @@ One of the results file is `/results/cluster_ckpt.pkl`, which will contain the r
 of the clustering step.
 
 The other result files are in the `/results/models/` folder. They will be named after the
-table relations defined in `dataset_meta.json`. For eexample: for the `("client", "account")`
+table relations defined in `dataset_meta.json`. For example: for the `("client", "account")`
 relation, there will be a file called `client_account_ckpt.pkl`, which is a pickle file
 containing the training results. You can load it using Python's `pickle` and it will yield
 an instance of `midst_toolkit.models.clavaddpm.train.ModelArtifacts`, which contains the
@@ -57,7 +57,7 @@ trained diffusion model along with some additional metadata about the training p
 import pickle
 from midst_toolkit.models.clavaddpm.train import ModelArtifacts
 
-results_file = Path("examples/multi_table/results/models/client_account_ckpt.pkl")
+results_file = Path("examples/training/multi_table/results/models/client_account_ckpt.pkl")
 
  with open(results_file, "rb") as f:
     result = pickle.load(f)
diff --git a/examples/training/single_table/README.md b/examples/training/single_table/README.md
index 4e9b6f9e..ea78d325 100644
--- a/examples/training/single_table/README.md
+++ b/examples/training/single_table/README.md
@@ -1,6 +1,6 @@
 # Single-Table Training Example
 
-This example will go over traning a single-table diffusion model from the ground up using the
+This example will go over training a single-table diffusion model from the ground up using the
 code in this toolkit.
 
 
@@ -23,10 +23,10 @@ contains 20,000 data points.
 single-table example, it will only contain information about the `trans` table.
 
 
-## Kicking off traning
+## Kicking off training
 
-The (`config.yaml`)[config.yaml] file contains the parameters for the training. Please take a look a them
-beforee kicking off the training and edit them as necessary.
+The (`config.yaml`)[config.yaml] file contains the parameters for the training. Please take a
+look at them before kicking off the training and edit them as necessary.
 
 To kick off training, simply run the command below from the project's root folder:
 
@@ -51,7 +51,7 @@ diffusion model along with some additional metadata about the training process:
 import pickle
 from midst_toolkit.models.clavaddpm.train import ModelArtifacts
 
-results_file = Path("examples/single_table/results/models/None_trans_ckpt.pkl")
+results_file = Path("examples/training/single_table/results/models/None_trans_ckpt.pkl")
 
  with open(results_file, "rb") as f:
     result = pickle.load(f)

From c904e9bf401333d21db120c541c3739d4dcc8aeb Mon Sep 17 00:00:00 2001
From: Marcelo Lotif <marcelo.lotif@vectorinstitute.ai>
Date: Wed, 12 Nov 2025 15:17:57 -0500
Subject: [PATCH 11/23] Actually fixing the config file links

---
 examples/training/multi_table/README.md  | 6 +++---
 examples/training/single_table/README.md | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/training/multi_table/README.md b/examples/training/multi_table/README.md
index a6c47238..c2fd173d 100644
--- a/examples/training/multi_table/README.md
+++ b/examples/training/multi_table/README.md
@@ -13,7 +13,7 @@ extract the files and place them in a `/data` folder in within this folder
 
 > [!NOTE]
 > If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute
-> of the (`config.yaml`)[config.yaml] file.
+> of the [`config.yaml`](config.yaml) file.
 
 It will contain data for 8 tables: `account`, `card`, `client`, `disp`, `district`, `loan`, `order`,
 and `trans`. For each table there will be two files:
@@ -27,7 +27,7 @@ are associated with which other tables.
 
 ## Kicking off training
 
-The (`config.yaml`)[config.yaml] file contains the parameters for the training. Please take a
+The [`config.yaml`](config.yaml) file contains the parameters for the training. Please take a
 look at them before kicking off the training and edit them as necessary.
 
 To kick off training, simply run the command below from the project's root folder:
@@ -41,7 +41,7 @@ It will save the result files inside a `/results` folder within this folder
 
 > [!NOTE]
 > If you wish to change the save folder, you can do so by editing the `results_dir` attribute
-> of the (`config.yaml`)[config.yaml] file.
+> of the [`config.yaml`](config.yaml) file.
 
 One of the results file is `/results/cluster_ckpt.pkl`, which will contain the results
 of the clustering step.
diff --git a/examples/training/single_table/README.md b/examples/training/single_table/README.md
index ea78d325..50b162c7 100644
--- a/examples/training/single_table/README.md
+++ b/examples/training/single_table/README.md
@@ -13,7 +13,7 @@ extract the files and place them in a `/data` folder in within this folder
 
 > [!NOTE]
 > If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute
-> of the (`config.yaml`)[config.yaml] file.
+> of the [`config.yaml`](config.yaml) file.
 
 Here is a description of the files that have been extracted:
 - `trans.csv`: The training data. It consists of information about bank transactions and it
@@ -25,7 +25,7 @@ single-table example, it will only contain information about the `trans` table.
 
 ## Kicking off training
 
-The (`config.yaml`)[config.yaml] file contains the parameters for the training. Please take a
+The [`config.yaml`](config.yaml) file contains the parameters for the training. Please take a
 look at them before kicking off the training and edit them as necessary.
 
 To kick off training, simply run the command below from the project's root folder:
@@ -39,7 +39,7 @@ It will save the result files inside a `/results` folder within this folder
 
 > [!NOTE]
 > If you wish to change the save folder, you can do so by editing the `results_dir` attribute
-> of the (`config.yaml`)[config.yaml] file.
+> of the [`config.yaml`](config.yaml) file.
 
 In the `/results/models/` folder, there will be a file called `None_trans_ckpt.pkl`,
 which is a pickle file containing the training results. You can load it using Python's

From 3d0b9c6c8afcf629dcedf4a5ccb5a7260a26e38c Mon Sep 17 00:00:00 2001
From: Marcelo Lotif <marcelo.lotif@vectorinstitute.ai>
Date: Thu, 13 Nov 2025 11:44:24 -0500
Subject: [PATCH 12/23] Synthesizing single table first files

---
 .../synthesizing/single_table/config.yaml     | 34 +++++++++++
 .../single_table/run_synthesizing.py          | 58 +++++++++++++++++++
 examples/training/multi_table/config.yaml     |  8 +--
 3 files changed, 96 insertions(+), 4 deletions(-)
 create mode 100644 examples/synthesizing/single_table/config.yaml
 create mode 100644 examples/synthesizing/single_table/run_synthesizing.py

diff --git a/examples/synthesizing/single_table/config.yaml b/examples/synthesizing/single_table/config.yaml
new file mode 100644
index 00000000..492c43c6
--- /dev/null
+++ b/examples/synthesizing/single_table/config.yaml
@@ -0,0 +1,34 @@
+# Training example configuration
+# Base data directory (can be overridden from command line)
+base_data_dir: examples/training/single_table/data
+results_dir: examples/training/single_table/results
+
+diffusion_config:
+  d_layers: [512, 1024, 1024, 1024, 1024, 512]
+  dropout: 0.0
+  num_timesteps: 2000
+  model_type: mlp
+  iterations: 200000
+  batch_size: 4096
+  lr: 0.0006
+  gaussian_loss_type: mse
+  weight_decay: 1e-05
+  scheduler: cosine
+  data_split_ratios: [0.99, 0.005, 0.005]
+
+general_config:
+    data_dir: examples/training/single_table/data
+    test_data_dir: examples/training/single_table/data
+    exp_name: single_table_synthesizing
+    workspace_dir: examples/training/single_table/results
+    sample_prefix: ""
+
+sampling_config:
+    batch_size: 20000
+    classifier_scale: 1.0
+
+matching_config:
+    num_matching_clusters: 1
+    matching_batch_size: 1000
+    unique_matching: True
+    no_matching: False
diff --git a/examples/synthesizing/single_table/run_synthesizing.py b/examples/synthesizing/single_table/run_synthesizing.py
new file mode 100644
index 00000000..6a7f1a8e
--- /dev/null
+++ b/examples/synthesizing/single_table/run_synthesizing.py
@@ -0,0 +1,58 @@
+import pickle
+from logging import INFO
+from pathlib import Path
+
+import hydra
+from omegaconf import DictConfig
+
+from midst_toolkit.common.config import DiffusionConfig
+from midst_toolkit.common.logger import TOOLKIT_LOGGER, log
+from midst_toolkit.common.variables import DEVICE
+from midst_toolkit.models.clavaddpm.data_loaders import load_tables
+from midst_toolkit.models.clavaddpm.train import ModelArtifacts, clava_training
+
+
+# Preventing some excessive logging
+TOOLKIT_LOGGER.setLevel(INFO)
+
+
+@hydra.main(config_path=".", config_name="config", version_base=None)
+def main(config: DictConfig) -> None:
+    """
+    Run the synthesizing pipeline for a single-table diffusion model.
+
+    It will load the config and then data from the `config.base_data_dir` folder,
+    train the model and save the results in the `config.results_dir` folder.
+
+    Args:
+        config: Training configuration as an OmegaConf DictConfig object.
+    """
+    log(INFO, f"Loading data from {config.base_data_dir}...")
+    tables, relation_order, _ = load_tables(Path(config.base_data_dir))
+
+    log(INFO, "Training model...")
+    diffusion_config = DiffusionConfig(**config.diffusion_config)
+
+    tables, _ = clava_training(
+        tables,
+        relation_order,
+        Path(config.results_dir),
+        diffusion_config,
+        device=DEVICE,
+    )
+    log(INFO, "Model trained successfully.")
+
+    results_file = Path(config.results_dir) / "models" / "None_trans_ckpt.pkl"
+    log(INFO, f"Checking the results from {results_file}...")
+
+    with open(results_file, "rb") as f:
+        result = pickle.load(f)
+
+    # Asserting the results are the correct type
+    assert isinstance(result, ModelArtifacts)
+
+    log(INFO, f"Result size (in bytes): {results_file.stat().st_size}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/training/multi_table/config.yaml b/examples/training/multi_table/config.yaml
index 6d5e701d..c63e7aaa 100644
--- a/examples/training/multi_table/config.yaml
+++ b/examples/training/multi_table/config.yaml
@@ -6,10 +6,10 @@ results_dir: examples/training/multi_table/results
 diffusion_config:
   d_layers: [512, 1024, 1024, 1024, 1024, 512]
   dropout: 0.0
-  num_timesteps: 2000
+  num_timesteps: 100
   model_type: mlp
-  iterations: 20000
-  batch_size: 4096
+  iterations: 1000
+  batch_size: 24
   lr: 0.0006
   gaussian_loss_type: mse
   weight_decay: 1e-05
@@ -26,4 +26,4 @@ classifier_config:
     lr: 0.0001
     dim_t: 128
     batch_size: 4096
-    iterations: 20000
+    iterations: 1000

From baa9824b5135ad30585dcaf2e4dc3ae3663ab6f4 Mon Sep 17 00:00:00 2001
From: Marcelo Lotif <marcelo.lotif@vectorinstitute.ai>
Date: Thu, 13 Nov 2025 13:50:13 -0500
Subject: [PATCH 13/23] Finishing the synthesizer single table example

---
 .gitignore                                    |  4 ++
 examples/synthesizing/single_table/README.md  | 58 +++++++++++++++++++
 .../synthesizing/single_table/config.yaml     | 10 ++--
 .../single_table/run_synthesizing.py          | 54 ++++++++++-------
 .../attacks/ensemble/shadow_model_utils.py    |  4 +-
 .../models/clavaddpm/data_loaders.py          |  7 ++-
 .../models/clavaddpm/synthesizer.py           | 12 +++-
 .../models/clavaddpm/test_synthesizer.py      |  2 +-
 8 files changed, 118 insertions(+), 33 deletions(-)
 create mode 100644 examples/synthesizing/single_table/README.md

diff --git a/.gitignore b/.gitignore
index 611e0bc1..9c5092f2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -47,3 +47,7 @@ examples/training/single_table/data/**
 examples/training/single_table/results/**
 examples/training/multi_table/data/**
 examples/training/multi_table/results/**
+examples/synthesizing/single_table/data/**
+examples/synthesizing/single_table/results/**
+examples/synthesizing/multi_table/data/**
+examples/synthesizing/multi_table/results/**
diff --git a/examples/synthesizing/single_table/README.md b/examples/synthesizing/single_table/README.md
new file mode 100644
index 00000000..15c1bca8
--- /dev/null
+++ b/examples/synthesizing/single_table/README.md
@@ -0,0 +1,58 @@
+# Single-Table Synthesizing Example
+
+This example will go over synthesizing data for a single-table dataset from the ground
+up using the code in this toolkit.
+
+
+## Downloading data
+
+First, we need the data. Download it from this
+[Google Drive link](https://drive.google.com/file/d/1J5qDuMHHg4dm9c3ISmb41tcTHSu1SVUC/view?usp=drive_link),
+extract the files and place them in a `/data` folder within this folder
+(`examples/synthesizing/single_table`).
+
+> [!NOTE]
+> If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute
+> of the [`config.yaml`](config.yaml) file.
+
+Here is a description of the files that have been extracted:
+- `trans.csv`: The training data. It consists of information about bank transactions and it
+contains 20,000 data points.
+- `trans_domain.json`: Metadata about the columns in `trans.csv`, such as data types and sizes.
+- `dataset_meta.json`: Metadata about the relationship between the tables. Since this is a
+single-table example, it will only contain information about the `trans` table.
+
+
+## Kicking off synthesizing
+
+If there is a `/results` folder within this folder (`examples/synthesizing/single_table`)
+from a previous training run, we will use that data to kick off synthesizing.
+For example, you can copy the results from another run (e.g. `examples.training.single_table.run_training`)
+and paste them here and it will be picked up by this example.
+
+The [`config.yaml`](config.yaml) file contains the parameters for the synthesizing and also
+for training, in case there is a need to run that. Please take a look at them before kicking
+off the synthesizing process and edit them as necessary.
+
+To kick off synthesizing, simply run the command below from the project's root folder:
+
+```bash
+python -m examples.synthesizing.single_table.run_synthesizing
+```
+
+## Results
+
+It will save the result files inside a `/results` folder within this folder
+(`examples/synthesizing/single_table`).
+
+> [!NOTE]
+> If you wish to change the save folder, you can do so by editing the `results_dir` attribute
+> of the [`config.yaml`](config.yaml) file.
+
+In the `/results/before_matching/` folder, there will be a file called `synthetic_tables.pkl`,
+which is a pickle file containing the synthetic data before the matching process, in case
+it's needed.
+
+The `/results/single_table_synthesizing` folder will contain the final synthesized
+data, organized per table. In this single-table example, there is only going to be one
+synthesized table under `/results/single_table_synthesizing/trans/_final/trans_synthetic.csv`.
diff --git a/examples/synthesizing/single_table/config.yaml b/examples/synthesizing/single_table/config.yaml
index 492c43c6..662a4fe1 100644
--- a/examples/synthesizing/single_table/config.yaml
+++ b/examples/synthesizing/single_table/config.yaml
@@ -1,7 +1,7 @@
 # Training example configuration
 # Base data directory (can be overridden from command line)
-base_data_dir: examples/training/single_table/data
-results_dir: examples/training/single_table/results
+base_data_dir: examples/synthesizing/single_table/data
+results_dir: examples/synthesizing/single_table/results
 
 diffusion_config:
   d_layers: [512, 1024, 1024, 1024, 1024, 512]
@@ -17,10 +17,10 @@ diffusion_config:
   data_split_ratios: [0.99, 0.005, 0.005]
 
 general_config:
-    data_dir: examples/training/single_table/data
-    test_data_dir: examples/training/single_table/data
+    data_dir: examples/synthesizing/single_table/data
+    test_data_dir: examples/synthesizing/single_table/data
     exp_name: single_table_synthesizing
-    workspace_dir: examples/training/single_table/results
+    workspace_dir: examples/synthesizing/single_table/results
     sample_prefix: ""
 
 sampling_config:
diff --git a/examples/synthesizing/single_table/run_synthesizing.py b/examples/synthesizing/single_table/run_synthesizing.py
index 6a7f1a8e..4fb5ce4b 100644
--- a/examples/synthesizing/single_table/run_synthesizing.py
+++ b/examples/synthesizing/single_table/run_synthesizing.py
@@ -5,11 +5,11 @@
 import hydra
 from omegaconf import DictConfig
 
-from midst_toolkit.common.config import DiffusionConfig
+from examples.training.single_table import run_training
+from midst_toolkit.common.config import GeneralConfig, MatchingConfig, SamplingConfig
 from midst_toolkit.common.logger import TOOLKIT_LOGGER, log
-from midst_toolkit.common.variables import DEVICE
 from midst_toolkit.models.clavaddpm.data_loaders import load_tables
-from midst_toolkit.models.clavaddpm.train import ModelArtifacts, clava_training
+from midst_toolkit.models.clavaddpm.synthesizer import clava_synthesizing
 
 
 # Preventing some excessive logging
@@ -22,36 +22,48 @@ def main(config: DictConfig) -> None:
     Run the synthesizing pipeline for a single-table diffusion model.
 
     It will load the config and then data from the `config.base_data_dir` folder,
-    train the model and save the results in the `config.results_dir` folder.
+    train the model, synthesize the data and save the results in the
+    `config.results_dir` folder.
+
+    It will first look for a pre-trained model in the `config.results_dir` folder.
+    If it doesn't find one, it will train a new model from scratch.
 
     Args:
         config: Training configuration as an OmegaConf DictConfig object.
     """
-    log(INFO, f"Loading data from {config.base_data_dir}...")
+    log(INFO, f"Checking for a pre-trained model in {config.results_dir}...")
+
     tables, relation_order, _ = load_tables(Path(config.base_data_dir))
 
-    log(INFO, "Training model...")
-    diffusion_config = DiffusionConfig(**config.diffusion_config)
+    model_file_paths = {}
+    for relation in relation_order:
+        model_file_path = Path(config.results_dir) / "models" / f"{relation[0]}_{relation[1]}_ckpt.pkl"
+        model_file_paths[relation] = model_file_path
+
+    if all(model_file.exists() for model_file in model_file_paths.values()):
+        log(INFO, f"Found a pre-trained models in {config.results_dir}. Skipping training.")
+    else:
+        log(INFO, "No pre-trained models found, training a new model from scratch...")
+        run_training.main(config)
+
+    log(INFO, "Synthesizing data...")
 
-    tables, _ = clava_training(
+    models = {}
+    for relation in relation_order:
+        with open(model_file_paths[relation], "rb") as f:
+            models[relation] = pickle.load(f)
+
+    clava_synthesizing(
         tables,
         relation_order,
         Path(config.results_dir),
-        diffusion_config,
-        device=DEVICE,
+        models,
+        GeneralConfig(**config.general_config),
+        SamplingConfig(**config.sampling_config),
+        MatchingConfig(**config.matching_config),
     )
-    log(INFO, "Model trained successfully.")
-
-    results_file = Path(config.results_dir) / "models" / "None_trans_ckpt.pkl"
-    log(INFO, f"Checking the results from {results_file}...")
-
-    with open(results_file, "rb") as f:
-        result = pickle.load(f)
-
-    # Asserting the results are the correct type
-    assert isinstance(result, ModelArtifacts)
 
-    log(INFO, f"Result size (in bytes): {results_file.stat().st_size}")
+    log(INFO, "Data synthesized successfully.")
 
 
 if __name__ == "__main__":
diff --git a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py
index e8604e22..63b57729 100644
--- a/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py
+++ b/src/midst_toolkit/attacks/ensemble/shadow_model_utils.py
@@ -142,11 +142,11 @@ def train_tabddpm_and_synthesize(
             tables,
             relation_order,
             save_dir,
-            all_group_lengths_prob_dicts,
             models,
             configs.general,
             configs.sampling,
             configs.matching,
+            all_group_lengths_prob_dicts,
             sample_scale=sample_scale,
         )
 
@@ -232,11 +232,11 @@ def fine_tune_tabddpm_and_synthesize(
             new_tables,
             relation_order,
             save_dir,
-            all_group_lengths_prob_dicts,
             new_models,
             configs.general,
             configs.sampling,
             configs.matching,
+            all_group_lengths_prob_dicts,
             sample_scale=sample_scale,
         )
 
diff --git a/src/midst_toolkit/models/clavaddpm/data_loaders.py b/src/midst_toolkit/models/clavaddpm/data_loaders.py
index 008e0d6c..c9e09cce 100644
--- a/src/midst_toolkit/models/clavaddpm/data_loaders.py
+++ b/src/midst_toolkit/models/clavaddpm/data_loaders.py
@@ -96,7 +96,7 @@ def load_tables(
     with open(data_dir / "dataset_meta.json", "r") as f:
         dataset_meta = json.load(f)
 
-    relation_order = dataset_meta["relation_order"]
+    relation_order = [tuple(relation) for relation in dataset_meta["relation_order"]]
 
     tables = {}
 
@@ -125,6 +125,11 @@ def load_tables(
             info=info,
         )
 
+    # Adding the no parent placeholder column in the tables with no parent
+    for parent, child in relation_order:
+        if parent is None:
+            tables[child].data[NO_PARENT_COLUMN_NAME] = list(range(len(tables[child].data)))
+
     return tables, relation_order, dataset_meta
 
 
diff --git a/src/midst_toolkit/models/clavaddpm/synthesizer.py b/src/midst_toolkit/models/clavaddpm/synthesizer.py
index 2204ec52..cbbdddc1 100644
--- a/src/midst_toolkit/models/clavaddpm/synthesizer.py
+++ b/src/midst_toolkit/models/clavaddpm/synthesizer.py
@@ -711,11 +711,11 @@ def clava_synthesizing(
     tables: Tables,
     relation_order: RelationOrder,
     save_dir: Path,
-    all_group_lengths_prob_dicts: GroupLengthsProbDicts,
     models: dict[Relation, ModelArtifacts],
     general_config: GeneralConfig,
     sampling_config: SamplingConfig,
     matching_config: MatchingConfig,
+    all_group_lengths_prob_dicts: GroupLengthsProbDicts | None = None,
     sample_scale: float = 1.0,
 ) -> tuple[dict[str, pd.DataFrame], float, float]:
     """
@@ -726,12 +726,13 @@ def clava_synthesizing(
         tables: Tables containing dataframes and clustering information.
         relation_order: List of parent-child table relationships.
         save_dir: Directory to save intermediate and final results.
-        all_group_lengths_prob_dicts: Dictionary containing group length probabilities for each
-            parent-child relationship.
         models: Trained models for each parent-child relationship.
         general_config: General configuration settings.
         sampling_config: Configuration settings for sampling.
         matching_config: Configuration settings for matching.
+        all_group_lengths_prob_dicts: Dictionary containing group length probabilities for each
+            parent-child relationship. Optional for single-table synthesizing, required for
+            multi-table synthesizing. Defaults to None.
         sample_scale: Scale factor for the number of samples to generate
             based on the train data size. Defaults to 1.0.
 
@@ -762,7 +763,12 @@ def clava_synthesizing(
                 sample_scale,
                 sampling_config.batch_size,
             )
+
         else:
+            assert all_group_lengths_prob_dicts is not None, (
+                "all_group_lengths_prob_dicts is required for multi-table synthesizing."
+            )
+
             # Finding previously synthesized data and training results for the parent
             parent_synthetic_data = None
             parent_training_results = None
diff --git a/tests/integration/models/clavaddpm/test_synthesizer.py b/tests/integration/models/clavaddpm/test_synthesizer.py
index 267160df..ae88477e 100644
--- a/tests/integration/models/clavaddpm/test_synthesizer.py
+++ b/tests/integration/models/clavaddpm/test_synthesizer.py
@@ -92,11 +92,11 @@ def test_clava_synthesize_multi_table(tmp_path: Path):
         tables,
         relation_order,
         tmp_path,
-        all_group_lengths_prob_dicts,
         models[1],
         synthesizing_config,
         SAMPLING_CONFIG,
         MATCHING_CONFIG,
+        all_group_lengths_prob_dicts,
     )
 
     # Assert

From d480dd9bc4dd88b891eda9ff49bd205f50cddda3 Mon Sep 17 00:00:00 2001
From: Marcelo Lotif <marcelo.lotif@vectorinstitute.ai>
Date: Thu, 13 Nov 2025 13:52:25 -0500
Subject: [PATCH 14/23] Small tweak in the readmes

---
 examples/training/multi_table/README.md  | 5 ++++-
 examples/training/single_table/README.md | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/examples/training/multi_table/README.md b/examples/training/multi_table/README.md
index c2fd173d..31791112 100644
--- a/examples/training/multi_table/README.md
+++ b/examples/training/multi_table/README.md
@@ -36,7 +36,10 @@ To kick off training, simply run the command below from the project's root folde
 python -m examples.training.multi_table.run_training
 ```
 
-It will save the result files inside a `/results` folder within this folder
+
+## Results
+
+The result files will be saved inside a `/results` folder within this folder
 (`examples/training/multi_table`).
 
 > [!NOTE]
diff --git a/examples/training/single_table/README.md b/examples/training/single_table/README.md
index 50b162c7..ac6fa12b 100644
--- a/examples/training/single_table/README.md
+++ b/examples/training/single_table/README.md
@@ -34,7 +34,10 @@ To kick off training, simply run the command below from the project's root folde
 python -m examples.training.single_table.run_training
 ```
 
-It will save the result files inside a `/results` folder within this folder
+
+## Results
+
+The result files will be saved inside a `/results` folder within this folder
 (`examples/training/single_table`).
 
 > [!NOTE]

From e81aa91e42512cfe40b4f8537858d7346b3e6662 Mon Sep 17 00:00:00 2001
From: Marcelo Lotif <marcelo.lotif@vectorinstitute.ai>
Date: Thu, 13 Nov 2025 13:58:48 -0500
Subject: [PATCH 15/23] Final synthesizer config

---
 examples/synthesizing/multi_table/config.yaml | 46 +++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 examples/synthesizing/multi_table/config.yaml

diff --git a/examples/synthesizing/multi_table/config.yaml b/examples/synthesizing/multi_table/config.yaml
new file mode 100644
index 00000000..f9d77bbc
--- /dev/null
+++ b/examples/synthesizing/multi_table/config.yaml
@@ -0,0 +1,46 @@
+# Synthesizing example configuration
+# Base data directory (can be overridden from command line)
+base_data_dir: examples/synthesizing/single_table/data
+results_dir: examples/synthesizing/single_table/results
+
+diffusion_config:
+  d_layers: [512, 1024, 1024, 1024, 1024, 512]
+  dropout: 0.0
+  num_timesteps: 100
+  model_type: mlp
+  iterations: 1000
+  batch_size: 24
+  lr: 0.0006
+  gaussian_loss_type: mse
+  weight_decay: 1e-05
+  scheduler: cosine
+  data_split_ratios: [0.99, 0.005, 0.005]
+
+clustering_config:
+  parent_scale: 1.0
+  num_clusters: 50
+  clustering_method: kmeans_and_gmm
+
+classifier_config:
+    d_layers: [128, 256, 512, 1024, 512, 256, 128]
+    lr: 0.0001
+    dim_t: 128
+    batch_size: 4096
+    iterations: 1000
+
+general_config:
+    data_dir: examples/synthesizing/single_table/data
+    test_data_dir: examples/synthesizing/single_table/data
+    exp_name: single_table_synthesizing
+    workspace_dir: examples/synthesizing/single_table/results
+    sample_prefix: ""
+
+sampling_config:
+    batch_size: 20000
+    classifier_scale: 1.0
+
+matching_config:
+    num_matching_clusters: 1
+    matching_batch_size: 1000
+    unique_matching: True
+    no_matching: False

From 1c55e91a508add28d975549ae9dde35c9181e0d3 Mon Sep 17 00:00:00 2001
From: Marcelo Lotif <marcelo.lotif@vectorinstitute.ai>
Date: Thu, 13 Nov 2025 14:00:32 -0500
Subject: [PATCH 16/23] actual final configs

---
 examples/synthesizing/multi_table/config.yaml | 8 ++++----
 examples/training/multi_table/config.yaml     | 8 ++++----
 examples/training/single_table/config.yaml    | 2 +-
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/examples/synthesizing/multi_table/config.yaml b/examples/synthesizing/multi_table/config.yaml
index f9d77bbc..cc4cd2cc 100644
--- a/examples/synthesizing/multi_table/config.yaml
+++ b/examples/synthesizing/multi_table/config.yaml
@@ -6,10 +6,10 @@ results_dir: examples/synthesizing/single_table/results
 diffusion_config:
   d_layers: [512, 1024, 1024, 1024, 1024, 512]
   dropout: 0.0
-  num_timesteps: 100
+  num_timesteps: 2000
   model_type: mlp
-  iterations: 1000
-  batch_size: 24
+  iterations: 20000
+  batch_size: 4096
   lr: 0.0006
   gaussian_loss_type: mse
   weight_decay: 1e-05
@@ -26,7 +26,7 @@ classifier_config:
     lr: 0.0001
     dim_t: 128
     batch_size: 4096
-    iterations: 1000
+    iterations: 20000
 
 general_config:
     data_dir: examples/synthesizing/single_table/data
diff --git a/examples/training/multi_table/config.yaml b/examples/training/multi_table/config.yaml
index c63e7aaa..6d5e701d 100644
--- a/examples/training/multi_table/config.yaml
+++ b/examples/training/multi_table/config.yaml
@@ -6,10 +6,10 @@ results_dir: examples/training/multi_table/results
 diffusion_config:
   d_layers: [512, 1024, 1024, 1024, 1024, 512]
   dropout: 0.0
-  num_timesteps: 100
+  num_timesteps: 2000
   model_type: mlp
-  iterations: 1000
-  batch_size: 24
+  iterations: 20000
+  batch_size: 4096
   lr: 0.0006
   gaussian_loss_type: mse
   weight_decay: 1e-05
@@ -26,4 +26,4 @@ classifier_config:
     lr: 0.0001
     dim_t: 128
     batch_size: 4096
-    iterations: 1000
+    iterations: 20000
diff --git a/examples/training/single_table/config.yaml b/examples/training/single_table/config.yaml
index 7fc54556..719be647 100644
--- a/examples/training/single_table/config.yaml
+++ b/examples/training/single_table/config.yaml
@@ -8,7 +8,7 @@ diffusion_config:
   dropout: 0.0
   num_timesteps: 2000
   model_type: mlp
-  iterations: 200000
+  iterations: 20000
   batch_size: 4096
   lr: 0.0006
   gaussian_loss_type: mse

From 37ebe7dfcccb31a7088b1fc65f1a5b41c40877fb Mon Sep 17 00:00:00 2001
From: Marcelo Lotif <marcelo.lotif@vectorinstitute.ai>
Date: Thu, 13 Nov 2025 14:01:44 -0500
Subject: [PATCH 17/23] removing one extra zero

---
 examples/synthesizing/single_table/config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/synthesizing/single_table/config.yaml b/examples/synthesizing/single_table/config.yaml
index 662a4fe1..de21a680 100644
--- a/examples/synthesizing/single_table/config.yaml
+++ b/examples/synthesizing/single_table/config.yaml
@@ -8,7 +8,7 @@ diffusion_config:
   dropout: 0.0
   num_timesteps: 2000
   model_type: mlp
-  iterations: 200000
+  iterations: 20000
   batch_size: 4096
   lr: 0.0006
   gaussian_loss_type: mse

From 8974cfe8a1b7a7f4b5ab0f1a3877c32d09f0a6cd Mon Sep 17 00:00:00 2001
From: Marcelo Lotif <marcelo.lotif@vectorinstitute.ai>
Date: Thu, 13 Nov 2025 14:59:22 -0500
Subject: [PATCH 18/23] finishing the synthesizer example code

---
 examples/synthesizing/multi_table/README.md   | 60 ++++++++++++++
 examples/synthesizing/multi_table/config.yaml | 12 +--
 .../multi_table/run_synthesizing.py           | 81 +++++++++++++++++++
 .../single_table/run_synthesizing.py          |  6 +-
 .../models/clavaddpm/data_loaders.py          |  5 --
 .../models/clavaddpm/synthesizer.py           | 10 ++-
 6 files changed, 158 insertions(+), 16 deletions(-)
 create mode 100644 examples/synthesizing/multi_table/README.md
 create mode 100644 examples/synthesizing/multi_table/run_synthesizing.py

diff --git a/examples/synthesizing/multi_table/README.md b/examples/synthesizing/multi_table/README.md
new file mode 100644
index 00000000..8e13a277
--- /dev/null
+++ b/examples/synthesizing/multi_table/README.md
@@ -0,0 +1,60 @@
+# Multi-Table Synthesizing Example
+
+This example will go over synthesizing data for a multi-table dataset from the ground
+up using the code in this toolkit.
+
+
+## Downloading data
+
+First, we need the data. Download it from this
+[Google Drive link](https://drive.google.com/file/d/1Ao222l4AJjG54-HDEGCWkIfzRbl9_IKa/view?usp=drive_link),
+extract the files and place them in a `/data` folder in within this folder
+(`examples/synthesizing/multi_table`).
+
+> [!NOTE]
+> If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute
+> of the (`config.yaml`)[config.yaml] file.
+
+It will contain data for 8 tables: `account`, `card`, `client`, `disp`, `district`, `loan`, `order`,
+and `trans`. For each table there will be two files:
+- `{table_name}.csv`: The table's data.
+- `{table_name}_domain.json`: Metadata about the columns in the table's data, such as data types and sizes.
+
+Additionally, you will find one more file:
+- `dataset_meta.json`: Metadata about the relationship between the tables. It will describe which tables
+are associated with which other tables.
+
+
+## Kicking off synthesizing
+
+If there is a `/results` folder within this folder (`examples/synthesizing/multi_table`)
+from a previous training run, we will use that data to kick off synthesizing.
+For example, you can copy the results from another run (e.g. `examples.training.multi_table.run_training`)
+and paste them here, and they will be picked up by this example.
+
+The [`config.yaml`](config.yaml) file contains the parameters for the synthesizing and also
+for training, in case there is a need to run that. Please take a look at them before kicking
+off the synthesizing process and edit them as necessary.
+
+To kick off synthesizing, simply run the command below from the project's root folder:
+
+```bash
+python -m examples.synthesizing.multi_table.run_synthesizing
+```
+
+## Results
+
+It will save the result files inside a `/results` folder within this folder
+(`examples/synthesizing/multi_table`).
+
+> [!NOTE]
+> If you wish to change the save folder, you can do so by editing the `results_dir` attribute
+> of the (`config.yaml`)[config.yaml] file.
+
+In the `/results/before_matching/` folder, there will be a file called `synthetic_tables.pkl`,
+which is a pickle file containing the synthetic data before the matching process, in case
+it's needed.
+
+The `/results/single_table_synthesizing` folder will contain the final synthesized
+data, organized per table. In this single-table example, there is only going to be one
+synthesized table under `/results/single_table_synthesizing/trans/_final/trans_synthetic.csv`.
diff --git a/examples/synthesizing/multi_table/config.yaml b/examples/synthesizing/multi_table/config.yaml
index cc4cd2cc..4eea7848 100644
--- a/examples/synthesizing/multi_table/config.yaml
+++ b/examples/synthesizing/multi_table/config.yaml
@@ -1,7 +1,7 @@
 # Training example configuration
 # Base data directory (can be overridden from command line)
-base_data_dir: examples/synthesizing/single_table/data
-results_dir: examples/synthesizing/single_table/results
+base_data_dir: examples/synthesizing/multi_table/data
+results_dir: examples/synthesizing/multi_table/results
 
 diffusion_config:
   d_layers: [512, 1024, 1024, 1024, 1024, 512]
@@ -29,10 +29,10 @@ classifier_config:
     iterations: 20000
 
 general_config:
-    data_dir: examples/synthesizing/single_table/data
-    test_data_dir: examples/synthesizing/single_table/data
-    exp_name: single_table_synthesizing
-    workspace_dir: examples/synthesizing/single_table/results
+    data_dir: examples/synthesizing/multi_table/data
+    test_data_dir: examples/synthesizing/multi_table/data
+    exp_name: multi_table_synthesizing
+    workspace_dir: examples/synthesizing/multi_table/results
     sample_prefix: ""
 
 sampling_config:
diff --git a/examples/synthesizing/multi_table/run_synthesizing.py b/examples/synthesizing/multi_table/run_synthesizing.py
new file mode 100644
index 00000000..9d4dbe46
--- /dev/null
+++ b/examples/synthesizing/multi_table/run_synthesizing.py
@@ -0,0 +1,81 @@
+import pickle
+from logging import INFO
+from pathlib import Path
+
+import hydra
+from omegaconf import DictConfig
+
+from examples.training.multi_table import run_training
+from midst_toolkit.common.config import GeneralConfig, MatchingConfig, SamplingConfig
+from midst_toolkit.common.logger import TOOLKIT_LOGGER, log
+from midst_toolkit.models.clavaddpm.data_loaders import load_tables
+from midst_toolkit.models.clavaddpm.synthesizer import clava_synthesizing
+
+
+# Preventing some excessive logging
+TOOLKIT_LOGGER.setLevel(INFO)
+
+
+@hydra.main(config_path=".", config_name="config", version_base=None)
+def main(config: DictConfig) -> None:
+    """
+    Run the synthesizing pipeline for a multi-table diffusion model.
+
+    It will load the config and then data from the `config.base_data_dir` folder,
+    train the model, synthesize the data and save the results in the
+    `config.results_dir` folder.
+
+    It will first look for a pre-trained model in the `config.results_dir` folder.
+    If it doesn't find one, it will train a new model from scratch.
+
+    Args:
+        config: Training and synthesizing configuration as an OmegaConf DictConfig object.
+    """
+    log(INFO, f"Checking for a pre-trained model in {config.results_dir}...")
+
+    _, relation_order, _ = load_tables(Path(config.base_data_dir))
+
+    model_file_paths = {}
+    for relation in relation_order:
+        model_file_path = Path(config.results_dir) / "models" / f"{relation[0]}_{relation[1]}_ckpt.pkl"
+        model_file_paths[relation] = model_file_path
+
+    clustering_results_file = Path(config.results_dir) / "cluster_ckpt.pkl"
+
+    if all(model_file.exists() for model_file in model_file_paths.values()) and clustering_results_file.exists():
+        log(INFO, f"Found a pre-trained models in {config.results_dir}. Skipping training.")
+    else:
+        log(INFO, "No pre-trained models found, training a new model from scratch...")
+        run_training.main(config)
+
+    log(INFO, "Loading models...")
+
+    models = {}
+    for relation in relation_order:
+        with open(model_file_paths[relation], "rb") as f:
+            models[relation] = pickle.load(f)
+
+    with open(clustering_results_file, "rb") as f:
+        clustering_result = pickle.load(f)
+
+    tables = clustering_result["tables"]
+    all_group_lengths_prob_dicts = clustering_result["all_group_lengths_prob_dicts"]
+
+    log(INFO, "Synthesizing data...")
+
+    clava_synthesizing(
+        tables,
+        relation_order,
+        Path(config.results_dir),
+        models,
+        GeneralConfig(**config.general_config),
+        SamplingConfig(**config.sampling_config),
+        MatchingConfig(**config.matching_config),
+        all_group_lengths_prob_dicts,
+    )
+
+    log(INFO, "Data synthesized successfully.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/synthesizing/single_table/run_synthesizing.py b/examples/synthesizing/single_table/run_synthesizing.py
index 4fb5ce4b..44d46027 100644
--- a/examples/synthesizing/single_table/run_synthesizing.py
+++ b/examples/synthesizing/single_table/run_synthesizing.py
@@ -29,7 +29,7 @@ def main(config: DictConfig) -> None:
     If it doesn't find one, it will train a new model from scratch.
 
     Args:
-        config: Training configuration as an OmegaConf DictConfig object.
+        config: Training and synthesizing configuration as an OmegaConf DictConfig object.
     """
     log(INFO, f"Checking for a pre-trained model in {config.results_dir}...")
 
@@ -46,13 +46,15 @@ def main(config: DictConfig) -> None:
         log(INFO, "No pre-trained models found, training a new model from scratch...")
         run_training.main(config)
 
-    log(INFO, "Synthesizing data...")
+    log(INFO, "Loading models...")
 
     models = {}
     for relation in relation_order:
         with open(model_file_paths[relation], "rb") as f:
             models[relation] = pickle.load(f)
 
+    log(INFO, "Synthesizing data...")
+
     clava_synthesizing(
         tables,
         relation_order,
diff --git a/src/midst_toolkit/models/clavaddpm/data_loaders.py b/src/midst_toolkit/models/clavaddpm/data_loaders.py
index c9e09cce..2cb10746 100644
--- a/src/midst_toolkit/models/clavaddpm/data_loaders.py
+++ b/src/midst_toolkit/models/clavaddpm/data_loaders.py
@@ -125,11 +125,6 @@ def load_tables(
             info=info,
         )
 
-    # Adding the no parent placeholder column in the tables with no parent
-    for parent, child in relation_order:
-        if parent is None:
-            tables[child].data[NO_PARENT_COLUMN_NAME] = list(range(len(tables[child].data)))
-
     return tables, relation_order, dataset_meta
 
 
diff --git a/src/midst_toolkit/models/clavaddpm/synthesizer.py b/src/midst_toolkit/models/clavaddpm/synthesizer.py
index cbbdddc1..44741bcd 100644
--- a/src/midst_toolkit/models/clavaddpm/synthesizer.py
+++ b/src/midst_toolkit/models/clavaddpm/synthesizer.py
@@ -17,7 +17,7 @@
 from midst_toolkit.common.config import GeneralConfig, MatchingConfig, SamplingConfig
 from midst_toolkit.common.enumerations import DataSplit
 from midst_toolkit.common.logger import log
-from midst_toolkit.models.clavaddpm.data_loaders import Tables
+from midst_toolkit.models.clavaddpm.data_loaders import NO_PARENT_COLUMN_NAME, Tables
 from midst_toolkit.models.clavaddpm.dataset import Dataset, TableMetadata, Transformations
 from midst_toolkit.models.clavaddpm.enumerations import (
     CategoricalEncoding,
@@ -626,9 +626,9 @@ def sample_from_dict(probabilities: dict[int, float]) -> int:
     Returns:
         The sampled key.
     """
-    assert sum(probabilities.values()) == 1.0, "The sum of all probabilities must be 1.0."
+    assert np.isclose(sum(probabilities.values()), 1), "The sum of all probabilities must be 1."
 
-    # Generate a random number between 0 and 1
+    # Generate a random number in the half-open interval [0, 1)
     random_number = random.random()
 
     # Initialize cumulative sum and the selected key
@@ -755,6 +755,10 @@ def clava_synthesizing(
         log(INFO, "Sample size: {}".format(int(sample_scale * len(df_without_id))))
 
         if parent is None:
+            # Add the no-parent placeholder column if the table doesn't already have it
+            if NO_PARENT_COLUMN_NAME not in df_without_id.columns:
+                df_without_id[NO_PARENT_COLUMN_NAME] = list(range(len(df_without_id)))
+
             # synthesize data for single table or tables with no parent
             synthesized_df, table_keys = _synthesize_single_table(
                 child,

From 3a3814d271469aae7f25c46eace2cecc6872a04a Mon Sep 17 00:00:00 2001
From: Marcelo Lotif <marcelo.lotif@vectorinstitute.ai>
Date: Thu, 13 Nov 2025 15:28:38 -0500
Subject: [PATCH 19/23] making the save dir in case it doesn't exist

---
 src/midst_toolkit/models/clavaddpm/clustering.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/midst_toolkit/models/clavaddpm/clustering.py b/src/midst_toolkit/models/clavaddpm/clustering.py
index 1299d85b..b841ec6d 100644
--- a/src/midst_toolkit/models/clavaddpm/clustering.py
+++ b/src/midst_toolkit/models/clavaddpm/clustering.py
@@ -59,6 +59,8 @@ def clava_clustering(
             "tables": tables,
             "all_group_lengths_prob_dicts": all_group_lengths_prob_dicts,
         }
+
+        save_dir.mkdir(parents=True, exist_ok=True)
         with open(save_dir / "cluster_ckpt.pkl", "wb") as f:
             pickle.dump(cluster_ckpt, f)
 

From 32e9ab33f2297911ae79233baeaff65451307b5e Mon Sep 17 00:00:00 2001
From: Marcelo Lotif <marcelo.lotif@vectorinstitute.ai>
Date: Thu, 13 Nov 2025 15:56:10 -0500
Subject: [PATCH 20/23] Fixing tests

---
 tests/integration/models/clavaddpm/test_model.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/integration/models/clavaddpm/test_model.py b/tests/integration/models/clavaddpm/test_model.py
index 4bb3b965..43689ff0 100644
--- a/tests/integration/models/clavaddpm/test_model.py
+++ b/tests/integration/models/clavaddpm/test_model.py
@@ -122,7 +122,7 @@ def test_load_single_table():
         },
     )
 
-    assert relation_order == [[None, "trans"]]
+    assert relation_order == [(None, "trans")]
     assert dataset_meta["relation_order"] == [[None, "trans"]]
     assert dataset_meta["tables"] == {"trans": {"children": [], "parents": []}}
 
@@ -225,7 +225,7 @@ def test_load_tables():
         },
     )
 
-    assert relation_order == [[None, "account"], ["account", "trans"]]
+    assert relation_order == [(None, "account"), ("account", "trans")]
     assert dataset_meta["relation_order"] == [[None, "account"], ["account", "trans"]]
     assert dataset_meta["tables"] == {
         "account": {"children": ["trans"], "parents": []},

From 12f6491271e3123ab59541564132206c5e76c1b2 Mon Sep 17 00:00:00 2001
From: Marcelo Lotif <marcelo.lotif@vectorinstitute.ai>
Date: Thu, 13 Nov 2025 16:03:38 -0500
Subject: [PATCH 21/23] Finishing the instructions for the multi table example

---
 examples/synthesizing/multi_table/README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/synthesizing/multi_table/README.md b/examples/synthesizing/multi_table/README.md
index 8e13a277..8124c68b 100644
--- a/examples/synthesizing/multi_table/README.md
+++ b/examples/synthesizing/multi_table/README.md
@@ -55,6 +55,6 @@ In the `/results/before_matching/` folder, there will be a file called `syntheti
 which is a pickle file containing the synthetic data before the matching process, in case
 it's needed.
 
-The `/results/single_table_synthesizing` folder will contain the final synthesized
-data, organized per table. In this single-table example, there is only going to be one
-synthesized table under `/results/single_table_synthesizing/trans/_final/trans_synthetic.csv`.
+The `/results/multi_table_synthesizing` folder will contain the final synthesized
+data, organized per table, in the form of `.csv` files with the following naming pattern:
+`/results/multi_table_synthesizing/{table_name}/_final/{table_name}_synthetic.csv`.

From bc9af11df36656fa15ab3bd060b0885a2f545236 Mon Sep 17 00:00:00 2001
From: Marcelo Lotif <marcelo.lotif@vectorinstitute.ai>
Date: Thu, 13 Nov 2025 16:08:26 -0500
Subject: [PATCH 22/23] CR by coderabbit

---
 examples/synthesizing/multi_table/README.md            | 6 +++---
 examples/synthesizing/multi_table/run_synthesizing.py  | 2 +-
 examples/synthesizing/single_table/README.md           | 6 +++---
 examples/synthesizing/single_table/run_synthesizing.py | 2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/examples/synthesizing/multi_table/README.md b/examples/synthesizing/multi_table/README.md
index 8124c68b..737b49ec 100644
--- a/examples/synthesizing/multi_table/README.md
+++ b/examples/synthesizing/multi_table/README.md
@@ -8,12 +8,12 @@ up using the code in this toolkit.
 
 First, we need the data. Download it from this
 [Google Drive link](https://drive.google.com/file/d/1Ao222l4AJjG54-HDEGCWkIfzRbl9_IKa/view?usp=drive_link),
-extract the files and place them in a `/data` folder in within this folder
+extract the files and place them in a `/data` folder within this folder
 (`examples/synthesizing/multi_table`).
 
 > [!NOTE]
 > If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute
-> of the (`config.yaml`)[config.yaml] file.
+> of the [`config.yaml`](config.yaml) file.
 
 It will contain data for 8 tables: `account`, `card`, `client`, `disp`, `district`, `loan`, `order`,
 and `trans`. For each table there will be two files:
@@ -49,7 +49,7 @@ It will save the result files inside a `/results` folder within this folder
 
 > [!NOTE]
 > If you wish to change the save folder, you can do so by editing the `results_dir` attribute
-> of the (`config.yaml`)[config.yaml] file.
+> of the [`config.yaml`](config.yaml) file.
 
 In the `/results/before_matching/` folder, there will be a file called `synthetic_tables.pkl`,
 which is a pickle file containing the synthetic data before the matching process, in case
diff --git a/examples/synthesizing/multi_table/run_synthesizing.py b/examples/synthesizing/multi_table/run_synthesizing.py
index 9d4dbe46..b19678f6 100644
--- a/examples/synthesizing/multi_table/run_synthesizing.py
+++ b/examples/synthesizing/multi_table/run_synthesizing.py
@@ -43,7 +43,7 @@ def main(config: DictConfig) -> None:
     clustering_results_file = Path(config.results_dir) / "cluster_ckpt.pkl"
 
     if all(model_file.exists() for model_file in model_file_paths.values()) and clustering_results_file.exists():
-        log(INFO, f"Found a pre-trained models in {config.results_dir}. Skipping training.")
+        log(INFO, f"Found pre-trained models in {config.results_dir}. Skipping training.")
     else:
         log(INFO, "No pre-trained models found, training a new model from scratch...")
         run_training.main(config)
diff --git a/examples/synthesizing/single_table/README.md b/examples/synthesizing/single_table/README.md
index 15c1bca8..5f6f1f51 100644
--- a/examples/synthesizing/single_table/README.md
+++ b/examples/synthesizing/single_table/README.md
@@ -8,12 +8,12 @@ up using the code in this toolkit.
 
 First, we need the data. Download it from this
 [Google Drive link](https://drive.google.com/file/d/1J5qDuMHHg4dm9c3ISmb41tcTHSu1SVUC/view?usp=drive_link),
-extract the files and place them in a `/data` folder in within this folder
+extract the files and place them in a `/data` folder within this folder
 (`examples/synthesizing/single_table`).
 
 > [!NOTE]
 > If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute
-> of the (`config.yaml`)[config.yaml] file.
+> of the [`config.yaml`](config.yaml) file.
 
 Here is a description of the files that have been extracted:
 - `trans.csv`: The training data. It consists of information about bank transactions and it
@@ -47,7 +47,7 @@ It will save the result files inside a `/results` folder within this folder
 
 > [!NOTE]
 > If you wish to change the save folder, you can do so by editing the `results_dir` attribute
-> of the (`config.yaml`)[config.yaml] file.
+> of the [`config.yaml`](config.yaml) file.
 
 In the `/results/before_matching/` folder, there will be a file called `synthetic_tables.pkl`,
 which is a pickle file containing the synthetic data before the matching process, in case
diff --git a/examples/synthesizing/single_table/run_synthesizing.py b/examples/synthesizing/single_table/run_synthesizing.py
index 44d46027..72449103 100644
--- a/examples/synthesizing/single_table/run_synthesizing.py
+++ b/examples/synthesizing/single_table/run_synthesizing.py
@@ -41,7 +41,7 @@ def main(config: DictConfig) -> None:
         model_file_paths[relation] = model_file_path
 
     if all(model_file.exists() for model_file in model_file_paths.values()):
-        log(INFO, f"Found a pre-trained models in {config.results_dir}. Skipping training.")
+        log(INFO, f"Found pre-trained models in {config.results_dir}. Skipping training.")
     else:
         log(INFO, "No pre-trained models found, training a new model from scratch...")
         run_training.main(config)

From 02bcf422ee07be56836d299bc9da5b09559fd402 Mon Sep 17 00:00:00 2001
From: Marcelo Lotif <marcelo.lotif@vectorinstitute.ai>
Date: Mon, 17 Nov 2025 15:31:47 -0500
Subject: [PATCH 23/23] David's CR

---
 examples/synthesizing/multi_table/config.yaml |  3 +++
 .../multi_table/run_synthesizing.py           | 19 ++++++++++-----
 .../synthesizing/single_table/config.yaml     |  2 ++
 .../single_table/run_synthesizing.py          | 24 ++++++++++++++-----
 4 files changed, 36 insertions(+), 12 deletions(-)

diff --git a/examples/synthesizing/multi_table/config.yaml b/examples/synthesizing/multi_table/config.yaml
index 4eea7848..bbfcff30 100644
--- a/examples/synthesizing/multi_table/config.yaml
+++ b/examples/synthesizing/multi_table/config.yaml
@@ -3,6 +3,8 @@
 base_data_dir: examples/synthesizing/multi_table/data
 results_dir: examples/synthesizing/multi_table/results
 
+# diffusion_config, clustering_config, and classifier_config are only required
+# when training a new model from scratch
 diffusion_config:
   d_layers: [512, 1024, 1024, 1024, 1024, 512]
   dropout: 0.0
@@ -28,6 +30,7 @@ classifier_config:
     batch_size: 4096
     iterations: 20000
 
+# Synthesizing configuration
 general_config:
     data_dir: examples/synthesizing/multi_table/data
     test_data_dir: examples/synthesizing/multi_table/data
diff --git a/examples/synthesizing/multi_table/run_synthesizing.py b/examples/synthesizing/multi_table/run_synthesizing.py
index b19678f6..9d845e2e 100644
--- a/examples/synthesizing/multi_table/run_synthesizing.py
+++ b/examples/synthesizing/multi_table/run_synthesizing.py
@@ -1,6 +1,7 @@
 import pickle
 from logging import INFO
 from pathlib import Path
+from typing import Any
 
 import hydra
 from omegaconf import DictConfig
@@ -9,6 +10,7 @@
 from midst_toolkit.common.config import GeneralConfig, MatchingConfig, SamplingConfig
 from midst_toolkit.common.logger import TOOLKIT_LOGGER, log
 from midst_toolkit.models.clavaddpm.data_loaders import load_tables
+from midst_toolkit.models.clavaddpm.enumerations import Relation
 from midst_toolkit.models.clavaddpm.synthesizer import clava_synthesizing
 
 
@@ -35,24 +37,29 @@ def main(config: DictConfig) -> None:
 
     _, relation_order, _ = load_tables(Path(config.base_data_dir))
 
-    model_file_paths = {}
+    model_file_paths: dict[Relation, dict[str, Any]] = {}
     for relation in relation_order:
         model_file_path = Path(config.results_dir) / "models" / f"{relation[0]}_{relation[1]}_ckpt.pkl"
-        model_file_paths[relation] = model_file_path
+        model_file_paths[relation] = {
+            "file_path": model_file_path,
+            "exists": model_file_path.exists(),
+        }
 
     clustering_results_file = Path(config.results_dir) / "cluster_ckpt.pkl"
 
-    if all(model_file.exists() for model_file in model_file_paths.values()) and clustering_results_file.exists():
-        log(INFO, f"Found pre-trained models in {config.results_dir}. Skipping training.")
+    if all(result["exists"] for result in model_file_paths.values()) and clustering_results_file.exists():
+        log(INFO, f"Found previous results in {config.results_dir}. Skipping training.")
     else:
-        log(INFO, "No pre-trained models found, training a new model from scratch...")
+        log(INFO, "Not all previous results found. Training a new model from scratch.")
+        log(INFO, f"Summary of results: {model_file_paths}")
+        log(INFO, f"Clustering results file: {clustering_results_file} exists? {clustering_results_file.exists()}")
         run_training.main(config)
 
     log(INFO, "Loading models...")
 
     models = {}
     for relation in relation_order:
-        with open(model_file_paths[relation], "rb") as f:
+        with open(model_file_paths[relation]["file_path"], "rb") as f:
             models[relation] = pickle.load(f)
 
     with open(clustering_results_file, "rb") as f:
diff --git a/examples/synthesizing/single_table/config.yaml b/examples/synthesizing/single_table/config.yaml
index de21a680..b3cbb0e2 100644
--- a/examples/synthesizing/single_table/config.yaml
+++ b/examples/synthesizing/single_table/config.yaml
@@ -3,6 +3,7 @@
 base_data_dir: examples/synthesizing/single_table/data
 results_dir: examples/synthesizing/single_table/results
 
+# diffusion_config is only required when training a new model from scratch
 diffusion_config:
   d_layers: [512, 1024, 1024, 1024, 1024, 512]
   dropout: 0.0
@@ -16,6 +17,7 @@ diffusion_config:
   scheduler: cosine
   data_split_ratios: [0.99, 0.005, 0.005]
 
+# Synthesizing configuration
 general_config:
     data_dir: examples/synthesizing/single_table/data
     test_data_dir: examples/synthesizing/single_table/data
diff --git a/examples/synthesizing/single_table/run_synthesizing.py b/examples/synthesizing/single_table/run_synthesizing.py
index 72449103..b9f6a649 100644
--- a/examples/synthesizing/single_table/run_synthesizing.py
+++ b/examples/synthesizing/single_table/run_synthesizing.py
@@ -1,6 +1,7 @@
 import pickle
 from logging import INFO
 from pathlib import Path
+from typing import Any
 
 import hydra
 from omegaconf import DictConfig
@@ -9,6 +10,7 @@
 from midst_toolkit.common.config import GeneralConfig, MatchingConfig, SamplingConfig
 from midst_toolkit.common.logger import TOOLKIT_LOGGER, log
 from midst_toolkit.models.clavaddpm.data_loaders import load_tables
+from midst_toolkit.models.clavaddpm.enumerations import Relation
 from midst_toolkit.models.clavaddpm.synthesizer import clava_synthesizing
 
 
@@ -35,22 +37,32 @@ def main(config: DictConfig) -> None:
 
     tables, relation_order, _ = load_tables(Path(config.base_data_dir))
 
-    model_file_paths = {}
+    assert len(relation_order) == 1 and relation_order[0][0] is None, (
+        "Relation order is not configured for single-table. "
+        "For multi-table synthesizing, please use the `examples.synthesizing.multi_table.run_synthesizing` example. "
+        f"Relation order: {relation_order}"
+    )
+
+    model_file_paths: dict[Relation, dict[str, Any]] = {}
     for relation in relation_order:
         model_file_path = Path(config.results_dir) / "models" / f"{relation[0]}_{relation[1]}_ckpt.pkl"
-        model_file_paths[relation] = model_file_path
+        model_file_paths[relation] = {
+            "file_path": model_file_path,
+            "exists": model_file_path.exists(),
+        }
 
-    if all(model_file.exists() for model_file in model_file_paths.values()):
-        log(INFO, f"Found pre-trained models in {config.results_dir}. Skipping training.")
+    if all(result["exists"] for result in model_file_paths.values()):
+        log(INFO, f"Found previous results in {config.results_dir}. Skipping training.")
     else:
-        log(INFO, "No pre-trained models found, training a new model from scratch...")
+        log(INFO, "Not all previous results found. Training a new model from scratch.")
+        log(INFO, f"Summary of results: {model_file_paths}")
         run_training.main(config)
 
     log(INFO, "Loading models...")
 
     models = {}
     for relation in relation_order:
-        with open(model_file_paths[relation], "rb") as f:
+        with open(model_file_paths[relation]["file_path"], "rb") as f:
             models[relation] = pickle.load(f)
 
     log(INFO, "Synthesizing data...")