VectorInstitute
diff --git a/‎.gitignore‎
Lines changed: 4 additions & 0 deletions b/‎.gitignore‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎examples/synthesizing/multi_table/README.md‎
Lines changed: 60 additions & 0 deletions b/‎examples/synthesizing/multi_table/README.md‎
Lines changed: 60 additions & 0 deletions
diff --git a/‎examples/synthesizing/multi_table/config.yaml‎
Lines changed: 49 additions & 0 deletions b/‎examples/synthesizing/multi_table/config.yaml‎
Lines changed: 49 additions & 0 deletions
diff --git a/‎examples/synthesizing/multi_table/run_synthesizing.py‎
Lines changed: 88 additions & 0 deletions b/‎examples/synthesizing/multi_table/run_synthesizing.py‎
Lines changed: 88 additions & 0 deletions
diff --git a/‎examples/synthesizing/single_table/README.md‎
Lines changed: 58 additions & 0 deletions b/‎examples/synthesizing/single_table/README.md‎
Lines changed: 58 additions & 0 deletions
diff --git a/‎examples/synthesizing/single_table/config.yaml‎
Lines changed: 36 additions & 0 deletions b/‎examples/synthesizing/single_table/config.yaml‎
Lines changed: 36 additions & 0 deletions
diff --git a/‎examples/synthesizing/single_table/run_synthesizing.py‎
Lines changed: 84 additions & 0 deletions b/‎examples/synthesizing/single_table/run_synthesizing.py‎
Lines changed: 84 additions & 0 deletions
@@ -47,3 +47,7 @@ examples/training/single_table/data/**
 examples/training/single_table/results/**
 examples/training/multi_table/data/**
 examples/training/multi_table/results/**
+examples/synthesizing/single_table/data/**
+examples/synthesizing/single_table/results/**
+examples/synthesizing/multi_table/data/**
+examples/synthesizing/multi_table/results/**
@@ -0,0 +1,60 @@
+# Multi-Table Synthesizing Example
+
+This example will go over synthesizing data for a multi-table dataset from the ground
+up using the code in this toolkit.
+
+
+## Downloading data
+
+First, we need the data. Download it from this
+[Google Drive link](https://drive.google.com/file/d/1Ao222l4AJjG54-HDEGCWkIfzRbl9_IKa/view?usp=drive_link),
+extract the files and place them in a `/data` folder within this folder
+(`examples/synthesizing/multi_table`).
+
+> [!NOTE]
+> If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute
+> of the [`config.yaml`](config.yaml) file.
+
+The extracted folder will contain data for 8 tables: `account`, `card`, `client`, `disp`, `district`,
+`loan`, `order`, and `trans`. For each table there will be two files:
+- `{table_name}.csv`: The table's data.
+- `{table_name}_domain.json`: Metadata about the columns in the table's data, such as data types and sizes.
+
+Additionally, you will find one more file:
+- `dataset_meta.json`: Metadata about the relationship between the tables. It will describe which tables
+are associated with which other tables.
+
+
+## Kicking off synthesizing
+
+If there is a `/results` folder within this folder (`examples/synthesizing/multi_table`)
+from a previous training run, the trained models in it will be used to kick off synthesizing.
+For example, you can copy the results from another run (e.g. `examples.training.multi_table.run_training`)
+and paste them here, and this example will pick them up. If any of the trained models or the
+clustering results are missing, a new model will be trained from scratch before synthesizing.
+
+The [`config.yaml`](config.yaml) file contains the parameters for the synthesizing and also
+for training, in case there is a need to run that. Please take a look at them before kicking
+off the synthesizing process and edit them as necessary.
+
+To kick off synthesizing, simply run the command below from the project's root folder:
+
+```bash
+python -m examples.synthesizing.multi_table.run_synthesizing
+```
+
+## Results
+
+It will save the result files inside a `/results` folder within this folder
+(`examples/synthesizing/multi_table`).
+
+> [!NOTE]
+> If you wish to change the save folder, you can do so by editing the `results_dir` attribute
+> of the [`config.yaml`](config.yaml) file.
+
+In the `/results/before_matching/` folder, there will be a file called `synthetic_tables.pkl`,
+which is a pickle file containing the synthetic data before the matching process, in case
+it's needed.
+
+The `/results/multi_table_synthesizing` folder will contain the final synthesized
+data, organized per table, in the form of `.csv` files with the following naming pattern:
+`/results/multi_table_synthesizing/{table_name}/_final/{table_name}_synthetic.csv`.
@@ -0,0 +1,49 @@
+# Synthesizing example configuration (also includes the parameters used if training is needed)
+# Base data directory (can be overridden from command line)
+base_data_dir: examples/synthesizing/multi_table/data
+results_dir: examples/synthesizing/multi_table/results
+
+# diffusion_config, clustering_config, and classifier_config are only required
+# when training a new model from scratch
+diffusion_config:
+  d_layers: [512, 1024, 1024, 1024, 1024, 512]
+  dropout: 0.0
+  num_timesteps: 2000
+  model_type: mlp
+  iterations: 20000
+  batch_size: 4096
+  lr: 0.0006
+  gaussian_loss_type: mse
+  weight_decay: 1e-05
+  scheduler: cosine
+  data_split_ratios: [0.99, 0.005, 0.005]
+
+clustering_config:
+  parent_scale: 1.0
+  num_clusters: 50
+  clustering_method: kmeans_and_gmm
+
+classifier_config:
+    d_layers: [128, 256, 512, 1024, 512, 256, 128]
+    lr: 0.0001
+    dim_t: 128
+    batch_size: 4096
+    iterations: 20000
+
+# Synthesizing configuration
+general_config:
+    data_dir: examples/synthesizing/multi_table/data
+    test_data_dir: examples/synthesizing/multi_table/data
+    exp_name: multi_table_synthesizing
+    workspace_dir: examples/synthesizing/multi_table/results
+    sample_prefix: ""
+
+sampling_config:
+    batch_size: 20000
+    classifier_scale: 1.0
+
+matching_config:
+    num_matching_clusters: 1
+    matching_batch_size: 1000
+    unique_matching: True
+    no_matching: False
@@ -0,0 +1,88 @@
+import pickle
+from logging import INFO
+from pathlib import Path
+from typing import Any
+
+import hydra
+from omegaconf import DictConfig
+
+from examples.training.multi_table import run_training
+from midst_toolkit.common.config import GeneralConfig, MatchingConfig, SamplingConfig
+from midst_toolkit.common.logger import TOOLKIT_LOGGER, log
+from midst_toolkit.models.clavaddpm.data_loaders import load_tables
+from midst_toolkit.models.clavaddpm.enumerations import Relation
+from midst_toolkit.models.clavaddpm.synthesizer import clava_synthesizing
+
+
+# Set the toolkit logger's level to INFO so that messages below INFO
+# (e.g. DEBUG) are not emitted, preventing some excessive logging
+TOOLKIT_LOGGER.setLevel(INFO)
+
+
+@hydra.main(config_path=".", config_name="config", version_base=None)
+def main(config: DictConfig) -> None:
+    """
+    Run the synthesizing pipeline for a multi-table diffusion model.
+
+    It will load the config and then data from the `config.base_data_dir` folder,
+    train the model, synthesize the data and save the results in the
+    `config.results_dir` folder.
+
+    It will first look for a pre-trained model in the `config.results_dir` folder.
+    If it doesn't find one, it will train a new model from scratch.
+
+    Args:
+        config: Training and synthesizing configuration as an OmegaConf DictConfig object.
+    """
+    log(INFO, f"Checking for a pre-trained model in {config.results_dir}...")
+
+    _, relation_order, _ = load_tables(Path(config.base_data_dir))
+
+    model_file_paths: dict[Relation, dict[str, Any]] = {}
+    for relation in relation_order:
+        model_file_path = Path(config.results_dir) / "models" / f"{relation[0]}_{relation[1]}_ckpt.pkl"
+        model_file_paths[relation] = {
+            "file_path": model_file_path,
+            "exists": model_file_path.exists(),
+        }
+
+    clustering_results_file = Path(config.results_dir) / "cluster_ckpt.pkl"
+
+    if all(result["exists"] for result in model_file_paths.values()) and clustering_results_file.exists():
+        log(INFO, f"Found previous results in {config.results_dir}. Skipping training.")
+    else:
+        log(INFO, "Not all previous results found. Training a new model from scratch.")
+        log(INFO, f"Summary of results: {model_file_paths}")
+        log(INFO, f"Clustering results file: {clustering_results_file} exists? {clustering_results_file.exists()}")
+        run_training.main(config)
+
+    log(INFO, "Loading models...")
+
+    models = {}
+    for relation in relation_order:
+        with open(model_file_paths[relation]["file_path"], "rb") as f:
+            models[relation] = pickle.load(f)
+
+    with open(clustering_results_file, "rb") as f:
+        clustering_result = pickle.load(f)
+
+    tables = clustering_result["tables"]
+    all_group_lengths_prob_dicts = clustering_result["all_group_lengths_prob_dicts"]
+
+    log(INFO, "Synthesizing data...")
+
+    clava_synthesizing(
+        tables,
+        relation_order,
+        Path(config.results_dir),
+        models,
+        GeneralConfig(**config.general_config),
+        SamplingConfig(**config.sampling_config),
+        MatchingConfig(**config.matching_config),
+        all_group_lengths_prob_dicts,
+    )
+
+    log(INFO, "Data synthesized successfully.")
+
+
+# Script entry point: `main` is called without arguments, leaving it to the
+# `@hydra.main` decorator to provide the config (config_name="config").
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,58 @@
+# Single-Table Synthesizing Example
+
+This example will go over synthesizing data for a single-table dataset from the ground
+up using the code in this toolkit.
+
+
+## Downloading data
+
+First, we need the data. Download it from this
+[Google Drive link](https://drive.google.com/file/d/1J5qDuMHHg4dm9c3ISmb41tcTHSu1SVUC/view?usp=drive_link),
+extract the files and place them in a `/data` folder within this folder
+(`examples/synthesizing/single_table`).
+
+> [!NOTE]
+> If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute
+> of the [`config.yaml`](config.yaml) file.
+
+Here is a description of the files that have been extracted:
+- `trans.csv`: The training data. It consists of information about bank transactions and it
+contains 20,000 data points.
+- `trans_domain.json`: Metadata about the columns in `trans.csv`, such as data types and sizes.
+- `dataset_meta.json`: Metadata about the relationship between the tables. Since this is a
+single-table example, it will only contain information about the `trans` table.
+
+
+## Kicking off synthesizing
+
+If there is a `/results` folder within this folder (`examples/synthesizing/single_table`)
+from a previous training run, the trained model in it will be used to kick off synthesizing.
+For example, you can copy the results from another run (e.g. `examples.training.single_table.run_training`)
+and paste them here, and this example will pick them up. If the trained model is missing,
+a new model will be trained from scratch before synthesizing.
+
+The [`config.yaml`](config.yaml) file contains the parameters for the synthesizing and also
+for training, in case there is a need to run that. Please take a look at them before kicking
+off the synthesizing process and edit them as necessary.
+
+To kick off synthesizing, simply run the command below from the project's root folder:
+
+```bash
+python -m examples.synthesizing.single_table.run_synthesizing
+```
+
+## Results
+
+It will save the result files inside a `/results` folder within this folder
+(`examples/synthesizing/single_table`).
+
+> [!NOTE]
+> If you wish to change the save folder, you can do so by editing the `results_dir` attribute
+> of the [`config.yaml`](config.yaml) file.
+
+In the `/results/before_matching/` folder, there will be a file called `synthetic_tables.pkl`,
+which is a pickle file containing the synthetic data before the matching process, in case
+it's needed.
+
+The `/results/single_table_synthesizing` folder will contain the final synthesized
+data, organized per table. In this single-table example, there is only going to be one
+synthesized table under `/results/single_table_synthesizing/trans/_final/trans_synthetic.csv`.
@@ -0,0 +1,36 @@
+# Synthesizing example configuration (also includes the parameters used if training is needed)
+# Base data directory (can be overridden from command line)
+base_data_dir: examples/synthesizing/single_table/data
+results_dir: examples/synthesizing/single_table/results
+
+# diffusion_config is only required when training a new model from scratch
+diffusion_config:
+  d_layers: [512, 1024, 1024, 1024, 1024, 512]
+  dropout: 0.0
+  num_timesteps: 2000
+  model_type: mlp
+  iterations: 20000
+  batch_size: 4096
+  lr: 0.0006
+  gaussian_loss_type: mse
+  weight_decay: 1e-05
+  scheduler: cosine
+  data_split_ratios: [0.99, 0.005, 0.005]
+
+# Synthesizing configuration
+general_config:
+    data_dir: examples/synthesizing/single_table/data
+    test_data_dir: examples/synthesizing/single_table/data
+    exp_name: single_table_synthesizing
+    workspace_dir: examples/synthesizing/single_table/results
+    sample_prefix: ""
+
+sampling_config:
+    batch_size: 20000
+    classifier_scale: 1.0
+
+matching_config:
+    num_matching_clusters: 1
+    matching_batch_size: 1000
+    unique_matching: True
+    no_matching: False
@@ -0,0 +1,84 @@
+import pickle
+from logging import INFO
+from pathlib import Path
+from typing import Any
+
+import hydra
+from omegaconf import DictConfig
+
+from examples.training.single_table import run_training
+from midst_toolkit.common.config import GeneralConfig, MatchingConfig, SamplingConfig
+from midst_toolkit.common.logger import TOOLKIT_LOGGER, log
+from midst_toolkit.models.clavaddpm.data_loaders import load_tables
+from midst_toolkit.models.clavaddpm.enumerations import Relation
+from midst_toolkit.models.clavaddpm.synthesizer import clava_synthesizing
+
+
+# Set the toolkit logger's level to INFO so that messages below INFO
+# (e.g. DEBUG) are not emitted, preventing some excessive logging
+TOOLKIT_LOGGER.setLevel(INFO)
+
+
+@hydra.main(config_path=".", config_name="config", version_base=None)
+def main(config: DictConfig) -> None:
+    """
+    Run the synthesizing pipeline for a single-table diffusion model.
+
+    It will load the config and then data from the `config.base_data_dir` folder,
+    train the model, synthesize the data and save the results in the
+    `config.results_dir` folder.
+
+    It will first look for a pre-trained model in the `config.results_dir` folder.
+    If it doesn't find one, it will train a new model from scratch.
+
+    Args:
+        config: Training and synthesizing configuration as an OmegaConf DictConfig object.
+    """
+    log(INFO, f"Checking for a pre-trained model in {config.results_dir}...")
+
+    tables, relation_order, _ = load_tables(Path(config.base_data_dir))
+
+    assert len(relation_order) == 1 and relation_order[0][0] is None, (
+        "Relation order is not configured for single-table. "
+        "For multi-table synthesizing, please use the `examples.synthesizing.multi_table.run_synthesizing` example. "
+        f"Relation order: {relation_order}"
+    )
+
+    model_file_paths: dict[Relation, dict[str, Any]] = {}
+    for relation in relation_order:
+        model_file_path = Path(config.results_dir) / "models" / f"{relation[0]}_{relation[1]}_ckpt.pkl"
+        model_file_paths[relation] = {
+            "file_path": model_file_path,
+            "exists": model_file_path.exists(),
+        }
+
+    if all(result["exists"] for result in model_file_paths.values()):
+        log(INFO, f"Found previous results in {config.results_dir}. Skipping training.")
+    else:
+        log(INFO, "Not all previous results found. Training a new model from scratch.")
+        log(INFO, f"Summary of results: {model_file_paths}")
+        run_training.main(config)
+
+    log(INFO, "Loading models...")
+
+    models = {}
+    for relation in relation_order:
+        with open(model_file_paths[relation]["file_path"], "rb") as f:
+            models[relation] = pickle.load(f)
+
+    log(INFO, "Synthesizing data...")
+
+    clava_synthesizing(
+        tables,
+        relation_order,
+        Path(config.results_dir),
+        models,
+        GeneralConfig(**config.general_config),
+        SamplingConfig(**config.sampling_config),
+        MatchingConfig(**config.matching_config),
+    )
+
+    log(INFO, "Data synthesized successfully.")
+
+
+# Script entry point: `main` is called without arguments, leaving it to the
+# `@hydra.main` decorator to provide the config (config_name="config").
+if __name__ == "__main__":
+    main()