Removing debug asserts, adding asserts for multi table generation

lotif · lotif · commit 378efef84e1b · 2025-08-12T16:42:10.000-03:00
diff --git a/tests/integration/data/multi_table/assertion_data/syntetic_data.json b/tests/integration/data/multi_table/assertion_data/syntetic_data.json
diff --git a/tests/integration/data/single_table/assertion_data/syntetic_data.json b/tests/integration/data/single_table/assertion_data/syntetic_data.json
@@ -0,0 +1,4 @@
+{
+    "X_gen_std": [1270.027488, 497.444234, 1669.368549, 2383.459776, 1170.001261],
+    "y_gen": [9, 47, 15, 6, 90]
+}
diff --git a/tests/integration/models/clavaddpm/test_model.py b/tests/integration/models/clavaddpm/test_model.py
@@ -267,7 +267,7 @@ def test_train_single_table(tmp_path: Path):
     )
     X_gen, y_gen = x_gen_tensor.numpy(), y_gen_tensor.numpy()
 
-    with open("tests/integration/data/single_table/assertion_data/syntetic_trans_data.json", "r") as f:
+    with open("tests/integration/data/single_table/assertion_data/syntetic_data.json", "r") as f:
         expected_results = json.load(f)
 
     # Assert the synthetic samples are within the expected values
@@ -281,14 +281,41 @@ def test_train_single_table(tmp_path: Path):
 
 @pytest.mark.integration_test()
 def test_train_multi_table(tmp_path: Path):
+    # Setup
+    set_all_random_seeds(seed=133742, use_deterministic_torch_algos=True, disable_torch_benchmarking=True)
+
+    # Act
     os.makedirs(tmp_path / "models")
     configs = {"clustering": CLUSTERING_CONFIG, "diffusion": DIFFUSION_CONFIG, "classifier": CLASSIFIER_CONFIG}
 
     tables, relation_order, dataset_meta = load_multi_table("tests/integration/data/multi_table/")
     tables, all_group_lengths_prob_dicts = clava_clustering(tables, relation_order, tmp_path, configs)
     models = clava_training(tables, relation_order, tmp_path, configs, device="cpu")
 
-    assert models
+    # Assert
+    with open(tmp_path / "models" / "None_trans_ckpt.pkl", "rb") as f:
+        table_info = pickle.load(f)["table_info"]
+
+    sample_size = 5
+    key = (None, "trans")
+    x_gen_tensor, y_gen_tensor = models[key]["diffusion"].sample_all(
+        sample_size,
+        DIFFUSION_CONFIG["batch_size"],
+        table_info[key]["empirical_class_dist"].float(),
+        ddim=False,
+    )
+    X_gen, y_gen = x_gen_tensor.numpy(), y_gen_tensor.numpy()
+
+    with open("tests/integration/data/multi_table/assertion_data/syntetic_data.json", "r") as f:
+        expected_results = json.load(f)
+
+    # Assert the synthetic samples are within the expected values
+    # For X_gen, we are checking if the standard deviations of each row
+    # are within a pre-defined range with some percentage of tolerance
+    assert np.allclose(X_gen.std(axis=1, ddof=0), expected_results["X_gen_std"], rtol=0.05, atol=0)
+    assert all(y_gen == expected_results["y_gen"])
+
+    unset_all_random_seeds()
 
 
 @pytest.mark.integration_test()
@@ -308,11 +335,6 @@ def test_clustering_reload(tmp_path: Path):
     account_original_df_as_float = tables["account"]["original_df"].astype(float)
     assert account_df_no_clustering.equals(account_original_df_as_float)
 
-    print(tables["account"]["df"]["account_trans_cluster"].tolist())
-    print(tables["trans"]["df"]["account_trans_cluster"].tolist())
-
-    assert account_df_no_clustering == account_original_df_as_float
-
     if _is_apple_silicon():
         # TODO: Figure out if there is a good way of testing the clustering results
         # on multiple platforms. https://app.clickup.com/t/868f43wp0

-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +{
 +    "X_gen_std": [1270.027488, 497.444234, 1669.368549, 2383.459776, 1170.001261],
 +    "y_gen": [9, 47, 15, 6, 90]
 +}