more refactoring and training updates

Michael Fuest · Michael Fuest · commit 24d39f509114 · 2025-05-14T04:18:37.000-04:00
diff --git a/.gitignore b/.gitignore
@@ -123,3 +123,4 @@ tutorials/outputs
 checkpoints
 bfg.jar
 lightning_logs/*
+wandb/*
diff --git a/cents/config/dataset/pecanstreet.yaml b/cents/config/dataset/pecanstreet.yaml
@@ -12,7 +12,6 @@ time_series_columns: ["grid", "solar"]
 data_columns: ["dataid","local_15min","car1","grid","solar"]
 metadata_columns: ["dataid","building_type","solar","car1","city","state","total_square_footage","house_construction_year"]
 user_group: all # non_pv_users, all, pv_users
-user_id: null
 numeric_context_bins: 5
 
 context_vars: # for each desired context variable, add the name and number of categories
diff --git a/cents/config/evaluator/default.yaml b/cents/config/evaluator/default.yaml
@@ -1,7 +1,7 @@
 model_name: ${model.name}
 eval_pv_shift: False
 eval_metrics: True
-eval_vis: True
-eval_context_sparse: False
+eval_context_sparse: True
 save_results: False
+eval_disentanglement: True
 save_dir: ${run_dir}/eval
diff --git a/cents/config/model/diffusion_ts.yaml b/cents/config/model/diffusion_ts.yaml
@@ -24,4 +24,4 @@ reg_weight: null
 gradient_accumulate_every: 2
 ema_decay: 0.99
 ema_update_interval: 10
-use_ema_sampling: True
+use_ema_sampling: False
diff --git a/cents/config/trainer/acgan.yaml b/cents/config/trainer/acgan.yaml
@@ -7,6 +7,7 @@ batch_size: 1024
 sampling_batch_size: 4096
 gradient_accumulate_every: 1
 log_every_n_steps: 1
+eval_after_training: False
 
 checkpoint:
   save_last: False
diff --git a/cents/config/trainer/diffusion_ts.yaml b/cents/config/trainer/diffusion_ts.yaml
@@ -7,6 +7,7 @@ log_every_n_steps: 1
 batch_size: 1024
 max_epochs: 5000
 base_lr: 1e-4
+eval_after_training: False
 
 checkpoint:
   save_last: False
diff --git a/cents/config/trainer/normalizer.yaml b/cents/config/trainer/normalizer.yaml
@@ -8,6 +8,7 @@ n_epochs: 2000
 batch_size: 4096
 lr: 3e-4
 save_cycle: 5000
+eval_after_training: False
 
 checkpoint:
   save_last: False
diff --git a/cents/data_generator.py b/cents/data_generator.py
@@ -178,7 +178,9 @@ def load_from_checkpoint(
         if ckpt_path.suffix == ".ckpt":
             self.model = (
                 ModelCls.load_from_checkpoint(
-                    checkpoint_path=ckpt_path, map_location=device
+                    checkpoint_path=ckpt_path,
+                    map_location=device,
+                    strict=False,
                 )
                 .to(device)
                 .eval()
diff --git a/cents/datasets/utils.py b/cents/datasets/utils.py
@@ -66,7 +66,7 @@ def check_inverse_transform(
         mse_list.append(mse)
 
     avg_mse = np.mean(mse_list)
-    print(f"Average MSE over all rows: {avg_mse}")
+    print(f"[Cents] Average MSE over all rows: {avg_mse}")
     return avg_mse
 
 
diff --git a/cents/eval/eval.py b/cents/eval/eval.py
@@ -11,15 +11,18 @@
 import numpy as np
 import pandas as pd
 import torch
-import wandb
 from omegaconf import DictConfig, OmegaConf
 
+import wandb
 from cents.eval.discriminative_score import discriminative_score_metrics
 from cents.eval.eval_metrics import (
     Context_FID,
     calculate_mmd,
+    compute_mig,
+    compute_sap,
     dynamic_time_warping_dist,
 )
+from cents.eval.eval_utils import flatten_log_dict
 from cents.eval.predictive_score import predictive_score_metrics
 from cents.models.acgan import ACGAN
 from cents.models.diffusion_ts import Diffusion_TS
@@ -33,8 +36,7 @@ class Evaluator:
     A class for evaluating generative models on time series data.
 
     This class handles the evaluation process, including metric computation,
-    visualization generation, and results storage. It can evaluate models on
-    either the entire dataset or specific users.
+    visualization generation, and results storage.
 
     Attributes:
         cfg (DictConfig): Configuration for the evaluation process
@@ -85,33 +87,26 @@ def __init__(
 
     def evaluate_model(
         self,
-        user_id: Optional[int] = None,
         model: Optional[Any] = None,
     ) -> Dict:
         """
         Evaluate the model and store results.
 
         Args:
-            user_id (Optional[int]): The ID of the user to evaluate. If None, evaluate on the entire dataset.
             model (Optional[Any]): The model to evaluate. If None, will load or train a model.
 
         Returns:
             Dict: Dictionary containing the evaluation results
         """
-        if user_id is not None:
-            dataset = self.real_dataset.create_user_dataset(user_id)
-        else:
-            dataset = self.real_dataset
+        dataset = self.real_dataset
 
         if not model:
             model = self.get_trained_model(dataset)
 
         model.to(self.device)
+        model.eval()
 
-        if user_id is not None:
-            logger.info(f"[Cents] Starting evaluation for user {user_id}")
-        else:
-            logger.info("[Cents] Starting evaluation for all users")
+        logger.info("[Cents] Starting evaluation")
         logger.info("----------------------")
 
         self.run_evaluation(dataset, model)
@@ -120,7 +115,7 @@ def evaluate_model(
             self.save_results()
 
         if self.cfg.get("wandb", {}).get("enabled", False) and wandb.run is not None:
-            wandb.log(self.current_results["metrics"])
+            wandb.log(flatten_log_dict(self.current_results["metrics"]))
 
         return self.current_results
 
@@ -172,7 +167,7 @@ def load_results(self, timestamp: Optional[str] = None) -> Dict:
 
         return {"metrics": metrics, "metadata": metadata}
 
-    def compute_metrics(
+    def compute_quality_metrics(
         self,
         real_data: np.ndarray,
         syn_data: np.ndarray,
@@ -213,8 +208,6 @@ def compute_metrics(
         metrics["Pred_Score"] = pred_score
         logger.info(f"[Cents] Pred Score completed")
 
-        self.current_results["metrics"] = metrics
-
         if mask is not None:
             logger.info("[Cents] Starting Rare-Subset Metrics")
             rare_metrics = {}
@@ -249,6 +242,42 @@ def compute_metrics(
             logger.info("[Cents] Done computing Rare-Subset Metrics.")
             metrics["rare_subset"] = rare_metrics
 
+        self.current_results["metrics"] = metrics
+
+    def compute_disentanglement_metrics(
+        self,
+        context_vars: Dict[str, torch.Tensor],
+        model: Any,
+    ) -> None:
+        """
+        Compute MIG and SAP and merge them into current_results["metrics"].
+
+        Args:
+            context_vars (Dict[str, torch.Tensor]): Dictionary of context variables
+            model (Any): The model to evaluate; its context_module is called
+        """
+        logger.info("[Cents] --- Starting Disentanglement Metrics ---")
+        with torch.no_grad():
+            h, _ = model.context_module(context_vars)  # (N, D)
+
+        emb_np = h.cpu().numpy()
+        ctx_np = {k: v.cpu().numpy() for k, v in context_vars.items()}
+
+        mig, mig_detail = compute_mig(emb_np, ctx_np)
+        sap, sap_detail = compute_sap(emb_np, ctx_np)
+
+        # Create "metrics" on demand so this works without prior quality metrics
+        metrics = self.current_results.setdefault("metrics", {})
+        metrics.setdefault("disentanglement", {}).update(
+            {
+                "MIG": {"mean": mig, **mig_detail},
+                "SAP": {"mean": sap, **sap_detail},
+            }
+        )
+
+        logger.info("[Cents] MIG completed")
+        logger.info("[Cents] SAP completed")
+
     def get_trained_model(self, dataset: Any) -> Any:
         model_dict = {
             "acgan": ACGAN,
@@ -326,6 +355,9 @@ def evaluate_subset(
             ):
                 rare_mask = real_data_subset["is_rare"].values
 
-            self.compute_metrics(
+            self.compute_quality_metrics(
                 real_data_array, syn_data_array, real_data_inv, rare_mask
             )
+
+        if self.cfg.evaluator.eval_disentanglement:
+            self.compute_disentanglement_metrics(context_vars, model)
diff --git a/cents/eval/eval_metrics.py b/cents/eval/eval_metrics.py
@@ -1,11 +1,13 @@
 from functools import partial
-from typing import Tuple
+from typing import Dict, Tuple
 
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import scipy
 from dtaidistance import dtw
+from sklearn.linear_model import Ridge
+from sklearn.metrics import mutual_info_score, r2_score
 
 from cents.eval.eval_utils import (
     gaussian_kernel_matrix,
@@ -167,3 +169,96 @@ def Context_FID(ori_data: np.ndarray, generated_data: np.ndarray) -> float:
     gen_represenation = gen_represenation[idx]
     results = calculate_fid(ori_represenation, gen_represenation)
     return results
+
+
+def compute_mig(
+    embeddings: np.ndarray,
+    context_vars: Dict[str, np.ndarray],
+    n_bins: int = 10,
+) -> Tuple[float, Dict[str, float]]:
+    """
+    Mutual-Information Gap (MIG) with robust binning.
+
+    Args:
+        embeddings : (N, D) float array
+        context_vars : dict[str, (N,) int array]
+        n_bins : number of equal-width bins for each latent dim
+
+    Returns:
+        overall_mig : float, mean of the per-variable gaps (0.0 if none)
+        per_var      : dict[str, float], entropy-normalised gap per variable
+    """
+    N, D = embeddings.shape
+    per_var: Dict[str, float] = {}
+    for name, labels in context_vars.items():
+        # build MI vector over latent dims
+        mi_vec = []
+        for d in range(D):
+            # skip degenerate dimensions
+            if np.allclose(embeddings[:, d], embeddings[0, d]):
+                mi_vec.append(0.0)
+                continue
+            edges = np.histogram_bin_edges(embeddings[:, d], bins=n_bins)
+            codes = np.digitize(embeddings[:, d], bins=edges[1:-1], right=False)
+            mi_vec.append(mutual_info_score(labels, codes))
+        # MI >= 0, so a 0.0 pad keeps the top two intact and covers D < 2
+        mi = np.asarray(mi_vec + [0.0])
+        # if MI is all zeros, MIG is zero
+        if mi.max() == 0.0:
+            per_var[name] = 0.0
+            continue
+
+        top2 = np.sort(mi)[-2:]
+        entropy = mutual_info_score(labels, labels) + 1e-12
+        per_var[name] = float((top2[1] - top2[0]) / entropy)  # (top - second) / H
+
+    overall = float(np.mean(list(per_var.values()))) if per_var else 0.0
+    return overall, per_var
+
+
+def compute_sap(
+    embeddings: np.ndarray,
+    context_vars: Dict[str, np.ndarray],
+    reg_strength: float = 1e-3,
+) -> Tuple[float, Dict[str, float]]:
+    """
+    Compute the Separated Attribute Predictability (SAP) score.
+
+    Args:
+        embeddings : (N, D) float array
+            Latent codes h for N samples and D dimensions.
+
+        context_vars : dict[str, (N,) int array]
+            Mapping of context variable names to discrete labels.
+
+        reg_strength : float, default 1e-3
+            ℓ2-regularisation strength for the ridge regressors that predict
+            the factor labels from *one* latent coordinate at a time.
+
+    Returns:
+        overall_sap : float
+            Mean SAP score across factors (0.0 if there are none).
+
+        per_var : dict[str, float]
+            SAP score for each individual context variable.
+    """
+    N, D = embeddings.shape
+    per_var: Dict[str, float] = {}
+
+    for name, labels in context_vars.items():
+        # Convert labels to a float vector for regression (one-vs-rest works too)
+        y = labels.astype(float)
+        scores = []
+        for d in range(D):
+            # fit 1-D ridge regressor  h_d  ->  y
+            model = Ridge(alpha=reg_strength, fit_intercept=True)
+            model.fit(embeddings[:, [d]], y)
+            y_pred = model.predict(embeddings[:, [d]])
+            scores.append(r2_score(y, y_pred))  # goodness of fit
+        if D < 2:  # no runner-up dimension: count the missing score(s) as 0.0
+            scores += [0.0] * (2 - D)
+        top2 = np.sort(scores)[-2:]  # best & second-best
+        per_var[name] = float(top2[1] - top2[0])  # SAP_i
+
+    overall = float(np.mean(list(per_var.values()))) if per_var else 0.0
+    return overall, per_var
diff --git a/cents/eval/eval_utils.py b/cents/eval/eval_utils.py
@@ -1000,3 +1000,23 @@ def create_visualizations(
 #         fig.tight_layout()
 #         wandb.log({f"ShiftPlot_{j}": wandb.Image(fig)})
 #         plt.close(fig)
+
+
+def flatten_log_dict(d: Dict[str, Any], prefix: str = "") -> Dict[str, float]:
+    """
+    Collapse a nested dictionary of log values into one level of floats.
+
+    Args:
+        d (Dict[str, Any]): Possibly nested dictionary; leaves go through float()
+        prefix (str): Text prepended to every key; nested levels are joined by "/"
+    Returns:
+        Dict[str, float]: Mapping of "/"-joined key paths to float values
+    """
+    result = {}
+    for key, value in d.items():
+        path = f"{prefix}{key}"
+        if not isinstance(value, dict):
+            result[path] = float(value)
+        else:
+            result.update(flatten_log_dict(value, prefix=f"{path}/"))
+    return result
diff --git a/cents/models/normalizer.py b/cents/models/normalizer.py
@@ -151,7 +151,9 @@ def __init__(
         self.dataset = dataset
 
         self.context_vars = list(dataset_cfg.context_vars.keys())
-        self.time_series_cols = dataset_cfg.time_series_columns
+        self.time_series_cols = dataset_cfg.time_series_columns[
+            : dataset_cfg.time_series_dims
+        ]
         self.time_series_dims = dataset_cfg.time_series_dims
         self.do_scale = dataset_cfg.scale
 
diff --git a/cents/trainer.py b/cents/trainer.py
diff --git a/scripts/eval_pretrained.py b/scripts/eval_pretrained.py
diff --git a/scripts/train.py b/scripts/train.py
diff --git a/tutorials/generate_data.ipynb b/tutorials/generate_data.ipynb