Commit 12e0969

fix: synthesize model when only saving diff
1 parent 46b06ea · commit 12e0969

File tree

4 files changed: +159 −124 lines changed


.gitignore

Lines changed: 2 additions & 0 deletions

@@ -127,3 +127,5 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+/test_ema_checkpoint

posthoc_ema/posthoc_ema.py

Lines changed: 25 additions & 25 deletions

@@ -3,7 +3,7 @@
 from contextlib import contextmanager
 from copy import deepcopy
 from pathlib import Path
-from typing import Iterator, Optional
+from typing import Iterator, Optional, Generator
 
 import torch
 from PIL import Image
@@ -285,41 +285,41 @@ def _cleanup_old_checkpoints(self) -> None:
 
     @contextmanager
     def model(
-        self,
-        base_model: nn.Module,
-        sigma_rel: float,
-        step: int | None = None,
-    ) -> Iterator[nn.Module]:
-        """
-        Context manager for using synthesized EMA model.
+        self, model: nn.Module, sigma_rel: float
+    ) -> Generator[nn.Module, None, None]:
+        """Context manager that temporarily sets model parameters to EMA state.
 
         Args:
-            base_model: Model to apply EMA weights to
+            model: Model to update
             sigma_rel: Target relative standard deviation
-            step: Optional specific training step to synthesize for
 
-        Yields:
-            nn.Module: Model with synthesized EMA weights
+        Returns:
+            Model with EMA parameters
         """
-        # Store original device and move base model to CPU
-        original_device = next(base_model.parameters()).device
-        base_model.cpu()
+        # Store original device and move model to CPU
+        original_device = next(model.parameters()).device
+        model.cpu()
         torch.cuda.empty_cache()
 
-        # Get state dict and create EMA model
-        with self.state_dict(sigma_rel=sigma_rel, step=step) as state_dict:
-            ema_model = deepcopy(base_model)
-            ema_model.load_state_dict(state_dict)
-
-        try:
+        try:
+            with self.state_dict(sigma_rel=sigma_rel) as state_dict:
+                ema_model = deepcopy(model)
+                result = ema_model.load_state_dict(
+                    state_dict, strict=not self.only_save_diff
+                )
+                assert (
+                    len(result.unexpected_keys) == 0
+                ), f"Unexpected keys: {result.unexpected_keys}"
+                ema_model.eval()  # Set to eval mode to handle BatchNorm
             yield ema_model
-        finally:
-            # Clean up EMA model and restore base model device
+            # Clean up EMA model
             if hasattr(ema_model, "cuda"):
                 ema_model.cpu()
             del ema_model
-            base_model.to(original_device)
-            torch.cuda.empty_cache()
+        finally:
+            # Restore model to original device
+            model.to(original_device)
+            torch.cuda.empty_cache()
 
     @contextmanager
     def state_dict(
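
What changed here: when the EMA instance was constructed with only_save_diff=True, checkpoints contain only the parameters that differ, so the synthesized state dict is now loaded with strict=not self.only_save_diff, and only unexpected keys (keys present in the checkpoint but absent from the model) are treated as errors. A minimal usage sketch; the model(...) signature and only_save_diff come from this diff, while the class name, from_model factory, and update_ hook are assumptions about the surrounding API:

import torch
import torch.nn as nn

from posthoc_ema import PostHocEMA  # assumed export

net = nn.Linear(16, 16)
# Hypothetical factory; "test_ema_checkpoint" matches the directory
# ignored by the .gitignore change above.
ema = PostHocEMA.from_model(net, "test_ema_checkpoint", only_save_diff=True)

for _ in range(100):
    ...  # optimizer step on net
    ema.update_(net)  # hypothetical per-step update hook

# Synthesize EMA weights for a target sigma_rel. With only_save_diff=True,
# missing keys are tolerated during load_state_dict; this commit fixes the
# previously strict load.
with ema.model(net, sigma_rel=0.15) as ema_model:
    out = ema_model(torch.randn(1, 16))
# net is restored to its original device when the context exits.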

posthoc_ema/utils.py

Lines changed: 10 additions & 26 deletions

@@ -80,34 +80,18 @@ def p_dot_p(t_a: Tensor, gamma_a: Tensor, t_b: Tensor, gamma_b: Tensor) -> Tensor:
     t_ratio = torch.where(
         (t_a == 0) & (t_b == 0),
         torch.ones_like(t_a),
-        t_a / torch.where(t_b == 0, torch.ones_like(t_b), t_b)
+        t_a / torch.where(t_b == 0, torch.ones_like(t_b), t_b),
     )
-
+
     t_exp = torch.where(t_a < t_b, gamma_b, -gamma_a)
     t_max = torch.maximum(t_a, t_b)
-
+
     # Handle t=0 case: if both times are 0, max is 1
-    t_max = torch.where(
-        (t_a == 0) & (t_b == 0),
-        torch.ones_like(t_max),
-        t_max
-    )
-
-    # Print debug info for first few values
-    if t_a.shape[0] < 10:  # Only print for small tensors
-        print(f"\nt_ratio shape: {t_ratio.shape}")
-        print(f"t_ratio first few: {t_ratio[:5, :5]}")
-        print(f"t_exp first few: {t_exp[:5, :5]}")
-        print(f"t_max first few: {t_max[:5, :5]}")
-
+    t_max = torch.where((t_a == 0) & (t_b == 0), torch.ones_like(t_max), t_max)
+
     num = (gamma_a + 1) * (gamma_b + 1) * t_ratio**t_exp
     den = (gamma_a + gamma_b + 1) * t_max
-
-    if t_a.shape[0] < 10:  # Only print for small tensors
-        print(f"num first few: {num[:5, :5]}")
-        print(f"den first few: {den[:5, :5]}")
-        print(f"result first few: {(num/den)[:5, :5]}")
-
+
     return num / den
 
 
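
For context, p_dot_p evaluates the closed-form inner product of two power-function EMA profiles used for post-hoc EMA reconstruction (Karras et al., 2024, "Analyzing and Improving the Training Dynamics of Diffusion Models"). Reading the formula back from the code that remains, with \gamma_* = \gamma_b when t_a < t_b and \gamma_a otherwise:

    \langle p_a, p_b \rangle
        = \frac{(\gamma_a + 1)(\gamma_b + 1)}
               {(\gamma_a + \gamma_b + 1)\,\max(t_a, t_b)}
          \left( \frac{\min(t_a, t_b)}{\max(t_a, t_b)} \right)^{\gamma_*}

The torch.where guards only keep the t_a = t_b = 0 case well-defined by forcing both the ratio and the max to 1.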

@@ -127,13 +111,13 @@ def solve_weights(t_i: Tensor, gamma_i: Tensor, t_r: Tensor, gamma_r: Tensor) -> Tensor:
     # Reshape tensors for matrix operations
     rv = lambda x: x.reshape(-1, 1)  # Column vector
     cv = lambda x: x.reshape(1, -1)  # Row vector
-
+
     # Compute matrices A and b using p_dot_p
     A = p_dot_p(rv(t_i), rv(gamma_i), cv(t_i), cv(gamma_i))
     b = p_dot_p(rv(t_i), rv(gamma_i), cv(t_r), cv(gamma_r))
-
+
     # Solve linear system
-    return torch.linalg.solve(A, b)
+    return torch.linalg.solve(A, b)
 
 
 def _safe_torch_load(path: str | Path, *, map_location=None):
@@ -142,4 +126,4 @@ def _safe_torch_load(path: str | Path, *, map_location=None):
         return torch.load(path, map_location=map_location, weights_only=True)
     except TypeError:
         # Older PyTorch versions don't support weights_only
-        return torch.load(path, map_location=map_location)
+        return torch.load(path, map_location=map_location)
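
To make the linear system concrete: solve_weights builds A_ij = <p_i, p_j> between the stored snapshot profiles and b_i = <p_i, p_r> against the target profile, then solves A x = b for the per-snapshot mixing weights. A hypothetical call (tensor values are illustrative, not from this repo):

import torch

from posthoc_ema.utils import solve_weights

# Three stored EMA snapshots taken at steps t_i with profile exponents
# gamma_i, and one target profile (t_r, gamma_r) to reconstruct.
t_i = torch.tensor([1024.0, 2048.0, 3072.0], dtype=torch.float64)
gamma_i = torch.tensor([6.94, 6.94, 16.97], dtype=torch.float64)
t_r = torch.tensor([3072.0], dtype=torch.float64)
gamma_r = torch.tensor([10.0], dtype=torch.float64)

weights = solve_weights(t_i, gamma_i, t_r, gamma_r)  # shape (3, 1)
# The synthesized EMA state is then sum_i weights[i] * snapshot_i,
# applied parameter by parameter.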
