Commit dff4a55

improve: optional calculation dtype
1 parent 6784d70 commit dff4a55

File tree: 3 files changed (+128 −40 lines)


posthoc_ema/posthoc_ema.py

Lines changed: 41 additions & 10 deletions
```diff
@@ -3,7 +3,7 @@
 from contextlib import contextmanager
 from copy import deepcopy
 from pathlib import Path
-from typing import Iterator, Optional, Generator
+from typing import Iterator, Optional, Generator, Dict
 
 import torch
 from PIL import Image
@@ -28,6 +28,7 @@ class PostHocEMA:
         update_every: Number of steps between EMA updates
         checkpoint_every: Number of steps between checkpoints
         checkpoint_dtype: Data type for checkpoint storage (if None, uses original parameter dtype)
+        calculation_dtype: Data type for synthesis calculations (default=torch.float32)
         only_save_diff: If True, only save parameters with requires_grad=True
     """
 
@@ -39,6 +40,7 @@ def __init__(
         update_every: int = 10,
         checkpoint_every: int = 1000,
         checkpoint_dtype: Optional[torch.dtype] = None,
+        calculation_dtype: torch.dtype = torch.float32,
         only_save_diff: bool = False,
     ):
         if sigma_rels is None:
@@ -47,6 +49,7 @@ def __init__(
         self.checkpoint_dir = Path(checkpoint_dir)
         self.max_checkpoints = max_checkpoints
         self.checkpoint_dtype = checkpoint_dtype
+        self.calculation_dtype = calculation_dtype
         self.update_every = update_every
         self.checkpoint_every = checkpoint_every
         self.only_save_diff = only_save_diff
@@ -67,6 +70,7 @@ def from_model(
         update_every: int = 10,
         checkpoint_every: int = 1000,
         checkpoint_dtype: Optional[torch.dtype] = None,
+        calculation_dtype: torch.dtype = torch.float32,
         only_save_diff: bool = False,
     ) -> PostHocEMA:
         """
@@ -80,6 +84,7 @@ def from_model(
             update_every: Number of steps between EMA updates
             checkpoint_every: Number of steps between checkpoints
             checkpoint_dtype: Data type for checkpoint storage (if None, uses original parameter dtype)
+            calculation_dtype: Data type for synthesis calculations (default=torch.float32)
             only_save_diff: If True, only save parameters with requires_grad=True
 
         Returns:
@@ -92,6 +97,7 @@ def from_model(
             update_every=update_every,
             checkpoint_every=checkpoint_every,
             checkpoint_dtype=checkpoint_dtype,
+            calculation_dtype=calculation_dtype,
             only_save_diff=only_save_diff,
         )
         instance.checkpoint_dir.mkdir(exist_ok=True, parents=True)
@@ -291,13 +297,16 @@ def model(
         self,
         model: nn.Module,
         sigma_rel: float,
+        *,
+        calculation_dtype: torch.dtype = torch.float32,
     ) -> Iterator[nn.Module]:
         """
         Context manager for temporarily setting model parameters to EMA state.
 
         Args:
             model: Model to temporarily set to EMA state
             sigma_rel: Target relative standard deviation
+            calculation_dtype: Data type for synthesis calculations (default=torch.float32)
 
         Yields:
             nn.Module: Model with EMA parameters
@@ -308,7 +317,9 @@ def model(
             torch.cuda.empty_cache()
 
         try:
-            with self.state_dict(sigma_rel=sigma_rel) as state_dict:
+            with self.state_dict(
+                sigma_rel, calculation_dtype=calculation_dtype
+            ) as state_dict:
                 # Store original state only for parameters that will be modified
                 original_state = {
                     name: param.detach().clone()
@@ -340,14 +351,15 @@ def model(
     def state_dict(
         self,
         sigma_rel: float,
-        step: int | None = None,
-    ) -> Iterator[dict[str, torch.Tensor]]:
+        *,
+        calculation_dtype: torch.dtype = torch.float32,
+    ) -> Iterator[Dict[str, torch.Tensor]]:
         """
         Context manager for getting state dict for synthesized EMA model.
 
         Args:
             sigma_rel: Target relative standard deviation
-            step: Optional specific training step to synthesize for
+            calculation_dtype: Data type for synthesis calculations (default=torch.float32)
 
         Yields:
             dict[str, torch.Tensor]: State dict with synthesized weights
@@ -387,8 +399,8 @@ def state_dict(
         if total_checkpoints == 0:
            raise ValueError("No checkpoints found")
 
-        # Pre-allocate tensors
-        gammas = torch.empty(total_checkpoints, device=device)
+        # Pre-allocate tensors in calculation dtype
+        gammas = torch.empty(total_checkpoints, dtype=calculation_dtype, device=device)
         timesteps = torch.empty(total_checkpoints, dtype=torch.long, device=device)
 
         # Fill tensors one value at a time
@@ -412,15 +424,20 @@ def state_dict(
             del checkpoint  # Free memory immediately
             torch.cuda.empty_cache()
 
-        # Solve for weights
-        weights = solve_weights(gammas, timesteps, gamma)
+        # Solve for weights in calculation dtype
+        weights = solve_weights(
+            gammas,
+            timesteps,
+            gamma,
+            calculation_dtype=calculation_dtype,
+        )
 
         # Free memory for gamma and timestep tensors
         del gammas
         del timesteps
         torch.cuda.empty_cache()
 
-        # Load first checkpoint to get parameter names
+        # Load first checkpoint to get parameter names and original dtypes
         first_checkpoint = torch.load(
             str(checkpoint_files[0]), weights_only=True, map_location="cpu"
         )
@@ -430,6 +447,12 @@ def state_dict(
             if k.startswith("ema_model.")
             and k.replace("ema_model.", "") not in ("initted", "step")
         }
+        # Store original dtypes for each parameter
+        param_dtypes = {
+            name: first_checkpoint[checkpoint_name].dtype
+            for name, checkpoint_name in param_names.items()
+            if isinstance(first_checkpoint[checkpoint_name], torch.Tensor)
+        }
         del first_checkpoint
         torch.cuda.empty_cache()
 
@@ -450,6 +473,9 @@ def state_dict(
                 if not isinstance(param_data, torch.Tensor):
                     continue
 
+                # Convert to calculation dtype for synthesis
+                param_data = param_data.to(calculation_dtype)
+
                 if file_idx == 0:
                     # Initialize parameter with first weighted contribution
                     state_dict[param_name] = param_data.to(device) * weight
@@ -461,6 +487,11 @@ def model(
             del checkpoint
             torch.cuda.empty_cache()
 
+        # Convert back to original dtypes
+        for name, tensor in state_dict.items():
+            if name in param_dtypes:
+                state_dict[name] = tensor.to(param_dtypes[name])
+
         # Free memory
         del weights
         torch.cuda.empty_cache()
```
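
To make the new option concrete, here is a minimal usage sketch based on the signatures in the diff above. The top-level import path, the toy model, and the bare-bones update loop are illustrative assumptions, not part of this commit:

```python
# Sketch only: assumes the package exports PostHocEMA at the top level
# (the class itself lives in posthoc_ema/posthoc_ema.py).
import torch
from posthoc_ema import PostHocEMA

model = torch.nn.Linear(16, 16).to(torch.float16)

posthoc_ema = PostHocEMA.from_model(
    model,
    "posthoc-ema",
    checkpoint_every=5,
    sigma_rels=(0.05,),
    calculation_dtype=torch.float32,  # default dtype used for synthesis math
)

# Toy update loop so that checkpoints exist on disk.
for _ in range(10):
    posthoc_ema.update_(model)

# Synthesize in float64 for extra numerical headroom; the yielded tensors are
# converted back to each parameter's original dtype (float16 here).
with posthoc_ema.state_dict(sigma_rel=0.05, calculation_dtype=torch.float64) as state_dict:
    for name, tensor in state_dict.items():
        print(name, tensor.dtype)  # original dtypes, not float64

# The model() context manager accepts the same keyword and forwards it to state_dict().
with posthoc_ema.model(model, sigma_rel=0.05, calculation_dtype=torch.float64) as ema_model:
    ema_model.eval()
```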

posthoc_ema/utils.py

Lines changed: 32 additions & 30 deletions
```diff
@@ -63,18 +63,19 @@ def sigma_rel_to_gamma(sigma_rel: float) -> float:
     return np.roots([1, 7, 16 - t, 12 - t]).real.max().item()
 
 
-def p_dot_p(t_a: Tensor, gamma_a: Tensor, t_b: Tensor, gamma_b: Tensor) -> Tensor:
-    """
-    Compute dot product between two power function EMA profiles.
+def p_dot_p(
+    t_a: torch.Tensor, gamma_a: torch.Tensor, t_b: torch.Tensor, gamma_b: torch.Tensor
+) -> torch.Tensor:
+    """Compute p_dot_p value for EMA synthesis.
 
     Args:
-        t_a: First timestep tensor
-        gamma_a: First gamma parameter tensor
-        t_b: Second timestep tensor
-        gamma_b: Second gamma parameter tensor
+        t_a: First timestep
+        gamma_a: First gamma value
+        t_b: Second timestep
+        gamma_b: Second gamma value
 
     Returns:
-        Tensor: Dot product between the profiles
+        Tensor: p_dot_p value
     """
     # Handle t=0 case: if both times are 0, ratio is 1
     t_ratio = torch.where(
@@ -99,27 +100,34 @@ def solve_weights(
     gammas: torch.Tensor,
     timesteps: torch.Tensor,
     target_gamma: float,
+    *,
+    calculation_dtype: torch.dtype = torch.float32,
 ) -> torch.Tensor:
-    """
-    Solve for optimal weights to synthesize EMA model with target gamma.
+    """Solve for weights that produce target gamma when applied to gammas.
 
     Args:
-        gammas: Gamma values for each checkpoint
-        timesteps: Timesteps for each checkpoint
+        gammas: Tensor of gamma values
+        timesteps: Tensor of timesteps
         target_gamma: Target gamma value
+        calculation_dtype: Data type for calculations (default=torch.float32)
 
     Returns:
-        torch.Tensor: Optimal weights for each checkpoint
+        Tensor of weights
     """
-    # Convert to float32 for numerical stability
-    gammas = gammas.to(dtype=torch.float32)
-    timesteps = timesteps.to(dtype=torch.float32)
-    target_gamma = torch.tensor(target_gamma, dtype=torch.float32, device=gammas.device)
+    # Convert inputs to calculation dtype
+    gammas = gammas.to(calculation_dtype)
+    timesteps = timesteps.to(calculation_dtype)
+    target_gamma = torch.tensor(
+        target_gamma, dtype=calculation_dtype, device=gammas.device
+    )
+    target_timestep = timesteps[-1]  # Use last timestep as target
 
-    # Compute p_dot_p matrix
-    p_dot_p_matrix = torch.zeros(
-        (len(gammas), len(gammas)), dtype=torch.float32, device=gammas.device
+    # Pre-allocate tensor in calculation dtype
+    p_dot_p_matrix = torch.empty(
+        (len(gammas), len(gammas)), dtype=calculation_dtype, device=gammas.device
     )
+
+    # Compute p_dot_p matrix
     for i in range(len(gammas)):
         for j in range(len(gammas)):
             p_dot_p_matrix[i, j] = p_dot_p(
@@ -129,21 +137,15 @@ def solve_weights(
     # Compute target vector
     target_vector = torch.tensor(
         [
-            p_dot_p(timesteps[i], gammas[i], timesteps[-1], target_gamma)
+            p_dot_p(timesteps[i], gammas[i], target_timestep, target_gamma)
             for i in range(len(gammas))
         ],
-        dtype=torch.float32,
+        dtype=calculation_dtype,
         device=gammas.device,
     )
 
-    # Solve linear system
-    try:
-        weights = torch.linalg.solve(p_dot_p_matrix, target_vector)
-    except RuntimeError:
-        # If matrix is singular, use least squares
-        weights = torch.linalg.lstsq(p_dot_p_matrix, target_vector).solution
-
-    return weights
+    # Solve for weights
+    return torch.linalg.solve(p_dot_p_matrix, target_vector)
 
 
 def _safe_torch_load(path: str | Path, *, map_location=None):
```
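
`solve_weights` assembles the matrix of `p_dot_p` values between the stored EMA profiles and solves the resulting linear system for the mixing weights, now entirely in `calculation_dtype`; note that the least-squares fallback was removed, so `torch.linalg.solve` raises if the matrix is singular. A small sketch of calling it directly, with the import path assumed from the file location in this commit:

```python
# Sketch only: import path assumed from posthoc_ema/utils.py in this commit.
import torch
from posthoc_ema.utils import sigma_rel_to_gamma, solve_weights

# One (gamma, timestep) pair per stored checkpoint: two sigma_rel profiles
# checkpointed at step 1000 and one of them again at step 2000.
gammas = torch.tensor(
    [sigma_rel_to_gamma(0.05), sigma_rel_to_gamma(0.10), sigma_rel_to_gamma(0.05)]
)
timesteps = torch.tensor([1000, 1000, 2000])

# Mixing weights for a target profile, solved entirely in float64.
weights = solve_weights(
    gammas,
    timesteps,
    sigma_rel_to_gamma(0.08),
    calculation_dtype=torch.float64,
)
print(weights.dtype)  # torch.float64 — weights remain in the calculation dtype
```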

tests/test_usage.py

Lines changed: 55 additions & 0 deletions
```diff
@@ -600,3 +600,58 @@ def test_context_manager_with_only_save_diff():
     for file in Path("posthoc-ema").glob("*"):
         file.unlink()
     Path("posthoc-ema").rmdir()
+
+
+def test_calculation_dtype():
+    """Test that synthesis calculations use specified calculation_dtype."""
+    # Create a model with mixed dtypes
+    model = torch.nn.Sequential(
+        torch.nn.Linear(512, 512),  # Default is float32
+        torch.nn.BatchNorm1d(512, track_running_stats=True),
+    )
+
+    # Convert model to float16
+    model = model.to(torch.float16)
+
+    # Create EMA instance
+    posthoc_ema = PostHocEMA.from_model(
+        model,
+        "posthoc-ema",
+        checkpoint_every=5,
+        sigma_rels=(0.05,),
+    )
+
+    # Update model
+    for _ in range(10):
+        with torch.no_grad():
+            model[0].weight.copy_(torch.randn_like(model[0].weight))
+            model[0].bias.copy_(torch.randn_like(model[0].bias))
+        posthoc_ema.update_(model)
+
+    # Test default behavior (float32 calculations, float16 output)
+    with posthoc_ema.state_dict(sigma_rel=0.05) as state_dict:
+        # All parameters should be float16 (original dtype)
+        assert state_dict["0.weight"].dtype == torch.float16
+        assert state_dict["0.bias"].dtype == torch.float16
+        assert state_dict["1.weight"].dtype == torch.float16
+        assert state_dict["1.bias"].dtype == torch.float16
+        assert state_dict["1.running_mean"].dtype == torch.float16
+        assert state_dict["1.running_var"].dtype == torch.float16
+
+    # Test float64 behavior
+    with posthoc_ema.state_dict(
+        sigma_rel=0.05, calculation_dtype=torch.float64
+    ) as state_dict:
+        # All parameters should still be float16 (original dtype)
+        assert state_dict["0.weight"].dtype == torch.float16
+        assert state_dict["0.bias"].dtype == torch.float16
+        assert state_dict["1.weight"].dtype == torch.float16
+        assert state_dict["1.bias"].dtype == torch.float16
+        assert state_dict["1.running_mean"].dtype == torch.float16
+        assert state_dict["1.running_var"].dtype == torch.float16
+
+    # Clean up
+    if Path("posthoc-ema").exists():
+        for file in Path("posthoc-ema").glob("*"):
+            file.unlink()
+        Path("posthoc-ema").rmdir()
```
