
Commit 6784d70

fix: decrease peak ram usage
1 parent 485021c

2 files changed: +172 −86 lines


posthoc_ema/posthoc_ema.py

Lines changed: 111 additions & 65 deletions
@@ -8,6 +8,9 @@
 import torch
 from PIL import Image
 from torch import nn
+import pickle
+import io
+import torch.serialization
 
 from .karras_ema import KarrasEMA
 from .utils import _safe_torch_load, p_dot_p, sigma_rel_to_gamma, solve_weights
@@ -285,37 +288,49 @@ def _cleanup_old_checkpoints(self) -> None:
 
     @contextmanager
     def model(
-        self, model: nn.Module, sigma_rel: float
-    ) -> Generator[nn.Module, None, None]:
-        """Context manager that temporarily sets model parameters to EMA state.
+        self,
+        model: nn.Module,
+        sigma_rel: float,
+    ) -> Iterator[nn.Module]:
+        """
+        Context manager for temporarily setting model parameters to EMA state.
 
         Args:
-            model: Model to update
+            model: Model to temporarily set to EMA state
             sigma_rel: Target relative standard deviation
 
-        Returns:
-            Model with EMA parameters
+        Yields:
+            nn.Module: Model with EMA parameters
         """
-        # Store original device and move model to CPU
+        # Move model to CPU for memory efficiency
         original_device = next(model.parameters()).device
         model.cpu()
         torch.cuda.empty_cache()
 
         try:
             with self.state_dict(sigma_rel=sigma_rel) as state_dict:
-                ema_model = deepcopy(model)
-                result = ema_model.load_state_dict(
+                # Store original state only for parameters that will be modified
+                original_state = {
+                    name: param.detach().clone()
+                    for name, param in model.state_dict().items()
+                    if name in state_dict
+                }
+
+                # Load EMA state directly into model
+                result = model.load_state_dict(
                     state_dict, strict=not self.only_save_diff
                 )
                 assert (
                     len(result.unexpected_keys) == 0
                 ), f"Unexpected keys: {result.unexpected_keys}"
-                ema_model.eval()  # Set to eval mode to handle BatchNorm
-                yield ema_model
-                # Clean up EMA model
-                if hasattr(ema_model, "cuda"):
-                    ema_model.cpu()
-                del ema_model
+                model.eval()  # Set to eval mode to handle BatchNorm
+                yield model
+
+                # Restore original state
+                model.load_state_dict(original_state, strict=False)
+                del original_state
+                del state_dict  # Free memory for state dict
+                torch.cuda.empty_cache()
         finally:
             # Restore model to original device
             model.to(original_device)
@@ -341,10 +356,18 @@ def state_dict(
         gamma = sigma_rel_to_gamma(sigma_rel)
         device = torch.device("cpu")  # Keep synthesis on CPU for memory efficiency
 
-        # Get all checkpoint files
+        # First count total checkpoints to pre-allocate tensors
+        total_checkpoints = 0
+        checkpoint_files = []
         if self.ema_models is not None:
             # When we have ema_models, use their indices
-            indices = range(len(self.ema_models))
+            for idx in range(len(self.ema_models)):
+                files = sorted(
+                    self.checkpoint_dir.glob(f"{idx}.*.pt"),
+                    key=lambda p: int(p.stem.split(".")[1]),
+                )
+                total_checkpoints += len(files)
+                checkpoint_files.extend(files)
         else:
             # When loading from path, find all unique indices
             indices = set()
@@ -353,78 +376,101 @@ def state_dict(
                 indices.add(idx)
             indices = sorted(indices)
 
-        # Get checkpoint files and info
-        checkpoint_files = []
-        gammas = []
-        timesteps = []
-        for idx in indices:
-            files = sorted(
-                self.checkpoint_dir.glob(f"{idx}.*.pt"),
-                key=lambda p: int(p.stem.split(".")[1]),
-            )
-            for file in files:
-                _, timestep = map(int, file.stem.split("."))
-                if self.ema_models is not None:
-                    gammas.append(self.gammas[idx])
-                else:
-                    # Load gamma from checkpoint
-                    checkpoint = _safe_torch_load(str(file))
-                    sigma_rel = checkpoint.get("sigma_rel", None)
-                    if sigma_rel is not None:
-                        gammas.append(sigma_rel_to_gamma(sigma_rel))
-                    else:
-                        gammas.append(self.gammas[idx])
-                    del checkpoint  # Free memory
-                timesteps.append(timestep)
-                checkpoint_files.append(file)
-
-        if not gammas:
+            for idx in indices:
+                files = sorted(
+                    self.checkpoint_dir.glob(f"{idx}.*.pt"),
+                    key=lambda p: int(p.stem.split(".")[1]),
+                )
+                total_checkpoints += len(files)
+                checkpoint_files.extend(files)
+
+        if total_checkpoints == 0:
             raise ValueError("No checkpoints found")
 
-        # Convert to tensors
-        gammas = torch.tensor(gammas, device=device)
-        timesteps = torch.tensor(timesteps, device=device)
+        # Pre-allocate tensors
+        gammas = torch.empty(total_checkpoints, device=device)
+        timesteps = torch.empty(total_checkpoints, dtype=torch.long, device=device)
+
+        # Fill tensors one value at a time
+        for i, file in enumerate(checkpoint_files):
+            idx = int(file.stem.split(".")[0])
+            timestep = int(file.stem.split(".")[1])
+            timesteps[i] = timestep
+
+            if self.ema_models is not None:
+                gammas[i] = self.gammas[idx]
+            else:
+                # Load gamma from checkpoint
+                checkpoint = torch.load(
+                    str(file), weights_only=True, map_location="cpu"
+                )
+                sigma_rel = checkpoint.get("sigma_rel", None)
+                if sigma_rel is not None:
+                    gammas[i] = sigma_rel_to_gamma(sigma_rel)
+                else:
+                    gammas[i] = self.gammas[idx]
+                del checkpoint  # Free memory immediately
+                torch.cuda.empty_cache()
 
         # Solve for weights
         weights = solve_weights(gammas, timesteps, gamma)
 
-        # Load first checkpoint to get state dict structure
-        first_checkpoint = _safe_torch_load(str(checkpoint_files[0]))
-        state_dict = {}
+        # Free memory for gamma and timestep tensors
+        del gammas
+        del timesteps
+        torch.cuda.empty_cache()
 
-        # Get parameter names from first checkpoint
+        # Load first checkpoint to get parameter names
+        first_checkpoint = torch.load(
+            str(checkpoint_files[0]), weights_only=True, map_location="cpu"
+        )
         param_names = {
             k.replace("ema_model.", ""): k
             for k in first_checkpoint.keys()
             if k.startswith("ema_model.")
             and k.replace("ema_model.", "") not in ("initted", "step")
         }
+        del first_checkpoint
+        torch.cuda.empty_cache()
 
-        # Process one parameter at a time
-        for param_name, checkpoint_name in param_names.items():
-            param = first_checkpoint[checkpoint_name]
-            if not isinstance(param, torch.Tensor):
-                continue
+        # Initialize state dict with empty tensors
+        state_dict = {}
+
+        # Process one checkpoint at a time
+        for file_idx, (file, weight) in enumerate(zip(checkpoint_files, weights)):
+            # Load checkpoint
+            checkpoint = torch.load(str(file), weights_only=True, map_location="cpu")
 
-            # Initialize with first weighted contribution
-            state_dict[param_name] = param.to(device) * weights[0]
+            # Process all parameters from this checkpoint
+            for param_name, checkpoint_name in param_names.items():
+                if checkpoint_name not in checkpoint:
+                    continue
 
-            # Add remaining weighted contributions
-            for file, weight in zip(checkpoint_files[1:], weights[1:]):
-                checkpoint = _safe_torch_load(str(file))
-                param = checkpoint[checkpoint_name]
-                if isinstance(param, torch.Tensor):
-                    state_dict[param_name].add_(param.to(device) * weight)
-                del checkpoint  # Free memory
+                param_data = checkpoint[checkpoint_name]
+                if not isinstance(param_data, torch.Tensor):
+                    continue
+
+                if file_idx == 0:
+                    # Initialize parameter with first weighted contribution
+                    state_dict[param_name] = param_data.to(device) * weight
+                else:
+                    # Add weighted contribution to existing parameter
+                    state_dict[param_name].add_(param_data.to(device) * weight)
+
+            # Free memory for this checkpoint
+            del checkpoint
+            torch.cuda.empty_cache()
 
         # Free memory
-        del first_checkpoint
+        del weights
+        torch.cuda.empty_cache()
 
         try:
             yield state_dict
         finally:
             # Clean up
             del state_dict
+            torch.cuda.empty_cache()
 
     def _solve_weights(
         self,
tests/test_vram_usage.py

Lines changed: 61 additions & 21 deletions
@@ -1,4 +1,5 @@
 from pathlib import Path
+import time
 
 import psutil
 import torch
@@ -73,25 +74,21 @@ def reset_peak_ram():
     _peak_ram_usage = get_ram_usage()
 
 
-def monitor_operation(operation, interval=0.001):
+def monitor_operation(operation):
     """Monitor RAM usage during an operation."""
-    import threading
-    import time
-
-    stop_monitoring = threading.Event()
-
-    def monitor():
-        while not stop_monitoring.is_set():
-            get_ram_usage()  # This will update peak RAM
-            time.sleep(interval)
-
-    monitor_thread = threading.Thread(target=monitor)
-    monitor_thread.start()
-    try:
-        result = operation()
-    finally:
-        stop_monitoring.set()
-        monitor_thread.join()
+    # Get RAM usage before operation
+    pre_ram = get_ram_usage()
+
+    # Run operation
+    result = operation()
+
+    # Get RAM usage after operation
+    post_ram = get_ram_usage()
+
+    # Update peak RAM if needed
+    global _peak_ram_usage
+    _peak_ram_usage = max(_peak_ram_usage, pre_ram, post_ram)
+
     return result
@@ -352,25 +349,68 @@ def test_synthesis_memory_usage():
 
     # Monitor synthesis memory usage
     print("\nStarting synthesis...")
+
+    # First test: model context manager
+    print("\nTesting model context manager synthesis:")
     pre_synthesis_ram = get_ram_usage()
     reset_peak_ram()
 
-    def synthesize():
+    def synthesize_with_model():
         with posthoc_ema.model(model, sigma_rel=0.15) as ema_model:
             # Force a full synthesis by accessing parameters
             for param in ema_model.parameters():
                 _ = param.shape
 
-    monitor_operation(synthesize)
+    monitor_operation(synthesize_with_model)
+
+    post_synthesis_ram = get_ram_usage()
+    peak_synthesis_ram = get_peak_ram_usage()
+    print(f"Pre-synthesis RAM: {pre_synthesis_ram:.2f}MB")
+    print(f"Post-synthesis RAM: {post_synthesis_ram:.2f}MB")
+    print(f"Peak synthesis RAM: {peak_synthesis_ram:.2f}MB")
+    print(f"RAM increase: {post_synthesis_ram - pre_synthesis_ram:.2f}MB")
+    print(f"RAM spike: {peak_synthesis_ram - pre_synthesis_ram:.2f}MB")
+    print(f"Relative peak RAM: {(peak_synthesis_ram/pre_synthesis_ram)*100:.1f}%")
+
+    # Assert RAM usage is within limits
+    assert peak_synthesis_ram <= pre_synthesis_ram * 2.5, (
+        f"Synthesis caused excessive RAM usage. "
+        f"Peak RAM ({peak_synthesis_ram:.2f}MB) was more than 2.5x "
+        f"pre-synthesis RAM ({pre_synthesis_ram:.2f}MB)"
+    )
+
+    # Second test: state_dict synthesis
+    print("\nTesting state_dict synthesis:")
+    pre_synthesis_ram = get_ram_usage()
+    reset_peak_ram()
+
+    synthesis_start = time.perf_counter()
+
+    def synthesize_with_state_dict():
+        with posthoc_ema.state_dict(sigma_rel=0.15) as ema_state_dict:
+            # Force processing by accessing dict
+            for param in ema_state_dict.values():
+                _ = param.shape
+
+    monitor_operation(synthesize_with_state_dict)
+    synthesis_end = time.perf_counter()
 
     post_synthesis_ram = get_ram_usage()
     peak_synthesis_ram = get_peak_ram_usage()
-    print(f"\nSynthesis memory usage:")
     print(f"Pre-synthesis RAM: {pre_synthesis_ram:.2f}MB")
     print(f"Post-synthesis RAM: {post_synthesis_ram:.2f}MB")
     print(f"Peak synthesis RAM: {peak_synthesis_ram:.2f}MB")
     print(f"RAM increase: {post_synthesis_ram - pre_synthesis_ram:.2f}MB")
     print(f"RAM spike: {peak_synthesis_ram - pre_synthesis_ram:.2f}MB")
+    print(f"Relative peak RAM: {(peak_synthesis_ram/pre_synthesis_ram)*100:.1f}%")
+    print(f"Synthesis time: {synthesis_end - synthesis_start:.2f} seconds")
+
+    # Assert RAM usage is within limits
+    assert peak_synthesis_ram <= pre_synthesis_ram * 2.5, (
+        f"Synthesis caused excessive RAM usage. "
+        f"Peak RAM ({peak_synthesis_ram:.2f}MB) was more than 2.5x "
+        f"pre-synthesis RAM ({pre_synthesis_ram:.2f}MB)"
+    )
 
     # Cleanup
     model.cpu()
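
The assertions above compare the observed peak resident set size against 2.5x the pre-synthesis baseline. The helpers get_ram_usage, get_peak_ram_usage, and reset_peak_ram are defined earlier in this test file; a plausible psutil-based sketch of them (details may differ from the actual helpers) is:

# Hedged sketch of psutil-based RSS helpers like the ones this test relies on;
# the real get_ram_usage / get_peak_ram_usage in tests/test_vram_usage.py may
# differ in detail.
import os

import psutil

_peak_ram_usage = 0.0


def get_ram_usage() -> float:
    """Return the current resident set size of this process in MB, tracking the peak."""
    global _peak_ram_usage
    rss_mb = psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024)
    _peak_ram_usage = max(_peak_ram_usage, rss_mb)
    return rss_mb


def get_peak_ram_usage() -> float:
    """Return the highest RSS value observed so far, in MB."""
    return _peak_ram_usage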
