Commit 41160c6 (1 parent: 43592fa)

improve: support larger sigma rel, raise if loading incorrectly, and more

10 files changed: +1248 −84 lines

notebooks/visualize_error.py

Lines changed: 3 additions & 3 deletions

@@ -3,8 +3,8 @@
 import posthoc_ema.visualization
 
 posthoc_ema.visualization.reconstruction_error(
-    sigma_rels=(0.15, 0.28),
-    target_sigma_rel_range=(0.1, 0.3),
-    max_checkpoints=50,
+    sigma_rels=(0.15, 0.5),
+    target_sigma_rel_range=(0.05, 0.5),
+    max_checkpoints=20,
 )
 # %%
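Note on "support larger sigma rel": in the Karras et al. (2024) post-hoc EMA formulation, sigma_rel and gamma are tied by sigma_rel^2 = (gamma + 1) / ((gamma + 2)^2 * (gamma + 3)), and for gamma >= 0 that relation tops out at sigma_rel = 1/sqrt(12) ≈ 0.289, which is why values like 0.5 above need extra handling. The sketch below shows only the standard conversion over the classical range; the function name is illustrative rather than this package's API, and how this commit handles sigma_rel beyond ~0.289 is not visible in these hunks.

# Illustrative only: classical sigma_rel -> gamma conversion, i.e. the relevant real root of
#     gamma^3 + 7*gamma^2 + (16 - t)*gamma + (12 - t) = 0,  where t = sigma_rel**-2.
import numpy as np

def sigma_rel_to_gamma(sigma_rel: float) -> float:
    t = sigma_rel ** -2
    return float(np.roots([1.0, 7.0, 16.0 - t, 12.0 - t]).real.max())

print(round(sigma_rel_to_gamma(0.05), 2))  # ~16.97
print(round(sigma_rel_to_gamma(0.15), 2))  # ~3.55
print(round(sigma_rel_to_gamma(0.28), 2))  # ~0.17, near the gamma >= 0 ceiling of sigma_rel ≈ 0.289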

posthoc_ema/karras_ema.py

Lines changed: 54 additions & 23 deletions

@@ -43,18 +43,18 @@ def inplace_lerp(tgt: Tensor, src: Tensor, weight):
 
 class KarrasEMA(Module):
     """
-    Exponential Moving Average module using hyperparameters from the Karras et al. paper.
+    Karras EMA implementation with power function decay profile.
 
     Args:
-        model: The model to create an EMA of
-        sigma_rel: Relative standard deviation for EMA profile width
-        gamma: Direct gamma parameter (alternative to sigma_rel)
+        model: Model to create EMA of
+        sigma_rel: Relative standard deviation for EMA profile
+        gamma: Alternative parameterization via gamma (don't specify both)
         ema_model: Optional pre-initialized EMA model
         update_every: Number of steps between EMA updates
-        frozen: If True, EMA weights are not updated
-        param_or_buffer_names_no_ema: Set of parameter/buffer names to exclude from EMA
-        ignore_names: Set of names to ignore
-        ignore_startswith_names: Set of name prefixes to ignore
+        frozen: Whether to freeze EMA updates
+        param_or_buffer_names_no_ema: Parameter/buffer names to exclude from EMA
+        ignore_names: Parameter/buffer names to ignore
+        ignore_startswith_names: Parameter/buffer name prefixes to ignore
         only_save_diff: If True, only save parameters with requires_grad=True
     """
 
@@ -111,12 +111,11 @@ def __init__(
         # Move model back to original device
         model.to(original_device)
 
-        # Get parameter names that require gradients
+        # Get parameter names for floating point or complex parameters
         self.param_names = {
             name
             for name, param in self.ema_model.named_parameters()
-            if (not only_save_diff or param.requires_grad)
-            and (torch.is_floating_point(param) or torch.is_complex(param))
+            if torch.is_floating_point(param) or torch.is_complex(param)
         }
 
         # Get buffer names for floating point or complex buffers
@@ -161,17 +160,27 @@ def update(self):
 
     def copy_params_from_model_to_ema(self):
         """Copy parameters from online model to EMA model."""
+        # Copy parameters
         for (name, ma_params), (_, current_params) in zip(
             self.get_params_iter(self.ema_model),
             self.get_params_iter(self.online_model[0]),
         ):
             if self._should_update_param(name):
                 inplace_copy(ma_params.data, current_params.data)
 
+        # Copy buffers
+        for (name, ma_buffer), (_, current_buffer) in zip(
+            self.get_buffers_iter(self.ema_model),
+            self.get_buffers_iter(self.online_model[0]),
+        ):
+            if self._should_update_param(name):
+                inplace_copy(ma_buffer.data, current_buffer.data)
+
     def update_moving_average(self):
         """Update EMA weights using current beta value."""
         current_decay = self.beta
 
+        # Update parameters
         for (name, current_params), (_, ma_params) in zip(
             self.get_params_iter(self.online_model[0]),
             self.get_params_iter(self.ema_model),
@@ -180,6 +189,15 @@ def update_moving_average(self):
                 continue
             inplace_lerp(ma_params.data, current_params.data, 1.0 - current_decay)
 
+        # Update buffers
+        for (name, current_buffer), (_, ma_buffer) in zip(
+            self.get_buffers_iter(self.online_model[0]),
+            self.get_buffers_iter(self.ema_model),
+        ):
+            if not self._should_update_param(name):
+                continue
+            inplace_lerp(ma_buffer.data, current_buffer.data, 1.0 - current_decay)
+
     def _should_update_param(self, name: str) -> bool:
         """Check if parameter should be updated based on ignore rules."""
         if name in self.ignore_names:
@@ -195,8 +213,19 @@ def get_params_iter(self, model):
         for name, param in model.named_parameters():
             if name not in self.param_names:
                 continue
+            if self.only_save_diff and not param.requires_grad:
+                continue
             yield name, param
 
+    def get_buffers_iter(self, model):
+        """Get iterator over model's buffers."""
+        for name, buffer in model.named_buffers():
+            if name not in self.buffer_names:
+                continue
+            if self.only_save_diff and not buffer.requires_grad:
+                continue
+            yield name, buffer
+
     def iter_all_ema_params_and_buffers(self):
         """Get iterator over all EMA parameters and buffers."""
         for name, param in self.ema_model.named_parameters():
@@ -250,24 +279,26 @@ def __call__(self, *args, **kwargs):
         return self.ema_model(*args, **kwargs)
 
     def state_dict(self):
-        """Get state dict of EMA model."""
+        """Get state dict for EMA model."""
         state_dict = {}
 
-        # Add parameters based on only_save_diff flag
+        # Save parameters based on only_save_diff flag
         for name, param in self.ema_model.named_parameters():
-            if (not self.only_save_diff or param.requires_grad) and (
-                torch.is_floating_point(param) or torch.is_complex(param)
-            ):
-                state_dict[name] = param
+            if name not in self.param_names:
+                continue
+            if self.only_save_diff and not param.requires_grad:
+                continue
+            state_dict[name] = param.data
 
-        # Add buffers (always included regardless of only_save_diff)
+        # Save buffers
         for name, buffer in self.ema_model.named_buffers():
-            if torch.is_floating_point(buffer) or torch.is_complex(buffer):
-                state_dict[name] = buffer
+            if name not in self.buffer_names:
+                continue
+            state_dict[name] = buffer.data
 
-        # Add internal state
-        state_dict["initted"] = self.initted
-        state_dict["step"] = self.step
+        # Save internal state
+        state_dict["initted"] = self.initted.data
+        state_dict["step"] = self.step.data
 
         return state_dict
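With buffers now flowing through copy_params_from_model_to_ema, update_moving_average, and state_dict, the EMA tracks things like BatchNorm running statistics rather than parameters only. A minimal usage sketch, assuming the constructor accepts the keyword arguments listed in the docstring above and that update() advances the EMA by one step:

import torch
from torch import nn
from posthoc_ema.karras_ema import KarrasEMA

model = nn.Sequential(nn.Linear(4, 4), nn.BatchNorm1d(4))
ema = KarrasEMA(model, sigma_rel=0.15, update_every=1)

model(torch.randn(8, 4))  # training-mode forward pass updates running_mean/running_var (buffers)
ema.update()

state = ema.state_dict()
# Buffer keys (e.g. "1.running_mean") now appear next to parameter keys,
# along with the internal "initted" and "step" tensors.
print(sorted(state.keys()))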

posthoc_ema/posthoc_ema.py

Lines changed: 53 additions & 30 deletions

@@ -89,7 +89,19 @@ def from_model(
 
         Returns:
             PostHocEMA: Instance ready for training
+
+        Raises:
+            ValueError: If checkpoint directory already exists and contains checkpoints
         """
+        checkpoint_dir = Path(checkpoint_dir)
+        if checkpoint_dir.exists():
+            checkpoints = list(checkpoint_dir.glob("*.pt"))
+            if checkpoints:
+                raise ValueError(
+                    f"Checkpoint directory {checkpoint_dir} already contains checkpoints. "
+                    "Use from_path() to load existing checkpoints instead of from_model()."
+                )
+
         instance = cls(
             checkpoint_dir=checkpoint_dir,
             max_checkpoints=max_checkpoints,
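Usage-wise, the new guard changes how a training script starts versus resumes. A hedged sketch of the intent (the import path and the exact argument order of from_model()/from_path() are assumptions, not shown in this diff):

from torch import nn
from posthoc_ema import PostHocEMA  # import path assumed

model = nn.Linear(4, 4)
try:
    # Fresh run: the directory must not already contain *.pt checkpoints.
    ema = PostHocEMA.from_model(model, checkpoint_dir="posthoc-ema-checkpoints")
except ValueError:
    # The directory already holds checkpoints from an earlier run;
    # load those explicitly instead of silently mixing two runs.
    ema = PostHocEMA.from_path("posthoc-ema-checkpoints", model)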
@@ -242,32 +254,35 @@ def _create_checkpoint(self) -> None:
         # Create checkpoint file
         checkpoint_file = self.checkpoint_dir / f"{idx}.{self.step}.pt"
 
-        # Get parameter and buffer names
-        param_names = {
-            name for name, param in ema_model.ema_model.named_parameters()
-        }
+        # Get state dict from EMA model
+        state_dict = ema_model.state_dict()
+
+        # Filter parameters based on only_save_diff
         if self.only_save_diff:
-            param_names = {
-                name
-                for name in param_names
-                if ema_model.ema_model.get_parameter(name).requires_grad
+            filtered_state_dict = {}
+            for name, param in ema_model.ema_model.named_parameters():
+                if param.requires_grad:
+                    key = name
+                    if key in state_dict:
+                        filtered_state_dict[key] = state_dict[key]
+            # Add buffers and internal state
+            for name, buffer in ema_model.ema_model.named_buffers():
+                key = name
+                if key in state_dict:
+                    filtered_state_dict[key] = state_dict[key]
+            for key in ["initted", "step"]:
+                if key in state_dict:
+                    filtered_state_dict[key] = state_dict[key]
+            state_dict = filtered_state_dict
+
+        # Convert to checkpoint dtype if specified
+        if self.checkpoint_dtype is not None:
+            state_dict = {
+                k: v.to(self.checkpoint_dtype) if isinstance(v, torch.Tensor) else v
+                for k, v in state_dict.items()
             }
-        buffer_names = {name for name, _ in ema_model.ema_model.named_buffers()}
-
-        # Save EMA model state with correct dtype and ema_model prefix
-        state_dict = {
-            f"ema_model.{k}": (
-                v.to(self.checkpoint_dtype)
-                if self.checkpoint_dtype is not None
-                else v
-            )
-            for k, v in ema_model.state_dict().items()
-            if (
-                k in param_names  # Include parameters based on only_save_diff
-                or k in buffer_names  # Include all buffers
-                or k in ("initted", "step")  # Include internal state
-            )
-        }
+
+        # Save checkpoint
         torch.save(state_dict, checkpoint_file)
 
         # Remove old checkpoints if needed
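Because _create_checkpoint() now serializes ema_model.state_dict() directly, the on-disk keys lose the old "ema_model." prefix; the loading code further down in state_dict() changes to match. A quick way to see the difference on an existing checkpoint (the path is illustrative, following the "{idx}.{step}.pt" naming above):

import torch

ckpt = torch.load("posthoc-ema-checkpoints/0.100.pt", weights_only=True, map_location="cpu")
print(sorted(ckpt.keys()))
# before this commit: ["ema_model.bias", "ema_model.initted", "ema_model.step", "ema_model.weight", ...]
# after this commit:  ["bias", "initted", "step", "weight", ...]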
@@ -401,13 +416,15 @@ def state_dict(
 
         # Pre-allocate tensors in calculation dtype
         gammas = torch.empty(total_checkpoints, dtype=calculation_dtype, device=device)
-        timesteps = torch.empty(total_checkpoints, dtype=torch.long, device=device)
+        timesteps = torch.empty(
+            total_checkpoints, dtype=calculation_dtype, device=device
+        )
 
         # Fill tensors one value at a time
         for i, file in enumerate(checkpoint_files):
             idx = int(file.stem.split(".")[0])
             timestep = int(file.stem.split(".")[1])
-            timesteps[i] = timestep
+            timesteps[i] = float(timestep)  # Convert to float
 
             if self.ema_models is not None:
                 gammas[i] = self.gammas[idx]
@@ -430,6 +447,7 @@ def state_dict(
             timesteps,
             gamma,
             calculation_dtype=calculation_dtype,
+            target_sigma_rel=sigma_rel,
         )
 
         # Free memory for gamma and timestep tensors
@@ -442,10 +460,7 @@ def state_dict(
             str(checkpoint_files[0]), weights_only=True, map_location="cpu"
         )
         param_names = {
-            k.replace("ema_model.", ""): k
-            for k in first_checkpoint.keys()
-            if k.startswith("ema_model.")
-            and k.replace("ema_model.", "") not in ("initted", "step")
+            k: k for k in first_checkpoint.keys() if k not in ("initted", "step")
        }
         # Store original dtypes for each parameter
         param_dtypes = {
@@ -467,6 +482,14 @@ def state_dict(
            # Process all parameters from this checkpoint
            for param_name, checkpoint_name in param_names.items():
                if checkpoint_name not in checkpoint:
+                    # If parameter is missing from checkpoint but we're not in only_save_diff mode,
+                    # or if it's a parameter with requires_grad=True, this is an error
+                    if not self.only_save_diff:
+                        raise ValueError(
+                            f"Parameter {param_name} missing from checkpoint {file} "
+                            "but only_save_diff=False"
+                        )
+                    # Skip parameters that are intentionally not saved in only_save_diff mode
                    continue
 
                param_data = checkpoint[checkpoint_name]
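Together with the from_model() guard, this is the "raise if loading incorrectly" part of the commit: a checkpoint missing keys that the first checkpoint has (typically because it was written with a different only_save_diff setting) now fails loudly instead of being skipped. A standalone sketch of the same consistency check, with illustrative paths:

import torch

first = torch.load("posthoc-ema-checkpoints/0.100.pt", weights_only=True, map_location="cpu")
later = torch.load("posthoc-ema-checkpoints/0.200.pt", weights_only=True, map_location="cpu")

expected = set(first.keys()) - {"initted", "step"}
missing = expected - set(later.keys())
only_save_diff = False  # matches the condition in the new check above
if missing and not only_save_diff:
    raise ValueError(f"Parameters missing from checkpoint 0.200.pt: {sorted(missing)}")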
