@@ -113,7 +113,9 @@ def forward(self,
                                               self.embedding_dim]
 
         shift, scale = embedded_timestep.chunk(2, dim=-1)
-        hidden_states = self.norm(hidden_states)
+        # Disable autocast for LayerNorm to match Diffusers behavior
+        with torch.autocast(device_type="cuda", enabled=False):
+            hidden_states = self.norm(hidden_states)
 
         if embedded_timestep.ndim == 2:
             shift, scale = (x.unsqueeze(1) for x in (shift, scale))
@@ -147,6 +149,9 @@ def forward(
         embedded_timestep: torch.Tensor,
         temb: torch.Tensor | None = None,
     ) -> torch.Tensor:
+        instance_id = id(self)
+        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+            f.write(f"[FASTVIDEO NORM] Instance {instance_id}: forward hidden_states: {hidden_states.float().sum().item()}\n")
         embedded_timestep = self.activation(embedded_timestep)
         embedded_timestep = self.linear_1(embedded_timestep)
         embedded_timestep = self.linear_2(embedded_timestep)
@@ -155,8 +160,45 @@ def forward(
         embedded_timestep = embedded_timestep + temb
 
         shift, scale, gate = embedded_timestep.chunk(3, dim=-1)
-        hidden_states = self.norm(hidden_states)
-
+        print(f"[FASTVIDEO NORM] After chunk - shift sum: {shift.float().sum().item()}")
+        print(f"[FASTVIDEO NORM] After chunk - scale sum: {scale.float().sum().item()}")
+        print(f"[FASTVIDEO NORM] After chunk - gate sum: {gate.float().sum().item()}")
+        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+            f.write(f"[FASTVIDEO NORM] After chunk - shift sum: {shift.float().sum().item()}\n")
+            f.write(f"[FASTVIDEO NORM] After chunk - scale sum: {scale.float().sum().item()}\n")
+            f.write(f"[FASTVIDEO NORM] After chunk - gate sum: {gate.float().sum().item()}\n")
+        print(f"[FASTVIDEO NORM] Before LayerNorm - input shape: {hidden_states.shape}")
+        print(f"[FASTVIDEO NORM] Before LayerNorm - input dtype: {hidden_states.dtype}")
+        print(f"[FASTVIDEO NORM] Before LayerNorm - input sum: {hidden_states.float().sum().item()}")
+        print(f"[FASTVIDEO NORM] LayerNorm eps: {self.norm.eps}")
+        print(f"[FASTVIDEO NORM] LayerNorm elementwise_affine: {self.norm.elementwise_affine}")
+        print(f"[FASTVIDEO NORM] LayerNorm normalized_shape: {self.norm.normalized_shape}")
+        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+            f.write(f"[FASTVIDEO NORM] Before LayerNorm - input shape: {hidden_states.shape}\n")
+            f.write(f"[FASTVIDEO NORM] Before LayerNorm - input dtype: {hidden_states.dtype}\n")
+            f.write(f"[FASTVIDEO NORM] Before LayerNorm - input sum: {hidden_states.float().sum().item()}\n")
+            f.write(f"[FASTVIDEO NORM] LayerNorm eps: {self.norm.eps}\n")
+            f.write(f"[FASTVIDEO NORM] LayerNorm elementwise_affine: {self.norm.elementwise_affine}\n")
+            f.write(f"[FASTVIDEO NORM] LayerNorm normalized_shape: {self.norm.normalized_shape}\n")
+
+        # Save the input tensor for comparison (only once globally)
+        import os
+        if not hasattr(CosmosAdaLayerNormZero, '_global_tensor_saved'):
+            instance_id = id(self)
+            torch.save(hidden_states.float(), "/workspace/FastVideo/fastvideo_layernorm_input.pt")
+            print(f"[FASTVIDEO NORM] Instance {instance_id}: Saved input tensor sum={hidden_states.float().sum().item()}")
+            with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+                f.write(f"[FASTVIDEO NORM] Instance {instance_id}: Saved input tensor sum={hidden_states.float().sum().item()}\n")
+            CosmosAdaLayerNormZero._global_tensor_saved = True
+
+        # Disable autocast for LayerNorm to match Diffusers behavior
+        with torch.autocast(device_type="cuda", enabled=False):
+            hidden_states = self.norm(hidden_states)
+
+        print(f"[FASTVIDEO NORM] After LayerNorm - output sum: {hidden_states.float().sum().item()}")
+        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+            f.write(f"[FASTVIDEO NORM] After norm: {hidden_states.float().sum().item()}\n")
+            f.write(f"embedded_timestep.ndim: {embedded_timestep.ndim}\n")
         if embedded_timestep.ndim == 2:
             shift, scale, gate = (x.unsqueeze(1) for x in (shift, scale, gate))
 
@@ -185,6 +227,7 @@ def __init__(self,
         self.to_k = nn.Linear(dim, dim, bias=False)
         self.to_v = nn.Linear(dim, dim, bias=False)
         self.to_out = nn.Linear(dim, dim, bias=False)
+        self.dropout = nn.Dropout(0.0)  # Match Diffusers dropout
 
         self.norm_q = RMSNorm(self.head_dim,
                               eps=eps) if qk_norm else nn.Identity()
@@ -215,15 +258,36 @@ def forward(self,
         query = query.unflatten(2, (self.num_heads, -1)).transpose(1, 2)
         key = key.unflatten(2, (self.num_heads, -1)).transpose(1, 2)
         value = value.unflatten(2, (self.num_heads, -1)).transpose(1, 2)
+        print(f"[FASTVIDEO ATTN] After reshape - Q: {query.float().sum().item()}, K: {key.float().sum().item()}, V: {value.float().sum().item()}")
+        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+            f.write(f"[FASTVIDEO ATTN] After reshape - Q: {query.float().sum().item()}, K: {key.float().sum().item()}, V: {value.float().sum().item()}\n")
 
         # Apply normalization
+        print(f"[FASTVIDEO ATTN] norm_q is not None: {self.norm_q is not None}, norm_k is not None: {self.norm_k is not None}")
+        print(f"[FASTVIDEO ATTN] norm_q type: {type(self.norm_q)}, norm_k type: {type(self.norm_k)}")
+        print(f"[FASTVIDEO ATTN] norm_q eps: {getattr(self.norm_q, 'variance_epsilon', 'N/A')}, norm_k eps: {getattr(self.norm_k, 'variance_epsilon', 'N/A')}")
+        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+            f.write(f"[FASTVIDEO ATTN] norm_q is not None: {self.norm_q is not None}, norm_k is not None: {self.norm_k is not None}\n")
+            f.write(f"[FASTVIDEO ATTN] norm_q type: {type(self.norm_q)}, norm_k type: {type(self.norm_k)}\n")
+            f.write(f"[FASTVIDEO ATTN] norm_q eps: {getattr(self.norm_q, 'variance_epsilon', 'N/A')}, norm_k eps: {getattr(self.norm_k, 'variance_epsilon', 'N/A')}\n")
         if self.norm_q is not None:
             query = self.norm_q(query)
         if self.norm_k is not None:
             key = self.norm_k(key)
+        print(f"[FASTVIDEO ATTN] After norm - Q: {query.float().sum().item()}, K: {key.float().sum().item()}, V: {value.float().sum().item()}")
+        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+            f.write(f"[FASTVIDEO ATTN] After norm - Q: {query.float().sum().item()}, K: {key.float().sum().item()}, V: {value.float().sum().item()}\n")
 
         # Apply RoPE if provided
         if image_rotary_emb is not None:
+            print(f"[FASTVIDEO ATTN] RoPE input shape: query={query.shape}, image_rotary_emb={len(image_rotary_emb) if isinstance(image_rotary_emb, tuple) else image_rotary_emb.shape}")
+            print(f"[FASTVIDEO ATTN] RoPE freqs shapes: cos={image_rotary_emb[0].shape}, sin={image_rotary_emb[1].shape}")
+            print(f"[FASTVIDEO ATTN] RoPE freqs sums: cos={image_rotary_emb[0].float().sum().item()}, sin={image_rotary_emb[1].float().sum().item()}")
+            with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+                f.write(f"[FASTVIDEO ATTN] RoPE input shape: query={query.shape}, image_rotary_emb={len(image_rotary_emb) if isinstance(image_rotary_emb, tuple) else image_rotary_emb.shape}\n")
+                f.write(f"[FASTVIDEO ATTN] RoPE freqs shapes: cos={image_rotary_emb[0].shape}, sin={image_rotary_emb[1].shape}\n")
+                f.write(f"[FASTVIDEO ATTN] RoPE freqs sums: cos={image_rotary_emb[0].float().sum().item()}, sin={image_rotary_emb[1].float().sum().item()}\n")
+
             query = apply_rotary_emb(query,
                                      image_rotary_emb,
                                      use_real=True,
@@ -232,6 +296,9 @@ def forward(self,
                                    image_rotary_emb,
                                    use_real=True,
                                    use_real_unbind_dim=-2)
+            print(f"[FASTVIDEO ATTN] After RoPE - Q: {query.float().sum().item()}, K: {key.float().sum().item()}, V: {value.float().sum().item()}")
+            with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+                f.write(f"[FASTVIDEO ATTN] After RoPE - Q: {query.float().sum().item()}, K: {key.float().sum().item()}, V: {value.float().sum().item()}\n")
 
         # Prepare for GQA (Grouped Query Attention)
         if torch.onnx.is_in_onnx_export():
@@ -244,6 +311,11 @@ def forward(self,
             value_idx = value.size(3)
             key = key.repeat_interleave(query_idx // key_idx, dim=3)
             value = value.repeat_interleave(query_idx // value_idx, dim=3)
+        print(f"[FASTVIDEO ATTN] After GQA - Q: {query.float().sum().item()}, K: {key.float().sum().item()}, V: {value.float().sum().item()}")
+        print(f"[FASTVIDEO ATTN] GQA indices - query_idx: {query_idx}, key_idx: {key_idx}, value_idx: {value_idx}")
+        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+            f.write(f"[FASTVIDEO ATTN] After GQA - Q: {query.float().sum().item()}, K: {key.float().sum().item()}, V: {value.float().sum().item()}\n")
+            f.write(f"[FASTVIDEO ATTN] GQA indices - query_idx: {query_idx}, key_idx: {key_idx}, value_idx: {value_idx}\n")
 
         # Attention computation
         # Use standard PyTorch scaled dot product attention
@@ -258,6 +330,7 @@ def forward(self,
 
         # Output projection
         attn_output = self.to_out(attn_output)
+        attn_output = self.dropout(attn_output)
 
         return attn_output
 
@@ -285,6 +358,7 @@ def __init__(self,
         self.to_k = nn.Linear(cross_attention_dim, dim, bias=False)
         self.to_v = nn.Linear(cross_attention_dim, dim, bias=False)
         self.to_out = nn.Linear(dim, dim, bias=False)
+        self.dropout = nn.Dropout(0.0)  # Match Diffusers dropout
 
         self.norm_q = RMSNorm(self.head_dim,
                               eps=eps) if qk_norm else nn.Identity()
@@ -336,6 +410,7 @@ def forward(self,
 
         # Output projection
         attn_output = self.to_out(attn_output)
+        attn_output = self.dropout(attn_output)
 
         return attn_output
 
@@ -368,6 +443,7 @@ def __init__(
             dim=hidden_size,
             num_heads=num_attention_heads,
             qk_norm=(qk_norm == "rms_norm"),
+            eps=1e-5,  # Match Diffusers default
             prefix=f"{prefix}.attn1")
 
         self.norm2 = CosmosAdaLayerNormZero(in_features=hidden_size,
@@ -377,6 +453,7 @@ def __init__(
             cross_attention_dim=cross_attention_dim,
             num_heads=num_attention_heads,
             qk_norm=(qk_norm == "rms_norm"),
+            eps=1e-5,  # Match Diffusers default
             prefix=f"{prefix}.attn2")
 
         self.norm3 = CosmosAdaLayerNormZero(in_features=hidden_size,
@@ -697,27 +774,14 @@ def forward(self,
         if condition_mask is not None:
             hidden_states = torch.cat([hidden_states, condition_mask], dim=1)
 
-        if self.concat_padding_mask and padding_mask is not None:
+        if self.concat_padding_mask:
             from torchvision import transforms
             padding_mask = transforms.functional.resize(
-                padding_mask,
-                list(hidden_states.shape[-2:]),
-                interpolation=transforms.InterpolationMode.NEAREST)
-            hidden_states = torch.cat([
-                hidden_states,
-                padding_mask.unsqueeze(2).repeat(batch_size, 1, num_frames, 1,
-                                                 1)
-            ],
-                                      dim=1)
-            # # Resize padding mask to match hidden states spatial dimensions
-            # padding_mask_resized = F.interpolate(
-            #     padding_mask.float().unsqueeze(1),
-            #     size=(height, width),
-            #     mode='nearest'
-            # ).squeeze(1)
-            # hidden_states = torch.cat(
-            #     [hidden_states, padding_mask_resized.unsqueeze(1).unsqueeze(2).repeat(1, 1, num_frames, 1, 1)], dim=1
-            # )
+                padding_mask, list(hidden_states.shape[-2:]), interpolation=transforms.InterpolationMode.NEAREST
+            )
+            hidden_states = torch.cat(
+                [hidden_states, padding_mask.unsqueeze(2).repeat(batch_size, 1, num_frames, 1, 1)], dim=1
+            )
 
         if attention_mask is not None:
             attention_mask = attention_mask.unsqueeze(1).unsqueeze(
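The recurring change in this diff is wrapping LayerNorm in torch.autocast(device_type="cuda", enabled=False) so its numerics match the Diffusers reference. The following standalone sketch (not part of the commit; it assumes a CUDA device, bfloat16 autocast, and illustrative names) shows how to check what that wrapper changes: the output dtype and values of nn.LayerNorm with and without autocast in effect.

# Standalone sketch, not from FastVideo or Diffusers: compares nn.LayerNorm
# output under autocast vs. with autocast explicitly disabled, which is the
# distinction the patch above is controlling for.
import torch
import torch.nn as nn

def compare_layernorm_under_autocast() -> None:
    # Illustrative shapes/dtype; the real model tensors differ.
    x = torch.randn(2, 16, 128, device="cuda", dtype=torch.bfloat16)
    norm = nn.LayerNorm(x.shape[-1], elementwise_affine=False).to(x.device)

    with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
        y_autocast = norm(x)  # follows PyTorch's autocast policy for layer_norm
        with torch.autocast(device_type="cuda", enabled=False):
            y_plain = norm(x)  # runs in the tensor's own dtype, no autocast casting

    print("autocast output dtype:", y_autocast.dtype)
    print("autocast-disabled output dtype:", y_plain.dtype)
    print("max abs difference:",
          (y_autocast.float() - y_plain.float()).abs().max().item())

if __name__ == "__main__" and torch.cuda.is_available():
    compare_layernorm_under_autocast()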