Commit f747b40

Addt. logging
1 parent 11aaaae commit f747b40

5 files changed: 217 additions, 65 deletions

fastvideo/models/dits/cosmos.py

Lines changed: 118 additions & 12 deletions
@@ -76,8 +76,14 @@ def __init__(self, embedding_dim: int, condition_dim: int) -> None:
     def forward(self, hidden_states: torch.Tensor,
                 timestep: torch.LongTensor) -> torch.Tensor:
         timesteps_proj = self.time_proj(timestep).type_as(hidden_states)
+        print(f"[FASTVIDEO] timesteps_proj before norm: {timesteps_proj.float().sum().item()}")
+        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+            f.write(f"[FASTVIDEO] timesteps_proj before norm: {timesteps_proj.float().sum().item()}\n")
         temb = self.t_embedder(timesteps_proj)
         embedded_timestep = self.norm(timesteps_proj)
+        print(f"[FASTVIDEO] embedded_timestep after norm: {embedded_timestep.float().sum().item()}")
+        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+            f.write(f"[FASTVIDEO] embedded_timestep after norm: {embedded_timestep.float().sum().item()}\n")
         return temb, embedded_timestep


@@ -133,10 +139,7 @@ def __init__(self,
         else:
             self.linear_1 = nn.Linear(in_features, hidden_features, bias=False)

-        self.linear_2 = nn.Linear(
-            hidden_features if hidden_features is not None else in_features,
-            3 * in_features,
-            bias=False)
+        self.linear_2 = nn.Linear(hidden_features, 3 * in_features, bias=False)

     def forward(
         self,
@@ -197,10 +200,16 @@ def forward(self,
         if encoder_hidden_states is None:
             encoder_hidden_states = hidden_states

+        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+            f.write(f"[FASTVIDEO SELF-ATTN] INIT hidden_states: Q={hidden_states.float().sum().item()}\n")
+
         # Get QKV
         query = self.to_q(hidden_states)
         key = self.to_k(encoder_hidden_states)
         value = self.to_v(encoder_hidden_states)
+        print(f"[FASTVIDEO SELF-ATTN] QKV sums: Q={query.float().sum().item()}, K={key.float().sum().item()}, V={value.float().sum().item()}")
+        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+            f.write(f"[FASTVIDEO SELF-ATTN] QKV sums: Q={query.float().sum().item()}, K={key.float().sum().item()}, V={value.float().sum().item()}\n")

         # Reshape for multi-head attention
         query = query.unflatten(2, (self.num_heads, -1)).transpose(1, 2)
@@ -209,9 +218,9 @@ def forward(self,

         # Apply normalization
         if self.norm_q is not None:
-            query = self.norm_q.forward_native(query)
+            query = self.norm_q(query)
         if self.norm_k is not None:
-            key = self.norm_k.forward_native(key)
+            key = self.norm_k(key)

         # Apply RoPE if provided
         if image_rotary_emb is not None:
@@ -224,12 +233,28 @@ def forward(self,
                 use_real=True,
                 use_real_unbind_dim=-2)

+        # Prepare for GQA (Grouped Query Attention)
+        if torch.onnx.is_in_onnx_export():
+            query_idx = torch.tensor(query.size(3), device=query.device)
+            key_idx = torch.tensor(key.size(3), device=key.device)
+            value_idx = torch.tensor(value.size(3), device=value.device)
+        else:
+            query_idx = query.size(3)
+            key_idx = key.size(3)
+            value_idx = value.size(3)
+        key = key.repeat_interleave(query_idx // key_idx, dim=3)
+        value = value.repeat_interleave(query_idx // value_idx, dim=3)
+
         # Attention computation
         # Use standard PyTorch scaled dot product attention
         attn_output = torch.nn.functional.scaled_dot_product_attention(
-            query, key, value, attn_mask=attention_mask, dropout_p=0.0
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
         )
         attn_output = attn_output.transpose(1, 2).flatten(2, 3).type_as(query)
+        print(f"[FASTVIDEO TRANSFORMER] hidden_states: {attn_output.float().sum().item()}")
+        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+            f.write(f"[FASTVIDEO TRANSFORMER] hidden_states: {attn_output.float().sum().item()}\n")
+            f.write(f"self.to_out: {self.to_out}")

         # Output projection
         attn_output = self.to_out(attn_output)
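
Editor's note on the GQA block added above: the key/value projections may carry fewer heads than the query projection, so the KV tensors are expanded with repeat_interleave until scaled_dot_product_attention sees matching head counts; the ONNX branch presumably wraps the sizes in tensors only to keep them traceable during export. A minimal standalone sketch of the expansion idea, assuming a [batch, heads, seq, head_dim] layout where heads live on dim 1 (the commit applies the same pattern to its own tensor layout); names and shapes here are illustrative, not FastVideo code:

# Editor's sketch (not part of the commit): expand grouped KV heads so that
# scaled_dot_product_attention sees matching head counts.
import torch
import torch.nn.functional as F

batch, q_heads, kv_heads, seq, head_dim = 2, 8, 2, 16, 64
query = torch.randn(batch, q_heads, seq, head_dim)
key = torch.randn(batch, kv_heads, seq, head_dim)
value = torch.randn(batch, kv_heads, seq, head_dim)

# Each KV head serves q_heads // kv_heads query heads, so duplicate the KV
# heads along the head axis (dim 1 in this layout) before attention.
key = key.repeat_interleave(q_heads // kv_heads, dim=1)
value = value.repeat_interleave(q_heads // kv_heads, dim=1)

out = F.scaled_dot_product_attention(query, key, value, is_causal=False)
print(out.shape)  # torch.Size([2, 8, 16, 64])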
@@ -275,6 +300,9 @@ def forward(self,
         query = self.to_q(hidden_states)
         key = self.to_k(encoder_hidden_states)
         value = self.to_v(encoder_hidden_states)
+        # print(f"[FASTVIDEO CROSS-ATTN] QKV sums: Q={query.float().sum().item()}, K={key.float().sum().item()}, V={value.float().sum().item()}")
+        # with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+        #     f.write(f"[FASTVIDEO CROSS-ATTN] QKV sums: Q={query.float().sum().item()}, K={key.float().sum().item()}, V={value.float().sum().item()}\n")

         # Reshape for multi-head attention
         # Standard PyTorch attention expects [batch, num_heads, seq_len, head_dim]
@@ -284,13 +312,25 @@ def forward(self,

         # Apply normalization
         if self.norm_q is not None:
-            query = self.norm_q.forward_native(query)
+            query = self.norm_q(query)
         if self.norm_k is not None:
-            key = self.norm_k.forward_native(key)
+            key = self.norm_k(key)
+
+        # Prepare for GQA (Grouped Query Attention)
+        if torch.onnx.is_in_onnx_export():
+            query_idx = torch.tensor(query.size(3), device=query.device)
+            key_idx = torch.tensor(key.size(3), device=key.device)
+            value_idx = torch.tensor(value.size(3), device=value.device)
+        else:
+            query_idx = query.size(3)
+            key_idx = key.size(3)
+            value_idx = value.size(3)
+        key = key.repeat_interleave(query_idx // key_idx, dim=3)
+        value = value.repeat_interleave(query_idx // value_idx, dim=3)

         # Attention computation
         attn_output = torch.nn.functional.scaled_dot_product_attention(
-            query, key, value, attn_mask=attention_mask, dropout_p=0.0
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
         )
         attn_output = attn_output.transpose(1, 2).flatten(2, 3).type_as(query)

@@ -317,6 +357,11 @@ def __init__(

         hidden_size = num_attention_heads * attention_head_dim

+        print(f"[FASTVIDEO TRANSFORMER] hidden_size: Q={hidden_size}")
+        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+            f.write(f"[FASTVIDEO TRANSFORMER] hidden_size: Q={hidden_size}\n")
+
+
         self.norm1 = CosmosAdaLayerNormZero(in_features=hidden_size,
                                             hidden_features=adaln_lora_dim)
         self.attn1 = CosmosSelfAttention(
@@ -355,18 +400,51 @@ def forward(
         hidden_states = hidden_states + extra_pos_emb

         # 1. Self Attention
+        print(f"[FASTVIDEO DEBUG] Before norm1: hidden_states={hidden_states.float().sum().item()}")
+        print(f"[FASTVIDEO DEBUG] Before norm1: embedded_timestep={embedded_timestep.float().sum().item()}")
+        print(f"[FASTVIDEO DEBUG] Before norm1: temb={temb.float().sum().item() if temb is not None else 'None'}")
+        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+            f.write(f"[FASTVIDEO DEBUG] Before norm1: hidden_states={hidden_states.float().sum().item()}\n")
+            f.write(f"[FASTVIDEO DEBUG] Before norm1: embedded_timestep={embedded_timestep.float().sum().item()}\n")
+            f.write(f"[FASTVIDEO DEBUG] Before norm1: temb={temb.float().sum().item() if temb is not None else 'None'}\n")
+        # Debug norm1 weights
+        print(f"[FASTVIDEO DEBUG] norm1.linear_1.weight sum: {self.norm1.linear_1.weight.float().sum().item()}")
+        print(f"[FASTVIDEO DEBUG] norm1.linear_2.weight sum: {self.norm1.linear_2.weight.float().sum().item()}")
+        print(f"[FASTVIDEO DEBUG] hidden_states dtype: {hidden_states.dtype}")
+        print(f"[FASTVIDEO DEBUG] embedded_timestep dtype: {embedded_timestep.dtype}")
+        print(f"[FASTVIDEO DEBUG] temb dtype: {temb.dtype if temb is not None else 'None'}")
+        print(f"[FASTVIDEO DEBUG] norm1.linear_1.weight dtype: {self.norm1.linear_1.weight.dtype}")
+        print(f"[FASTVIDEO DEBUG] norm1.linear_2.weight dtype: {self.norm1.linear_2.weight.dtype}")
+        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+            f.write(f"[FASTVIDEO DEBUG] norm1.linear_1.weight sum: {self.norm1.linear_1.weight.float().sum().item()}\n")
+            f.write(f"[FASTVIDEO DEBUG] norm1.linear_2.weight sum: {self.norm1.linear_2.weight.float().sum().item()}\n")
+            f.write(f"[FASTVIDEO DEBUG] hidden_states dtype: {hidden_states.dtype}\n")
+            f.write(f"[FASTVIDEO DEBUG] embedded_timestep dtype: {embedded_timestep.dtype}\n")
+            f.write(f"[FASTVIDEO DEBUG] temb dtype: {temb.dtype if temb is not None else 'None'}\n")
+            f.write(f"[FASTVIDEO DEBUG] norm1.linear_1.weight dtype: {self.norm1.linear_1.weight.dtype}\n")
+            f.write(f"[FASTVIDEO DEBUG] norm1.linear_2.weight dtype: {self.norm1.linear_2.weight.dtype}\n")
+
         norm_hidden_states, gate = self.norm1(hidden_states, embedded_timestep,
                                               temb)
+        print(f"[FASTVIDEO DEBUG] After norm1: norm_hidden_states={norm_hidden_states.float().sum().item()}")
+        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+            f.write(f"[FASTVIDEO DEBUG] After norm1: norm_hidden_states={norm_hidden_states.float().sum().item()}\n")
         attn_output = self.attn1(norm_hidden_states,
                                  image_rotary_emb=image_rotary_emb)
         hidden_states = hidden_states + gate * attn_output

         # 2. Cross Attention
+        # print(f"[FASTVIDEO] About to call cross-attention")
+        # with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+        #     f.write(f"[FASTVIDEO] About to call cross-attention\n")
         norm_hidden_states, gate = self.norm2(hidden_states, embedded_timestep,
                                               temb)
         attn_output = self.attn2(norm_hidden_states,
                                  encoder_hidden_states=encoder_hidden_states,
                                  attention_mask=attention_mask)
+        # print(f"[FASTVIDEO] Cross-attention completed")
+        # with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+        #     f.write(f"[FASTVIDEO] Cross-attention completed\n")
         hidden_states = hidden_states + gate * attn_output

         # 3. Feed Forward
@@ -604,6 +682,8 @@ def forward(self,
                 padding_mask: torch.Tensor | None = None,
                 **kwargs) -> torch.Tensor:
         print(f"[FASTVIDEO TRANSFORMER] Input hidden_states sum = {hidden_states.float().sum().item()}")
+        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+            f.write(f"[FASTVIDEO TRANSFORMER] Input hidden_states sum = {hidden_states.float().sum().item()}\n")
         forward_batch = get_forward_context().forward_batch
         enable_teacache = forward_batch is not None and forward_batch.enable_teacache

@@ -676,9 +756,19 @@ def forward(self,
         else:
             raise ValueError(f"Unsupported timestep shape: {timestep.shape}")

+        print(f"[FASTVIDEO] After patch_embed: {hidden_states.float().sum().item()}")
+        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+            f.write(f"[FASTVIDEO] After patch_embed: {hidden_states.float().sum().item()}\n")
+        print(f"[FASTVIDEO] After time_embed temb: {temb.float().sum().item()}")
+        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+            f.write(f"[FASTVIDEO] After time_embed temb: {temb.float().sum().item()}\n")
+        print(f"[FASTVIDEO] After time_embed embedded_timestep: {embedded_timestep.float().sum().item()}")
+        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+            f.write(f"[FASTVIDEO] After time_embed embedded_timestep: {embedded_timestep.float().sum().item()}\n")
+
         # 6. Transformer blocks
         if torch.is_grad_enabled() and self.gradient_checkpointing:
-            for block in self.transformer_blocks:
+            for i, block in enumerate(self.transformer_blocks):
                 hidden_states = self._gradient_checkpointing_func(
                     block,
                     hidden_states,
@@ -689,8 +779,12 @@ def forward(self,
                     extra_pos_emb,
                     attention_mask,
                 )
+                if i < 3:  # Log first 3 blocks
+                    print(f"[FASTVIDEO] After block {i}: {hidden_states.float().sum().item()}")
+                    with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+                        f.write(f"[FASTVIDEO] After block {i}: {hidden_states.float().sum().item()}\n")
         else:
-            for block in self.transformer_blocks:
+            for i, block in enumerate(self.transformer_blocks):
                 hidden_states = block(
                     hidden_states=hidden_states,
                     encoder_hidden_states=encoder_hidden_states,
@@ -700,10 +794,20 @@ def forward(self,
                     extra_pos_emb=extra_pos_emb,
                     attention_mask=attention_mask,
                 )
+                if i < 3:  # Log first 3 blocks
+                    print(f"[FASTVIDEO] After block! {i}: {hidden_states.float().sum().item()}")
+                    with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+                        f.write(f"[FASTVIDEO] After block! {i}: {hidden_states.float().sum().item()}\n")

         # 7. Output norm & projection & unpatchify
         hidden_states = self.norm_out(hidden_states, embedded_timestep, temb)
+        print(f"[FASTVIDEO] After norm_out: {hidden_states.float().sum().item()}")
+        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+            f.write(f"[FASTVIDEO] After norm_out: {hidden_states.float().sum().item()}\n")
         hidden_states = self.proj_out(hidden_states)
+        print(f"[FASTVIDEO] After proj_out: {hidden_states.float().sum().item()}")
+        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+            f.write(f"[FASTVIDEO] After proj_out: {hidden_states.float().sum().item()}\n")
         hidden_states = hidden_states.unflatten(2, (p_h, p_w, p_t, -1))
         hidden_states = hidden_states.unflatten(
             1, (post_patch_num_frames, post_patch_height, post_patch_width))
@@ -713,4 +817,6 @@ def forward(self,
         hidden_states = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3)

         print(f"[FASTVIDEO TRANSFORMER] Output hidden_states sum = {hidden_states.float().sum().item()}")
+        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+            f.write(f"[FASTVIDEO TRANSFORMER] Output hidden_states sum = {hidden_states.float().sum().item()}\n")
         return hidden_states
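
Editor's note: every debug tap in this file repeats the same print plus open-append-write pair against /workspace/FastVideo/fastvideo_hidden_states.log. A small helper could collapse each pair to a single call; a minimal sketch (the helper name log_sum is hypothetical, not part of the commit):

# Editor's sketch (not part of the commit): one call replacing the repeated
# print + open(..., "a") + f.write pattern used by the debug taps above.
import torch

LOG_PATH = "/workspace/FastVideo/fastvideo_hidden_states.log"

def log_sum(tag: str, tensor: torch.Tensor | None) -> None:
    # Summing in float32 keeps the checksum comparable across bf16/fp32 runs,
    # mirroring the .float().sum().item() idiom used throughout the commit.
    value = "None" if tensor is None else tensor.float().sum().item()
    line = f"[FASTVIDEO] {tag}: {value}"
    print(line)
    with open(LOG_PATH, "a") as f:
        f.write(line + "\n")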

fastvideo/pipelines/basic/cosmos/cosmos_pipeline.py

Lines changed: 22 additions & 0 deletions
@@ -10,6 +10,11 @@
 import numpy as np
 import torch

+# TEMPORARY: Import diffusers VAE for comparison
+import sys
+sys.path.insert(0, '/workspace/diffusers/src')
+from diffusers.models.autoencoders.autoencoder_kl_wan import AutoencoderKLWan as DiffusersAutoencoderKLWan
+
 from fastvideo.fastvideo_args import FastVideoArgs
 from fastvideo.logger import init_logger
 from fastvideo.pipelines.composed_pipeline_base import ComposedPipelineBase
@@ -33,6 +38,23 @@ class Cosmos2VideoToWorldPipeline(ComposedPipelineBase):

     def initialize_pipeline(self, fastvideo_args: FastVideoArgs):

+        # TEMPORARY: Replace FastVideo VAE with diffusers VAE for testing
+        print("[TEMPORARY] Replacing FastVideo VAE with diffusers VAE...")
+        original_vae = self.modules["vae"]
+        print(f"[TEMPORARY] Original VAE type: {type(original_vae)}")
+
+        # Load diffusers VAE with same config
+        diffusers_vae = DiffusersAutoencoderKLWan.from_pretrained(
+            self.model_path,
+            subfolder="vae",
+            torch_dtype=torch.bfloat16,
+        )
+        print(f"[TEMPORARY] Diffusers VAE type: {type(diffusers_vae)}")
+
+        # Replace the VAE module
+        self.modules["vae"] = diffusers_vae
+        print("[TEMPORARY] VAE replacement complete!")
+
         self.modules["scheduler"] = FlowMatchEulerDiscreteScheduler(
             shift=fastvideo_args.pipeline_config.flow_shift)

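Editor's note: swapping in the diffusers VAE enables an A/B check that isolates whether the FastVideo and diffusers pipelines diverge at the VAE stage or earlier. A minimal sketch of such a check (hypothetical helper, not in the commit; it assumes the FastVideo VAE returns a tensor while the diffusers VAE returns a DecoderOutput with .sample, the same contract the decoding-stage changes below handle):

# Editor's sketch (not part of the commit): decode identical latents with both
# VAEs and report the worst-case elementwise gap.
import torch

@torch.no_grad()
def compare_vae_decode(fastvideo_vae, diffusers_vae, latents: torch.Tensor) -> float:
    a = fastvideo_vae.decode(latents)          # FastVideo VAE: returns a tensor
    b = diffusers_vae.decode(latents).sample   # diffusers VAE: DecoderOutput.sample
    return (a.float() - b.float()).abs().max().item()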

fastvideo/pipelines/stages/decoding.py

Lines changed: 19 additions & 6 deletions
@@ -92,11 +92,16 @@ def forward(
         vae_autocast_enabled = (vae_dtype != torch.float32
                                 ) and not fastvideo_args.disable_autocast

-        if isinstance(self.vae.scaling_factor, torch.Tensor):
-            latents = latents / self.vae.scaling_factor.to(
-                latents.device, latents.dtype)
-        else:
-            latents = latents / self.vae.scaling_factor
+        # TEMPORARY: Handle diffusers VAE compatibility
+        if hasattr(self.vae, 'scaling_factor'):
+            if isinstance(self.vae.scaling_factor, torch.Tensor):
+                latents = latents / self.vae.scaling_factor.to(
+                    latents.device, latents.dtype)
+            else:
+                latents = latents / self.vae.scaling_factor
+        elif hasattr(self.vae, 'config') and hasattr(self.vae.config, 'scaling_factor'):
+            # Fallback to config scaling factor for diffusers VAE
+            latents = latents / self.vae.config.scaling_factor

         # Apply shifting if needed
         if (hasattr(self.vae, "shift_factor")
@@ -117,7 +122,15 @@
         # self.vae.enable_parallel()
         if not vae_autocast_enabled:
             latents = latents.to(vae_dtype)
-        image = self.vae.decode(latents)
+        decode_output = self.vae.decode(latents)
+
+        # TEMPORARY: Handle diffusers VAE decode output compatibility
+        if hasattr(decode_output, 'sample'):
+            # Diffusers VAE returns DecoderOutput with .sample attribute
+            image = decode_output.sample
+        else:
+            # FastVideo VAE returns tensor directly
+            image = decode_output

         # Normalize image to [0, 1] range
         image = (image / 2 + 0.5).clamp(0, 1)
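
Editor's note: the two compatibility branches above share one duck-typing idea and could be folded into helpers that work with either VAE. A minimal sketch (hypothetical helper names, not part of the commit):

# Editor's sketch (not part of the commit): resolve the scaling factor and
# unwrap the decode output for either the FastVideo or the diffusers VAE.
import torch

def resolve_scaling_factor(vae):
    # FastVideo VAE exposes scaling_factor directly; the diffusers VAE keeps
    # it on its config object.
    if hasattr(vae, "scaling_factor"):
        return vae.scaling_factor
    if hasattr(vae, "config") and hasattr(vae.config, "scaling_factor"):
        return vae.config.scaling_factor
    return None

def unwrap_decode(decode_output) -> torch.Tensor:
    # diffusers returns DecoderOutput with .sample; FastVideo returns the tensor.
    return decode_output.sample if hasattr(decode_output, "sample") else decode_output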

fastvideo/pipelines/stages/denoising.py

Lines changed: 2 additions & 1 deletion
@@ -815,8 +815,9 @@
             f.write(f" DTYPES: hidden_states={cond_latent.dtype}, timestep={cond_timestep.dtype}, encoder_hidden_states={batch.prompt_embeds[0].dtype}\n")
             f.write(f" hidden_states first 5 values: {cond_latent.flatten()[:5].float()}\n")
             f.write(f" encoder_hidden_states first 5 values: {batch.prompt_embeds[0].flatten()[:5].float()}\n")
-
+            f.write(f" [FASTVIDEO DENOISING] About to call transformer with hidden_states sum = {cond_latent.float().sum().item()}\n")
         print(f"[FASTVIDEO DENOISING] About to call transformer with hidden_states sum = {cond_latent.float().sum().item()}")
+
         noise_pred = self.transformer(
             hidden_states=cond_latent,  # Already converted to target_dtype above
             timestep=cond_timestep.to(target_dtype),
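
Editor's note: the point of mirroring every print into fastvideo_hidden_states.log is that the trace can be diffed against a reference run (for example, the same checksums logged from a diffusers run). A minimal sketch of lining up two such traces (hypothetical paths and helper name, not part of the commit):

# Editor's sketch (not part of the commit): report the first line where two
# checksum traces diverge.
def first_divergence(path_a: str, path_b: str) -> tuple[int, str, str] | None:
    with open(path_a) as fa, open(path_b) as fb:
        for i, (line_a, line_b) in enumerate(zip(fa, fb), start=1):
            if line_a.strip() != line_b.strip():
                return i, line_a.strip(), line_b.strip()
    return None  # traces agree on all compared lines

# Hypothetical usage:
# first_divergence("/workspace/FastVideo/fastvideo_hidden_states.log",
#                  "/workspace/diffusers_hidden_states.log")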
