Commit 719af46

Fix conditioning mismatch
1 parent a8b8c78 commit 719af46

File tree

fastvideo/configs/pipelines/cosmos.py
fastvideo/pipelines/stages/denoising.py
fastvideo/pipelines/stages/latent_preparation.py
fastvideo/pipelines/stages/text_encoding.py

4 files changed: +76 -60

fastvideo/configs/pipelines/cosmos.py

Lines changed: 4 additions & 21 deletions
```diff
@@ -16,22 +16,13 @@
 def t5_large_postprocess_text(outputs: BaseEncoderOutput) -> torch.Tensor:
     """Postprocess T5 Large text encoder outputs for Cosmos pipeline.
 
-    Handles attention masks and sequence padding for the T5 Large model.
+    Return raw last_hidden_state without truncation/padding.
     """
     hidden_state = outputs.last_hidden_state
 
     if hidden_state is None:
         raise ValueError("T5 Large outputs missing last_hidden_state")
 
-    mask = outputs.attention_mask
-
-    # If no attention mask provided, assume all tokens are valid
-    if mask is None:
-        batch_size, seq_len = hidden_state.shape[:2]
-        mask = torch.ones(batch_size, seq_len, device=hidden_state.device, dtype=torch.long)
-
-    seq_lens = mask.gt(0).sum(dim=1).long()
-
     # Check for NaN values and provide debugging info
     nan_count = torch.isnan(hidden_state).sum()
     if nan_count > 0:
@@ -42,16 +33,8 @@ def t5_large_postprocess_text(outputs: BaseEncoderOutput) -> torch.Tensor:
     # Replace NaN values with zeros to avoid pipeline failure
     hidden_state = hidden_state.masked_fill(torch.isnan(hidden_state), 0.0)
 
-    # Create list of tensors with proper sequence lengths
-    prompt_embeds = [u[:v] for u, v in zip(hidden_state, seq_lens, strict=True)]
-
-    # Stack tensors with padding to fixed length (like wan.py implementation)
-    prompt_embeds_tensor: torch.Tensor = torch.stack([
-        torch.cat([u, u.new_zeros(512 - u.size(0), u.size(1))])
-        for u in prompt_embeds
-    ], dim=0)
-
-    return prompt_embeds_tensor
+    # Return raw last_hidden_state (no truncation/padding)
+    return hidden_state
 
 
 @dataclass
@@ -134,7 +117,7 @@ class CosmosConfig(PipelineConfig):
 
     # Denoising parameters
     embedded_cfg_scale: int = 6
-    flow_shift: int = 7
+    flow_shift: float = 1.0  # Changed to 1.0 to match diffusers (no shift transformation)
 
     def __post_init__(self):
         self.vae_config.load_encoder = True
```
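
Context for the change above: the old postprocess trimmed each sequence to its attention-masked length and re-padded to a fixed 512 tokens; the new version hands back `last_hidden_state` untouched, and padded positions are instead zeroed in `fastvideo/pipelines/stages/text_encoding.py` (see below). A minimal sketch of the behavioral difference, assuming a `[B, L, D]` hidden state and a 0/1 attention mask; the helper names are illustrative, not part of the codebase:

```python
import torch

def old_postprocess(hidden_state: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
    """Old behavior: trim each sequence to its valid length, then pad to 512."""
    seq_lens = mask.gt(0).sum(dim=1).long()
    trimmed = [u[:v] for u, v in zip(hidden_state, seq_lens, strict=True)]
    return torch.stack(
        [torch.cat([u, u.new_zeros(512 - u.size(0), u.size(1))]) for u in trimmed],
        dim=0)

def new_postprocess(hidden_state: torch.Tensor) -> torch.Tensor:
    """New behavior: return the encoder output unchanged."""
    return hidden_state

hs = torch.randn(1, 300, 1024)                 # [B, L, D] with L != 512
mask = torch.ones(1, 300, dtype=torch.long)
print(old_postprocess(hs, mask).shape)         # torch.Size([1, 512, 1024])
print(new_postprocess(hs).shape)               # torch.Size([1, 300, 1024])
```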

fastvideo/pipelines/stages/denoising.py

Lines changed: 51 additions & 17 deletions
```diff
@@ -750,15 +750,20 @@ def forward(
             print(f"[FASTVIDEO DEBUG] Step {i}: SKIPPING conditioning frame injection!")
             logger.warning(f"Step {i}: Missing conditioning data - cond_indicator: {hasattr(batch, 'cond_indicator')}, conditioning_latents: {conditioning_latents is not None}")
 
-        # cond_latent = cond_latent.to(target_dtype)
+        # Convert cond_latent to target dtype BEFORE debug logging to match Diffusers
+        cond_latent = cond_latent.to(target_dtype)
 
-        # # Apply conditional timestep processing like diffusers (lines 720-721)
-        # cond_timestep = timestep
-        # if hasattr(batch, 'cond_indicator') and batch.cond_indicator is not None:
-        #     cond_timestep = batch.cond_indicator * t_conditioning + (1 - batch.cond_indicator) * timestep
-        #     cond_timestep = cond_timestep.to(target_dtype)
-        #     if i < 3:
-        #         logger.info(f"Step {i}: Applied conditional timestep - t_conditioning: {t_conditioning:.6f}, cond_timestep sum: {cond_timestep.float().sum().item():.6f}")
+        # Apply conditional timestep processing like Diffusers (lines 792-793)
+        cond_timestep = timestep
+        if hasattr(batch, 'cond_indicator') and batch.cond_indicator is not None:
+            # Exactly match Diffusers: cond_timestep = cond_indicator * t_conditioning + (1 - cond_indicator) * timestep
+            # First get t_conditioning (sigma_conditioning value from Diffusers)
+            sigma_conditioning = 0.0001  # Same as Diffusers default
+            t_conditioning = sigma_conditioning / (sigma_conditioning + 1)
+            cond_timestep = batch.cond_indicator * t_conditioning + (1 - batch.cond_indicator) * timestep
+            cond_timestep = cond_timestep.to(target_dtype)
+            if i < 3:
+                logger.info(f"Step {i}: Applied conditional timestep - t_conditioning: {t_conditioning:.6f}, cond_timestep sum: {cond_timestep.float().sum().item():.6f}")
 
         with set_forward_context(
             current_timestep=i,
@@ -767,7 +772,8 @@ def forward(
         ):
             # Use conditioning masks from CosmosLatentPreparationStage
             condition_mask = batch.cond_mask.to(target_dtype) if hasattr(batch, 'cond_mask') else None
-            padding_mask = torch.zeros(1, 1, cond_latent.shape[3], cond_latent.shape[4],
+            # Padding mask should match original image dimensions like Diffusers (704, 1280)
+            padding_mask = torch.zeros(1, 1, batch.height, batch.width,
                                        device=cond_latent.device, dtype=target_dtype)
 
             # Fallback if masks not available
@@ -786,10 +792,34 @@ def forward(
                 logger.info(f"  condition_mask shape: {condition_mask.shape if condition_mask is not None else None}")
                 logger.info(f"  padding_mask shape: {padding_mask.shape}")
 
+            # Log detailed transformer inputs for comparison with Diffusers
+            if i < 3:
+                print(f"FASTVIDEO TRANSFORMER INPUTS (step {i}):")
+                print(f"  hidden_states: shape={cond_latent.shape}, sum={cond_latent.float().sum().item():.6f}, mean={cond_latent.float().mean().item():.6f}")
+                print(f"  timestep: shape={cond_timestep.shape}, sum={cond_timestep.float().sum().item():.6f}, values={cond_timestep.flatten()[:5].float()}")
+                print(f"  encoder_hidden_states: shape={batch.prompt_embeds[0].shape}, sum={batch.prompt_embeds[0].float().sum().item():.6f}")
+                print(f"  condition_mask: shape={condition_mask.shape if condition_mask is not None else None}, sum={condition_mask.float().sum().item() if condition_mask is not None else None}")
+                print(f"  padding_mask: shape={padding_mask.shape}, sum={padding_mask.float().sum().item():.6f}")
+                print(f"  fps: {24}, target_dtype: {target_dtype}")
+                print(f"  DTYPES: hidden_states={cond_latent.dtype}, timestep={cond_timestep.dtype}, encoder_hidden_states={batch.prompt_embeds[0].dtype}")
+                print(f"  hidden_states first 5 values: {cond_latent.flatten()[:5].float()}")
+                print(f"  encoder_hidden_states first 5 values: {batch.prompt_embeds[0].flatten()[:5].float()}")
+                with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+                    f.write(f"FASTVIDEO TRANSFORMER INPUTS (step {i}):\n")
+                    f.write(f"  hidden_states: shape={cond_latent.shape}, sum={cond_latent.float().sum().item():.6f}, mean={cond_latent.float().mean().item():.6f}\n")
+                    f.write(f"  timestep: shape={cond_timestep.shape}, sum={cond_timestep.float().sum().item():.6f}, values={cond_timestep.flatten()[:5].float()}\n")
+                    f.write(f"  encoder_hidden_states: shape={batch.prompt_embeds[0].shape}, sum={batch.prompt_embeds[0].float().sum().item():.6f}\n")
+                    f.write(f"  condition_mask: shape={condition_mask.shape if condition_mask is not None else None}, sum={condition_mask.float().sum().item() if condition_mask is not None else None}\n")
+                    f.write(f"  padding_mask: shape={padding_mask.shape}, sum={padding_mask.float().sum().item():.6f}\n")
+                    f.write(f"  fps: {24}, target_dtype: {target_dtype}\n")
+                    f.write(f"  DTYPES: hidden_states={cond_latent.dtype}, timestep={cond_timestep.dtype}, encoder_hidden_states={batch.prompt_embeds[0].dtype}\n")
+                    f.write(f"  hidden_states first 5 values: {cond_latent.flatten()[:5].float()}\n")
+                    f.write(f"  encoder_hidden_states first 5 values: {batch.prompt_embeds[0].flatten()[:5].float()}\n")
+
             print(f"[FASTVIDEO DENOISING] About to call transformer with hidden_states sum = {cond_latent.float().sum().item()}")
             noise_pred = self.transformer(
-                hidden_states=cond_latent.to(target_dtype),
-                timestep=timestep.to(target_dtype),
+                hidden_states=cond_latent,  # Already converted to target_dtype above
+                timestep=cond_timestep.to(target_dtype),
                 encoder_hidden_states=batch.prompt_embeds[0].to(target_dtype),
                 fps=24,  # TODO: get fps from batch or config
                 condition_mask=condition_mask,
@@ -805,11 +835,7 @@ def forward(
             # Apply preconditioning exactly like diffusers
            cond_pred = (c_skip * latents + c_out * noise_pred.float()).to(target_dtype)
 
-            # Apply conditional indicator masking (from CosmosLatentPreparationStage)
-            if hasattr(batch, 'cond_indicator') and batch.cond_indicator is not None:
-                conditioning_latents = batch.conditioning_latents if batch.conditioning_latents is not None else torch.zeros_like(latents)
-                cond_pred = batch.cond_indicator * conditioning_latents + (1 - batch.cond_indicator) * cond_pred
-
+            # NOTE: Conditioning frame injection is applied to cond_latent BEFORE transformer call (line 746), not after
             # Classifier-free guidance
             if batch.do_classifier_free_guidance and batch.negative_prompt_embeds is not None:
                 # Unconditional pass - match diffusers logic (lines 755-759)
@@ -830,9 +856,17 @@ def forward(
                     logger.info(f"  negative_prompt_embeds shape: {batch.negative_prompt_embeds[0].shape}")
                     # sum: {uncond_timestep.float().sum().item():.6f}")
 
+                # Apply same conditional timestep processing for unconditional pass
+                uncond_timestep = timestep
+                if hasattr(batch, 'uncond_indicator') and batch.uncond_indicator is not None:
+                    sigma_conditioning = 0.0001  # Same as Diffusers default
+                    t_conditioning = sigma_conditioning / (sigma_conditioning + 1)
+                    uncond_timestep = batch.uncond_indicator * t_conditioning + (1 - batch.uncond_indicator) * timestep
+                    uncond_timestep = uncond_timestep.to(target_dtype)
+
                 noise_pred_uncond = self.transformer(
                     hidden_states=uncond_latent.to(target_dtype),
-                    timestep=timestep.to(target_dtype),
+                    timestep=uncond_timestep.to(target_dtype),
                     encoder_hidden_states=batch.negative_prompt_embeds[0].to(target_dtype),
                     fps=24,  # TODO: get fps from batch or config
                     condition_mask=uncond_condition_mask,
```
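
The heart of this file's change is the per-frame timestep blend: frames marked by `cond_indicator` (the injected conditioning frames) are given the near-zero timestep `t_conditioning = sigma_conditioning / (1 + sigma_conditioning) ≈ 0.0001`, i.e. treated as nearly clean, while all other frames keep the scheduler's timestep. A minimal sketch of that blend, assuming `cond_indicator` is a broadcastable 0/1 mask over frames; the shapes and the 0.75 timestep are illustrative:

```python
import torch

sigma_conditioning = 0.0001
t_conditioning = sigma_conditioning / (sigma_conditioning + 1)  # ~0.0001

timestep = torch.full((1, 1, 8, 1, 1), 0.75)  # per-frame scheduler timestep
cond_indicator = torch.zeros(1, 1, 8, 1, 1)   # 1 marks conditioning frames
cond_indicator[:, :, 0] = 1.0                 # frame 0 carries the input image

# Conditioning frames get ~0 (almost no noise); generated frames keep 0.75
cond_timestep = cond_indicator * t_conditioning + (1 - cond_indicator) * timestep
print(cond_timestep.flatten().tolist())
```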

fastvideo/pipelines/stages/latent_preparation.py

Lines changed: 16 additions & 22 deletions
```diff
@@ -240,36 +240,30 @@ def forward(
                 logger.info(f"CosmosLatentPreparationStage - Using 5D tensor as-is: {video.shape}")
             else:
                 logger.info("CosmosLatentPreparationStage - pil_image is not a tensor, needs preprocessing")
-                # Following diffusers approach for image-to-video preprocessing
-                # Convert PIL image to tensor and add temporal dimension
-                import torchvision.transforms as transforms
+                # Use same preprocessing as diffusers VideoProcessor
+                from diffusers.video_processor import VideoProcessor
 
-                # Create transform pipeline similar to diffusers VideoProcessor
-                transform = transforms.Compose([
-                    transforms.Resize((height, width), antialias=True),
-                    transforms.ToTensor(),
-                    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])  # Normalize to [-1, 1]
-                ])
+                # Create VideoProcessor with same parameters as diffusers Cosmos pipeline
+                vae_scale_factor_spatial = 8  # Same as diffusers
+                video_processor = VideoProcessor(vae_scale_factor=vae_scale_factor_spatial)
 
-                # Apply transform to get [C, H, W] tensor
-                image_tensor = transform(batch.pil_image)
-                logger.info(f"CosmosLatentPreparationStage - Transformed PIL to tensor: {image_tensor.shape}")
+                # Use exact same method as diffusers: preprocess image then unsqueeze for time dimension
+                processed_image = video_processor.preprocess(batch.pil_image, height, width)
+                logger.info(f"CosmosLatentPreparationStage - VideoProcessor preprocess result: shape={processed_image.shape}, dtype={processed_image.dtype}, device={processed_image.device}")
 
-                # Add batch dimension: [C, H, W] -> [B, C, H, W]
-                image_tensor = image_tensor.unsqueeze(0)
+                # Add time dimension exactly like diffusers: unsqueeze(2)
+                video = processed_image.unsqueeze(2)
+                logger.info(f"CosmosLatentPreparationStage - After unsqueeze(2): shape={video.shape}, dtype={video.dtype}, device={video.device}")
 
-                # Add time dimension like diffusers: [B, C, H, W] -> [B, C, T, H, W]
-                video = image_tensor.unsqueeze(2)  # Add time dim at position 2
-                logger.info(f"CosmosLatentPreparationStage - Added batch and time dims: {video.shape}")
-
-                # Move to correct device and ensure compatible dtype for VAE
-                # Use VAE's parameter dtype to avoid dtype mismatches
+                # Exactly match diffusers' device/dtype handling: to(device=device, dtype=vae_dtype)
+                # Get VAE dtype exactly like diffusers
                 if self.vae is not None:
-                    vae_dtype = next(self.vae.parameters()).dtype
+                    vae_dtype = next(self.vae.parameters()).dtype  # Get VAE's parameter dtype
                 else:
                     vae_dtype = dtype
+
                 video = video.to(device=device, dtype=vae_dtype)
-                logger.info(f"CosmosLatentPreparationStage - Video tensor device: {video.device}, dtype: {video.dtype}")
+                logger.info(f"CosmosLatentPreparationStage - After to(device, dtype): shape={video.shape}, dtype={video.dtype}, device={video.device}, vae_dtype={vae_dtype}")
             elif hasattr(batch, 'preprocessed_image') and batch.preprocessed_image is not None:
                 logger.info(f"CosmosLatentPreparationStage - Found preprocessed_image of type: {type(batch.preprocessed_image)}")
                 # Convert preprocessed image to video format
```
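
This swaps the hand-rolled torchvision transform for `diffusers`' own `VideoProcessor`, so resizing and [-1, 1] normalization follow the reference Cosmos pipeline exactly, with the time dimension added via `unsqueeze(2)`. A standalone sketch of that path, assuming a single RGB PIL image; the 704x1280 resolution is illustrative:

```python
import torch
from PIL import Image
from diffusers.video_processor import VideoProcessor

height, width = 704, 1280
video_processor = VideoProcessor(vae_scale_factor=8)  # spatial scale, as in diffusers

image = Image.new("RGB", (width, height))             # stand-in for batch.pil_image
processed = video_processor.preprocess(image, height, width)  # [B, C, H, W] in [-1, 1]
video = processed.unsqueeze(2)                        # [B, C, T=1, H, W]
print(video.shape)  # torch.Size([1, 3, 1, 704, 1280])
```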

fastvideo/pipelines/stages/text_encoding.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -90,6 +90,11 @@ def forward(
         # Write to output file
         with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
             f.write(f"TextEncodingStage: prompt_embeds sum = {sum_value:.6f}\n")
+
+        lengths = attention_mask.sum(dim=1).cpu()
+        for i, length in enumerate(lengths):
+            prompt_embeds[i, length:] = 0
+
         batch.prompt_embeds.append(prompt_embeds)
         if batch.prompt_attention_mask is not None:
             batch.prompt_attention_mask.append(attention_mask)
```
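
With `t5_large_postprocess_text` now returning the raw `last_hidden_state`, the encoder still emits nonzero vectors at padded token positions, so this loop zeroes everything past each prompt's valid length before the embeddings are stored. A minimal sketch with illustrative shapes:

```python
import torch

prompt_embeds = torch.randn(2, 6, 4)        # [B, L, D]
attention_mask = torch.tensor([[1, 1, 1, 0, 0, 0],
                               [1, 1, 1, 1, 1, 0]])

lengths = attention_mask.sum(dim=1)         # valid tokens per prompt: [3, 5]
for i, length in enumerate(lengths):
    prompt_embeds[i, length:] = 0           # zero out padded positions

assert prompt_embeds[0, 3:].abs().sum() == 0
assert prompt_embeds[1, 5:].abs().sum() == 0
```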
