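Align Cosmos pipeline with diffusers (scheduler sigmas, temporal VAE factor 4) and add debug tensor-sum logging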

kevin314 · kevin314 · commit 8fafbadb4bc3 · 2025-08-26T07:38:59.000Z
diff --git a/fastvideo/pipelines/basic/cosmos/cosmos_pipeline.py b/fastvideo/pipelines/basic/cosmos/cosmos_pipeline.py
@@ -35,36 +35,72 @@ def initialize_pipeline(self, fastvideo_args: FastVideoArgs):
 
         self.modules["scheduler"] = FlowMatchEulerDiscreteScheduler(
             shift=fastvideo_args.pipeline_config.flow_shift)
+        
+        # Configure Cosmos-specific scheduler parameters (matching diffusers)
+        # Source: diffusers src/diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py:209-219 (lines may drift across versions)
+        sigma_max = 80.0
+        sigma_min = 0.002
+        sigma_data = 1.0
+        final_sigmas_type = "sigma_min"
+        
+        if self.modules["scheduler"] is not None:
+            # Update scheduler config and attributes directly
+            scheduler = self.modules["scheduler"]
+            scheduler.config.sigma_max = sigma_max
+            scheduler.config.sigma_min = sigma_min
+            scheduler.config.sigma_data = sigma_data
+            scheduler.config.final_sigmas_type = final_sigmas_type
+            # Also set the direct attributes used by the scheduler
+            scheduler.sigma_max = sigma_max
+            scheduler.sigma_min = sigma_min
+            scheduler.sigma_data = sigma_data
 
     def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
         """Set up pipeline stages with proper dependency injection."""
 
+        # Input validation - corresponds to the diffusers check_inputs method
+        # Source: diffusers src/diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py:427-456 (lines may drift across versions)
         self.add_stage(stage_name="input_validation_stage",
                        stage=InputValidationStage())
 
+        # Text encoding - corresponds to the diffusers encode_prompt method
+        # Source: diffusers src/diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py:265-346 (lines may drift across versions)
+        # Also uses the _get_t5_prompt_embeds method: lines 222-262 of the same file
         self.add_stage(stage_name="prompt_encoding_stage",
                        stage=TextEncodingStage(
                            text_encoders=[self.get_module("text_encoder")],
                            tokenizers=[self.get_module("tokenizer")],
                        ))
 
+        # Conditioning preparation - part of the setup in the diffusers __call__ method
+        # Source: diffusers src/diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py:607-628 (lines may drift across versions)
         self.add_stage(stage_name="conditioning_stage",
                        stage=ConditioningStage())
 
+        # Timestep preparation - corresponds to the timestep setup in the diffusers __call__ method
+        # Source: diffusers src/diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py:630-637 (lines may drift across versions)
+        # Uses the retrieve_timesteps function: lines 81-137 of the same file
         self.add_stage(stage_name="timestep_preparation_stage",
                        stage=TimestepPreparationStage(
                            scheduler=self.get_module("scheduler")))
 
+        # Latent preparation - corresponds to the diffusers prepare_latents method
+        # Source: diffusers src/diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py:348-424 (lines may drift across versions)
+        # Also includes video preprocessing: lines 642-661 of the same file
         self.add_stage(stage_name="latent_preparation_stage",
                        stage=CosmosLatentPreparationStage(
                            scheduler=self.get_module("scheduler"),
                            transformer=self.get_module("transformer")))
 
+        # Denoising loop - corresponds to the main denoising loop in the diffusers __call__ method
+        # Source: diffusers src/diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py:673-752 (lines may drift across versions)
         self.add_stage(stage_name="denoising_stage",
                        stage=CosmosDenoisingStage(
                            transformer=self.get_module("transformer"),
                            scheduler=self.get_module("scheduler")))
 
+        # VAE decoding - corresponds to the final decoding section of the diffusers __call__ method
+        # Source: diffusers src/diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py:755-784 (lines may drift across versions)
         self.add_stage(stage_name="decoding_stage",
                        stage=DecodingStage(vae=self.get_module("vae")))
         
diff --git a/fastvideo/pipelines/stages/denoising.py b/fastvideo/pipelines/stages/denoising.py
@@ -292,6 +292,11 @@ def forward(
                             **image_kwargs,
                             **pos_cond_kwargs,
                         )
+                        sum_value = noise_pred.float().sum().item()
+                        logger.info(f"DenoisingStage: step {i}, noise_pred sum = {sum_value:.6f}")
+                        # Write to output file
+                        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+                            f.write(f"DenoisingStage: step {i}, noise_pred sum = {sum_value:.6f}\n")
 
                     # Apply guidance
                     if batch.do_classifier_free_guidance:
@@ -311,9 +316,19 @@ def forward(
                                 **image_kwargs,
                                 **neg_cond_kwargs,
                             )
+                        sum_value = noise_pred_uncond.float().sum().item()
+                        logger.info(f"DenoisingStage: step {i}, noise_pred_uncond sum = {sum_value:.6f}")
+                        # Write to output file
+                        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+                            f.write(f"DenoisingStage: step {i}, noise_pred_uncond sum = {sum_value:.6f}\n")
                         noise_pred_text = noise_pred
                         noise_pred = noise_pred_uncond + current_guidance_scale * (
                             noise_pred_text - noise_pred_uncond)
+                        sum_value = noise_pred.float().sum().item()
+                        logger.info(f"DenoisingStage: step {i}, final noise_pred sum = {sum_value:.6f}")
+                        # Write to output file
+                        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+                            f.write(f"DenoisingStage: step {i}, final noise_pred sum = {sum_value:.6f}\n")
 
                         # Apply guidance rescale if needed
                         if batch.guidance_rescale > 0.0:
@@ -329,6 +344,11 @@ def forward(
                                                   latents,
                                                   **extra_step_kwargs,
                                                   return_dict=False)[0]
+                    sum_value = latents.float().sum().item()
+                    logger.info(f"DenoisingStage: step {i}, updated latents sum = {sum_value:.6f}")
+                    # Write to output file
+                    with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+                        f.write(f"DenoisingStage: step {i}, updated latents sum = {sum_value:.6f}\n")
                 # Update progress bar
                 if i == len(timesteps) - 1 or (
                     (i + 1) > num_warmup_steps and
@@ -715,6 +735,11 @@ def forward(
                             padding_mask=padding_mask,
                             return_dict=False,
                         )[0]
+                        sum_value = cond_velocity.float().sum().item()
+                        logger.info(f"CosmosDenoisingStage: step {i}, cond_velocity sum = {sum_value:.6f}")
+                        # Write to output file
+                        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+                            f.write(f"CosmosDenoisingStage: step {i}, cond_velocity sum = {sum_value:.6f}\n")
                     
                     # Apply preconditioning and conditional masking
                     cond_pred = (c_skip * latents + c_out * cond_velocity.float()).to(target_dtype)
@@ -745,6 +770,11 @@ def forward(
                                 padding_mask=padding_mask,
                                 return_dict=False,
                             )[0]
+                            sum_value = uncond_velocity.float().sum().item()
+                            logger.info(f"CosmosDenoisingStage: step {i}, uncond_velocity sum = {sum_value:.6f}")
+                            # Write to output file
+                            with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+                                f.write(f"CosmosDenoisingStage: step {i}, uncond_velocity sum = {sum_value:.6f}\n")
                         
                         uncond_pred = (c_skip * latents + c_out * uncond_velocity.float()).to(target_dtype)
                         
@@ -755,6 +785,11 @@ def forward(
                         
                         # Apply guidance
                         noise_pred = cond_pred + guidance_scale * (cond_pred - uncond_pred)
+                        sum_value = noise_pred.float().sum().item()
+                        logger.info(f"CosmosDenoisingStage: step {i}, final noise_pred sum = {sum_value:.6f}")
+                        # Write to output file
+                        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+                            f.write(f"CosmosDenoisingStage: step {i}, final noise_pred sum = {sum_value:.6f}\n")
                     else:
                         noise_pred = cond_pred
                 
@@ -774,6 +809,11 @@ def forward(
                 # Standard scheduler step
                 latents_before = latents.clone()
                 latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+                sum_value = latents.float().sum().item()
+                logger.info(f"CosmosDenoisingStage: step {i}, updated latents sum = {sum_value:.6f}")
+                # Write to output file
+                with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+                    f.write(f"CosmosDenoisingStage: step {i}, updated latents sum = {sum_value:.6f}\n")
                 
                 # Debug: Check for NaN values after scheduler step
                 logger.info(f"Step {i}: After scheduler - latents NaN count: {torch.isnan(latents).sum()}")
diff --git a/fastvideo/pipelines/stages/latent_preparation.py b/fastvideo/pipelines/stages/latent_preparation.py
@@ -105,6 +105,11 @@ def forward(
         # Update batch with prepared latents
         batch.latents = latents
         batch.raw_latent_shape = latents.shape
+        sum_value = latents.float().sum().item()
+        logger.info(f"LatentPreparationStage: latents sum = {sum_value:.6f}")
+        # Write to output file
+        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+            f.write(f"LatentPreparationStage: latents sum = {sum_value:.6f}\n")
 
         return batch
 
@@ -162,9 +167,10 @@ def forward(
             raise ValueError("Height and width must be provided")
 
         # Calculate Cosmos-specific dimensions
-        # Note: Cosmos uses different scale factors than other models  
-        vae_scale_factor_spatial = 8  # Cosmos VAE spatial compression
-        vae_scale_factor_temporal = 8  # Cosmos VAE temporal compression
+        # Use the same VAE scale factors as diffusers to match their latent shapes
+        # Based on diffusers pipeline: lines 205-206
+        vae_scale_factor_spatial = 8  # Standard spatial compression (matches diffusers)
+        vae_scale_factor_temporal = 4  # Temporal compression (matches diffusers default)
         
         # Use same formula as diffusers cosmos pipeline
         num_latent_frames = (num_frames - 1) // vae_scale_factor_temporal + 1
@@ -217,6 +223,11 @@ def forward(
         # Store in batch
         batch.latents = latents
         batch.raw_latent_shape = latents.shape
+        sum_value = latents.float().sum().item()
+        logger.info(f"CosmosLatentPreparationStage: latents sum = {sum_value:.6f}, shape = {latents.shape}, sigma_max = {self.scheduler.sigma_max}")
+        # Write to output file
+        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+            f.write(f"CosmosLatentPreparationStage: latents sum = {sum_value:.6f}, shape = {latents.shape}, sigma_max = {self.scheduler.sigma_max}\n")
         
         # Store Cosmos-specific conditioning data
         batch.conditioning_latents = None  # No conditioning frames for now
diff --git a/fastvideo/pipelines/stages/text_encoding.py b/fastvideo/pipelines/stages/text_encoding.py
@@ -85,6 +85,11 @@ def forward(
                     output_hidden_states=True,
                 )
             prompt_embeds = postprocess_func(outputs)
+            sum_value = prompt_embeds.float().sum().item()
+            logger.info(f"TextEncodingStage: prompt_embeds sum = {sum_value:.6f}")
+            # Write to output file
+            with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+                f.write(f"TextEncodingStage: prompt_embeds sum = {sum_value:.6f}\n")
             batch.prompt_embeds.append(prompt_embeds)
             if batch.prompt_attention_mask is not None:
                 batch.prompt_attention_mask.append(attention_mask)
@@ -105,6 +110,11 @@ def forward(
                         output_hidden_states=True,
                     )
                 negative_prompt_embeds = postprocess_func(negative_outputs)
+                sum_value = negative_prompt_embeds.float().sum().item()
+                logger.info(f"TextEncodingStage: negative_prompt_embeds sum = {sum_value:.6f}")
+                # Write to output file
+                with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+                    f.write(f"TextEncodingStage: negative_prompt_embeds sum = {sum_value:.6f}\n")
 
                 assert batch.negative_prompt_embeds is not None
                 batch.negative_prompt_embeds.append(negative_prompt_embeds)
diff --git a/test2.py b/test2.py
@@ -0,0 +1,19 @@
+import torch
+from diffusers import Cosmos2VideoToWorldPipeline
+from diffusers.utils import export_to_video, load_image
+
+# Reference run of the diffusers Cosmos2 Video2World pipeline; the frames are exported to output.mp4.
+# Available checkpoints: nvidia/Cosmos-Predict2-2B-Video2World, nvidia/Cosmos-Predict2-14B-Video2World
+checkpoint = "nvidia/Cosmos-Predict2-2B-Video2World"
+pipeline = Cosmos2VideoToWorldPipeline.from_pretrained(checkpoint, torch_dtype=torch.bfloat16)
+pipeline.to("cuda")
+
+# Positive and negative text prompts passed to the pipeline call below.
+positive_text = "A close-up shot captures a vibrant yellow scrubber vigorously working on a grimy plate, its bristles moving in circular motions to lift stubborn grease and food residue. The dish, once covered in remnants of a hearty meal, gradually reveals its original glossy surface. Suds form and bubble around the scrubber, creating a satisfying visual of cleanliness in progress. The sound of scrubbing fills the air, accompanied by the gentle clinking of the dish against the sink. As the scrubber continues its task, the dish transforms, gleaming under the bright kitchen lights, symbolizing the triumph of cleanliness over mess."
+negative_text = "The video captures a series of frames showing ugly scenes, static with no motion, motion blur, over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, jerky movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. Overall, the video is of poor quality."
+# Input image handed to the pipeline as `image`.
+input_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/yellow-scrubber.png")
+# Default-device torch generator seeded with 1, handed to the pipeline as `generator`.
+seeded_generator = torch.Generator().manual_seed(1)
+result = pipeline(image=input_image, prompt=positive_text, negative_prompt=negative_text, generator=seeded_generator, num_frames=25)
+export_to_video(result.frames[0], "output.mp4", fps=16)
diff --git a/test_diffusers.py b/test_diffusers.py
diff --git a/test_fastvideo_pipeline.py b/test_fastvideo_pipeline.py