Skip to content

Commit 8fafbad

Browse files
committed
Update
1 parent cc197e7 commit 8fafbad

File tree

7 files changed

+245
-8
lines changed

7 files changed

+245
-8
lines changed

fastvideo/pipelines/basic/cosmos/cosmos_pipeline.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,36 +35,72 @@ def initialize_pipeline(self, fastvideo_args: FastVideoArgs):
3535

3636
self.modules["scheduler"] = FlowMatchEulerDiscreteScheduler(
3737
shift=fastvideo_args.pipeline_config.flow_shift)
38+
39+
# Configure Cosmos-specific scheduler parameters (matching diffusers)
40+
# Source: /workspace/diffusers/src/diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py:209-219
41+
sigma_max = 80.0
42+
sigma_min = 0.002
43+
sigma_data = 1.0
44+
final_sigmas_type = "sigma_min"
45+
46+
if self.modules["scheduler"] is not None:
47+
# Update scheduler config and attributes directly
48+
scheduler = self.modules["scheduler"]
49+
scheduler.config.sigma_max = sigma_max
50+
scheduler.config.sigma_min = sigma_min
51+
scheduler.config.sigma_data = sigma_data
52+
scheduler.config.final_sigmas_type = final_sigmas_type
53+
# Also set the direct attributes used by the scheduler
54+
scheduler.sigma_max = sigma_max
55+
scheduler.sigma_min = sigma_min
56+
scheduler.sigma_data = sigma_data
3857

3958
def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
    """Build each Cosmos pipeline stage and register it with ``add_stage``.

    Stages are registered in the order they appear below. Modules a stage
    needs (text encoder, tokenizer, scheduler, transformer, VAE) are
    obtained through ``self.get_module`` and handed to the stage
    constructor.

    The comments on each stage point at the matching section of the
    diffusers reference implementation,
    ``diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py``. The line
    ranges are the ones recorded when this mapping was written and may
    drift as diffusers changes.

    Args:
        fastvideo_args: Runtime arguments for the pipeline. Not referenced
            by the stage registrations below.
    """
    # Input validation -- mirrors diffusers ``check_inputs`` (lines 427-456).
    validation_stage = InputValidationStage()
    self.add_stage(stage_name="input_validation_stage",
                   stage=validation_stage)

    # Text encoding -- mirrors diffusers ``encode_prompt`` (lines 265-346),
    # which in turn uses ``_get_t5_prompt_embeds`` (lines 222-262).
    text_stage = TextEncodingStage(
        text_encoders=[self.get_module("text_encoder")],
        tokenizers=[self.get_module("tokenizer")],
    )
    self.add_stage(stage_name="prompt_encoding_stage", stage=text_stage)

    # Conditioning preparation -- setup portion of diffusers ``__call__``
    # (lines 607-628).
    conditioning_stage = ConditioningStage()
    self.add_stage(stage_name="conditioning_stage",
                   stage=conditioning_stage)

    # Timestep preparation -- timestep setup inside ``__call__``
    # (lines 630-637), built on ``retrieve_timesteps`` (lines 81-137).
    timestep_stage = TimestepPreparationStage(
        scheduler=self.get_module("scheduler"))
    self.add_stage(stage_name="timestep_preparation_stage",
                   stage=timestep_stage)

    # Latent preparation -- mirrors diffusers ``prepare_latents``
    # (lines 348-424) plus the video preprocessing at lines 642-661.
    latent_stage = CosmosLatentPreparationStage(
        scheduler=self.get_module("scheduler"),
        transformer=self.get_module("transformer"),
    )
    self.add_stage(stage_name="latent_preparation_stage", stage=latent_stage)

    # Denoising -- the main denoising loop of ``__call__`` (lines 673-752).
    denoise_stage = CosmosDenoisingStage(
        transformer=self.get_module("transformer"),
        scheduler=self.get_module("scheduler"),
    )
    self.add_stage(stage_name="denoising_stage", stage=denoise_stage)

    # VAE decoding -- final decoding section of ``__call__`` (lines 755-784).
    decode_stage = DecodingStage(vae=self.get_module("vae"))
    self.add_stage(stage_name="decoding_stage", stage=decode_stage)
70106

fastvideo/pipelines/stages/denoising.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,11 @@ def forward(
292292
**image_kwargs,
293293
**pos_cond_kwargs,
294294
)
295+
sum_value = noise_pred.float().sum().item()
296+
logger.info(f"DenoisingStage: step {i}, noise_pred sum = {sum_value:.6f}")
297+
# Write to output file
298+
with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
299+
f.write(f"DenoisingStage: step {i}, noise_pred sum = {sum_value:.6f}\n")
295300

296301
# Apply guidance
297302
if batch.do_classifier_free_guidance:
@@ -311,9 +316,19 @@ def forward(
311316
**image_kwargs,
312317
**neg_cond_kwargs,
313318
)
319+
sum_value = noise_pred_uncond.float().sum().item()
320+
logger.info(f"DenoisingStage: step {i}, noise_pred_uncond sum = {sum_value:.6f}")
321+
# Write to output file
322+
with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
323+
f.write(f"DenoisingStage: step {i}, noise_pred_uncond sum = {sum_value:.6f}\n")
314324
noise_pred_text = noise_pred
315325
noise_pred = noise_pred_uncond + current_guidance_scale * (
316326
noise_pred_text - noise_pred_uncond)
327+
sum_value = noise_pred.float().sum().item()
328+
logger.info(f"DenoisingStage: step {i}, final noise_pred sum = {sum_value:.6f}")
329+
# Write to output file
330+
with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
331+
f.write(f"DenoisingStage: step {i}, final noise_pred sum = {sum_value:.6f}\n")
317332

318333
# Apply guidance rescale if needed
319334
if batch.guidance_rescale > 0.0:
@@ -329,6 +344,11 @@ def forward(
329344
latents,
330345
**extra_step_kwargs,
331346
return_dict=False)[0]
347+
sum_value = latents.float().sum().item()
348+
logger.info(f"DenoisingStage: step {i}, updated latents sum = {sum_value:.6f}")
349+
# Write to output file
350+
with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
351+
f.write(f"DenoisingStage: step {i}, updated latents sum = {sum_value:.6f}\n")
332352
# Update progress bar
333353
if i == len(timesteps) - 1 or (
334354
(i + 1) > num_warmup_steps and
@@ -715,6 +735,11 @@ def forward(
715735
padding_mask=padding_mask,
716736
return_dict=False,
717737
)[0]
738+
sum_value = cond_velocity.float().sum().item()
739+
logger.info(f"CosmosDenoisingStage: step {i}, cond_velocity sum = {sum_value:.6f}")
740+
# Write to output file
741+
with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
742+
f.write(f"CosmosDenoisingStage: step {i}, cond_velocity sum = {sum_value:.6f}\n")
718743

719744
# Apply preconditioning and conditional masking
720745
cond_pred = (c_skip * latents + c_out * cond_velocity.float()).to(target_dtype)
@@ -745,6 +770,11 @@ def forward(
745770
padding_mask=padding_mask,
746771
return_dict=False,
747772
)[0]
773+
sum_value = uncond_velocity.float().sum().item()
774+
logger.info(f"CosmosDenoisingStage: step {i}, uncond_velocity sum = {sum_value:.6f}")
775+
# Write to output file
776+
with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
777+
f.write(f"CosmosDenoisingStage: step {i}, uncond_velocity sum = {sum_value:.6f}\n")
748778

749779
uncond_pred = (c_skip * latents + c_out * uncond_velocity.float()).to(target_dtype)
750780

@@ -755,6 +785,11 @@ def forward(
755785

756786
# Apply guidance
757787
noise_pred = cond_pred + guidance_scale * (cond_pred - uncond_pred)
788+
sum_value = noise_pred.float().sum().item()
789+
logger.info(f"CosmosDenoisingStage: step {i}, final noise_pred sum = {sum_value:.6f}")
790+
# Write to output file
791+
with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
792+
f.write(f"CosmosDenoisingStage: step {i}, final noise_pred sum = {sum_value:.6f}\n")
758793
else:
759794
noise_pred = cond_pred
760795

@@ -774,6 +809,11 @@ def forward(
774809
# Standard scheduler step
775810
latents_before = latents.clone()
776811
latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
812+
sum_value = latents.float().sum().item()
813+
logger.info(f"CosmosDenoisingStage: step {i}, updated latents sum = {sum_value:.6f}")
814+
# Write to output file
815+
with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
816+
f.write(f"CosmosDenoisingStage: step {i}, updated latents sum = {sum_value:.6f}\n")
777817

778818
# Debug: Check for NaN values after scheduler step
779819
logger.info(f"Step {i}: After scheduler - latents NaN count: {torch.isnan(latents).sum()}")

fastvideo/pipelines/stages/latent_preparation.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,11 @@ def forward(
105105
# Update batch with prepared latents
106106
batch.latents = latents
107107
batch.raw_latent_shape = latents.shape
108+
sum_value = latents.float().sum().item()
109+
logger.info(f"LatentPreparationStage: latents sum = {sum_value:.6f}")
110+
# Write to output file
111+
with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
112+
f.write(f"LatentPreparationStage: latents sum = {sum_value:.6f}\n")
108113

109114
return batch
110115

@@ -162,9 +167,10 @@ def forward(
162167
raise ValueError("Height and width must be provided")
163168

164169
# Calculate Cosmos-specific dimensions
165-
# Note: Cosmos uses different scale factors than other models
166-
vae_scale_factor_spatial = 8 # Cosmos VAE spatial compression
167-
vae_scale_factor_temporal = 8 # Cosmos VAE temporal compression
170+
# Use the same VAE scale factors as diffusers to match their latent shapes
171+
# Based on diffusers pipeline: lines 205-206
172+
vae_scale_factor_spatial = 8 # Standard spatial compression (matches diffusers)
173+
vae_scale_factor_temporal = 4 # Temporal compression (matches diffusers default)
168174

169175
# Use same formula as diffusers cosmos pipeline
170176
num_latent_frames = (num_frames - 1) // vae_scale_factor_temporal + 1
@@ -217,6 +223,11 @@ def forward(
217223
# Store in batch
218224
batch.latents = latents
219225
batch.raw_latent_shape = latents.shape
226+
sum_value = latents.float().sum().item()
227+
logger.info(f"CosmosLatentPreparationStage: latents sum = {sum_value:.6f}, shape = {latents.shape}, sigma_max = {self.scheduler.sigma_max}")
228+
# Write to output file
229+
with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
230+
f.write(f"CosmosLatentPreparationStage: latents sum = {sum_value:.6f}, shape = {latents.shape}, sigma_max = {self.scheduler.sigma_max}\n")
220231

221232
# Store Cosmos-specific conditioning data
222233
batch.conditioning_latents = None # No conditioning frames for now

fastvideo/pipelines/stages/text_encoding.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,11 @@ def forward(
8585
output_hidden_states=True,
8686
)
8787
prompt_embeds = postprocess_func(outputs)
88+
sum_value = prompt_embeds.float().sum().item()
89+
logger.info(f"TextEncodingStage: prompt_embeds sum = {sum_value:.6f}")
90+
# Write to output file
91+
with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
92+
f.write(f"TextEncodingStage: prompt_embeds sum = {sum_value:.6f}\n")
8893
batch.prompt_embeds.append(prompt_embeds)
8994
if batch.prompt_attention_mask is not None:
9095
batch.prompt_attention_mask.append(attention_mask)
@@ -105,6 +110,11 @@ def forward(
105110
output_hidden_states=True,
106111
)
107112
negative_prompt_embeds = postprocess_func(negative_outputs)
113+
sum_value = negative_prompt_embeds.float().sum().item()
114+
logger.info(f"TextEncodingStage: negative_prompt_embeds sum = {sum_value:.6f}")
115+
# Write to output file
116+
with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
117+
f.write(f"TextEncodingStage: negative_prompt_embeds sum = {sum_value:.6f}\n")
108118

109119
assert batch.negative_prompt_embeds is not None
110120
batch.negative_prompt_embeds.append(negative_prompt_embeds)

test2.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
# Reference run of the diffusers Cosmos2 video-to-world pipeline: load a
# checkpoint, generate a short clip from one conditioning image plus a text
# prompt, and export the result to ``output.mp4``.
import torch
from diffusers import Cosmos2VideoToWorldPipeline
from diffusers.utils import export_to_video, load_image

# Available checkpoints:
#   nvidia/Cosmos-Predict2-2B-Video2World
#   nvidia/Cosmos-Predict2-14B-Video2World
checkpoint = "nvidia/Cosmos-Predict2-2B-Video2World"
pipeline = Cosmos2VideoToWorldPipeline.from_pretrained(
    checkpoint,
    torch_dtype=torch.bfloat16,
)
pipeline.to("cuda")

positive_text = "A close-up shot captures a vibrant yellow scrubber vigorously working on a grimy plate, its bristles moving in circular motions to lift stubborn grease and food residue. The dish, once covered in remnants of a hearty meal, gradually reveals its original glossy surface. Suds form and bubble around the scrubber, creating a satisfying visual of cleanliness in progress. The sound of scrubbing fills the air, accompanied by the gentle clinking of the dish against the sink. As the scrubber continues its task, the dish transforms, gleaming under the bright kitchen lights, symbolizing the triumph of cleanliness over mess."
negative_text = "The video captures a series of frames showing ugly scenes, static with no motion, motion blur, over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, jerky movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. Overall, the video is of poor quality."

# Conditioning image fetched from the Hugging Face documentation assets.
first_frame = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/yellow-scrubber.png"
)

# Generator seeded with 1 and passed to the pipeline call (presumably so
# runs are repeatable -- confirm against the diffusers docs).
rng = torch.Generator().manual_seed(1)
result = pipeline(
    image=first_frame,
    prompt=positive_text,
    negative_prompt=negative_text,
    generator=rng,
    num_frames=25,
)

# ``frames[0]`` selects the first entry of the returned frames.
clip = result.frames[0]
export_to_video(clip, "output.mp4", fps=16)

0 commit comments

Comments
 (0)