
Commit b94c704

use even number of latent frames only
1 parent be8aff7 commit b94c704

3 files changed: +10 −13 lines changed

scripts/convert_cogvideox_to_diffusers.py

Lines changed: 1 addition & 1 deletion
@@ -213,7 +213,7 @@ def get_init_kwargs(version: str):
             "patch_bias": False,
             "sample_height": 768 // vae_scale_factor_spatial,
             "sample_width": 1360 // vae_scale_factor_spatial,
-            "sample_frames": 81,  # TODO: Need Test with 161 for 10 seconds
+            "sample_frames": 85,
         }
     else:
         raise ValueError("Unsupported version of CogVideoX.")
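
Why 85 frames: with a temporal VAE compression of 4, pixel frames map to latent frames as (num_frames - 1) // 4 + 1, so the old default of 81 gave 21 latent frames (odd) while 85 gives 22, which divides evenly by the temporal patch size. A quick sanity check in Python, where the compression factor of 4 and a temporal patch size of 2 are assumed CogVideoX 1.5 values rather than read from a config:

vae_scale_factor_temporal = 4  # assumed temporal VAE compression
patch_size_t = 2               # assumed temporal patch size of the 1.5 transformer

for sample_frames in (81, 85):
    latent_frames = (sample_frames - 1) // vae_scale_factor_temporal + 1
    print(sample_frames, latent_frames, latent_frames % patch_size_t == 0)
# 81 -> 21 latent frames, not divisible by 2
# 85 -> 22 latent frames, divisible by 2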

src/diffusers/models/transformers/cogvideox_transformer_3d.py

Lines changed: 3 additions & 11 deletions
@@ -466,16 +466,6 @@ def forward(
             emb = emb + emb_ofs
 
         # 2. Patch embedding
-        p = self.config.patch_size
-        p_t = self.config.patch_size_t
-
-        # We know that the hidden states height and width will always be divisible by patch_size.
-        # But, the number of frames may not be divisible by patch_size_t. So, we pad with the beginning frames.
-        if p_t is not None:
-            remaining_frames = p_t - num_frames % p_t
-            first_frame = hidden_states[:, :1].repeat(1, 1 + remaining_frames, 1, 1, 1)
-            hidden_states = torch.cat([first_frame, hidden_states[:, 1:]], dim=1)
-
         hidden_states = self.patch_embed(encoder_hidden_states, hidden_states)
         hidden_states = self.embedding_dropout(hidden_states)

@@ -524,6 +514,9 @@ def custom_forward(*inputs):
         hidden_states = self.proj_out(hidden_states)
 
         # 5. Unpatchify
+        p = self.config.patch_size
+        p_t = self.config.patch_size_t
+
         if p_t is None:
             output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, -1, p, p)
             output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)

@@ -532,7 +525,6 @@ def custom_forward(*inputs):
                 batch_size, (num_frames + p_t - 1) // p_t, height // p, width // p, -1, p_t, p, p
             )
             output = output.permute(0, 1, 5, 4, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(1, 2)
-            output = output[:, remaining_frames:]
 
         if USE_PEFT_BACKEND:
             # remove `lora_scale` from each PEFT layer
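
The forward pass no longer pads the latent video with repeats of the first frame to make the frame count divisible by patch_size_t, and no longer crops those padded frames back out after unpatchify; p and p_t are now read only where the unpatchify reshape needs them. A minimal standalone sketch of that reshape with toy shapes (the batch size, channel count, and spatial sizes below are illustrative assumptions, not the model's real dimensions):

import torch

batch_size, num_frames, height, width = 1, 22, 8, 8  # 22 latent frames, divisible by p_t
channels, p, p_t = 16, 2, 2

# One token per (p_t, p, p) patch, as the sequence leaves proj_out.
seq_len = (num_frames // p_t) * (height // p) * (width // p)
hidden_states = torch.randn(batch_size, seq_len, channels * p_t * p * p)

# Unpatchify as in the diff: because num_frames % p_t == 0,
# (num_frames + p_t - 1) // p_t == num_frames // p_t and nothing needs cropping.
output = hidden_states.reshape(
    batch_size, (num_frames + p_t - 1) // p_t, height // p, width // p, -1, p_t, p, p
)
output = output.permute(0, 1, 5, 4, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(1, 2)
print(output.shape)  # torch.Size([1, 22, 16, 8, 8])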

src/diffusers/pipelines/cogvideo/pipeline_cogvideox.py

Lines changed: 6 additions & 1 deletion
@@ -368,12 +368,12 @@ def prepare_extra_step_kwargs(self, generator, eta):
             extra_step_kwargs["generator"] = generator
         return extra_step_kwargs
 
-    # Copied from diffusers.pipelines.latte.pipeline_latte.LattePipeline.check_inputs
     def check_inputs(
         self,
         prompt,
         height,
         width,
+        num_frames,
         negative_prompt,
         callback_on_step_end_tensor_inputs,
         prompt_embeds=None,

@@ -382,6 +382,10 @@ def check_inputs(
         if height % 8 != 0 or width % 8 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
 
+        latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+        if self.transformer.config.patch_size_t is not None and latent_frames % self.transformer.config.patch_size_t != 0:
+            raise ValueError(f"Number of latent frames must be divisible by `{self.transformer.config.patch_size_t}` but got {latent_frames=}.")
+
         if callback_on_step_end_tensor_inputs is not None and not all(
             k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
         ):

@@ -602,6 +606,7 @@ def __call__(
             prompt,
             height,
             width,
+            num_frames,
             negative_prompt,
             callback_on_step_end_tensor_inputs,
             prompt_embeds,
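
check_inputs now receives num_frames and fails fast when the derived latent frame count is not divisible by the transformer's temporal patch size, instead of the transformer silently padding and cropping frames. A rough sketch of which pixel-space frame counts pass the new check, assuming the CogVideoX values of 4 for vae_scale_factor_temporal and 2 for patch_size_t (in the real pipeline both come from the loaded components):

def valid_num_frames(max_frames=100, vae_scale_factor_temporal=4, patch_size_t=2):
    # Collect frame counts whose latent length (num_frames - 1) // scale + 1
    # is a multiple of the temporal patch size.
    valid = []
    for num_frames in range(1, max_frames + 1):
        latent_frames = (num_frames - 1) // vae_scale_factor_temporal + 1
        if latent_frames % patch_size_t == 0:
            valid.append(num_frames)
    return valid

print(valid_num_frames())
# [5, 6, 7, 8, 13, 14, 15, 16, 21, ..., 85, ..., 96]; 85 is the new sample_frames default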
