Skip to content

Commit 7a15767

Browse files
committed
refactor
1 parent 27441fc commit 7a15767

File tree

5 files changed

+47
-70
lines changed

5 files changed

+47
-70
lines changed

scripts/convert_cogvideox_to_diffusers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,7 @@ def get_init_kwargs(version: str):
213213
"patch_bias": False,
214214
"sample_height": 768 // vae_scale_factor_spatial,
215215
"sample_width": 1360 // vae_scale_factor_spatial,
216-
"sample_frames": 85,
216+
"sample_frames": 81,
217217
}
218218
else:
219219
raise ValueError("Unsupported version of CogVideoX.")

src/diffusers/pipelines/cogvideo/pipeline_cogvideox.py

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -334,10 +334,6 @@ def prepare_latents(
334334
width // self.vae_scale_factor_spatial,
335335
)
336336

337-
# For CogVideoX1.5, the latent should add 1 for padding (Not use)
338-
if self.transformer.config.patch_size_t is not None:
339-
shape = shape[:1] + (shape[1] + shape[1] % self.transformer.config.patch_size_t,) + shape[2:]
340-
341337
if latents is None:
342338
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
343339
else:
@@ -648,7 +644,16 @@ def __call__(
648644
timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
649645
self._num_timesteps = len(timesteps)
650646

651-
# 5. Prepare latents.
647+
# 5. Prepare latents
648+
latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
649+
650+
# For CogVideoX 1.5, the latent frames must be padded so that their count is divisible by patch_size_t
651+
patch_size_t = self.transformer.config.patch_size_t
652+
additional_frames = 0
653+
if patch_size_t is not None and latent_frames % patch_size_t != 0:
654+
additional_frames = patch_size_t - latent_frames % patch_size_t
655+
num_frames += additional_frames * self.vae_scale_factor_temporal
656+
652657
latent_channels = self.transformer.config.in_channels
653658
latents = self.prepare_latents(
654659
batch_size * num_videos_per_prompt,
@@ -738,13 +743,9 @@ def __call__(
738743
progress_bar.update()
739744

740745
if not output_type == "latent":
741-
# Calculate the number of start frames based on the size of the second dimension of latents
742-
num_latent_frames = latents.size(1) # Get the size of the second dimension
743-
# (81 - 1) / 4 + 1 = 21 and latents is 22, so the first frames will be 22 - 1 = 1, and we will skip frames 0
744-
start_frames = num_latent_frames - ((num_frames - 1) // self.vae_scale_factor_temporal + 1)
745-
746-
# Slice latents starting from start_frames
747-
video = self.decode_latents(latents[:, start_frames:])
746+
# Discard any padding frames that were added for CogVideoX 1.5
747+
latents = latents[:, additional_frames:]
748+
video = self.decode_latents(latents)
748749
video = self.video_processor.postprocess_video(video=video, output_type=output_type)
749750
else:
750751
video = latents

src/diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -412,7 +412,6 @@ def check_inputs(
412412
prompt,
413413
height,
414414
width,
415-
num_frames,
416415
negative_prompt,
417416
callback_on_step_end_tensor_inputs,
418417
prompt_embeds=None,
@@ -423,15 +422,6 @@ def check_inputs(
423422
if height % 8 != 0 or width % 8 != 0:
424423
raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
425424

426-
latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
427-
if (
428-
self.transformer.config.patch_size_t is not None
429-
and latent_frames % self.transformer.config.patch_size_t != 0
430-
):
431-
raise ValueError(
432-
f"Number of latent frames must be divisible by `{self.transformer.config.patch_size_t}` but got {latent_frames=}."
433-
)
434-
435425
if callback_on_step_end_tensor_inputs is not None and not all(
436426
k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
437427
):
@@ -663,7 +653,6 @@ def __call__(
663653
prompt,
664654
height,
665655
width,
666-
num_frames,
667656
negative_prompt,
668657
callback_on_step_end_tensor_inputs,
669658
prompt_embeds,
@@ -708,7 +697,17 @@ def __call__(
708697
timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
709698
self._num_timesteps = len(timesteps)
710699

711-
# 5. Prepare latents.
700+
# 5. Prepare latents
701+
latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
702+
703+
# For CogVideoX 1.5, the number of latent frames must already be divisible by patch_size_t (no padding is applied here)
704+
patch_size_t = self.transformer.config.patch_size_t
705+
if patch_size_t is not None and latent_frames % patch_size_t != 0:
706+
raise ValueError(
707+
f"The number of latent frames must be divisible by `{patch_size_t=}` but the given video "
708+
f"contains {latent_frames=}, which is not divisible."
709+
)
710+
712711
latent_channels = self.transformer.config.in_channels // 2
713712
latents = self.prepare_latents(
714713
batch_size * num_videos_per_prompt,

src/diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py

Lines changed: 12 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -450,7 +450,6 @@ def check_inputs(
450450
prompt,
451451
height,
452452
width,
453-
num_frames,
454453
negative_prompt,
455454
callback_on_step_end_tensor_inputs,
456455
latents=None,
@@ -470,15 +469,6 @@ def check_inputs(
470469
if height % 8 != 0 or width % 8 != 0:
471470
raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
472471

473-
# latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
474-
# if (
475-
# self.transformer.config.patch_size_t is not None
476-
# and latent_frames % self.transformer.config.patch_size_t != 0
477-
# ):
478-
# raise ValueError(
479-
# f"Number of latent frames must be divisible by `{self.transformer.config.patch_size_t}` but got {latent_frames=}."
480-
# )
481-
482472
if callback_on_step_end_tensor_inputs is not None and not all(
483473
k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
484474
):
@@ -705,7 +695,6 @@ def __call__(
705695
image=image,
706696
prompt=prompt,
707697
height=height,
708-
num_frames=num_frames,
709698
width=width,
710699
negative_prompt=negative_prompt,
711700
callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
@@ -751,22 +740,15 @@ def __call__(
751740
self._num_timesteps = len(timesteps)
752741

753742
# 5. Prepare latents
754-
# TODO: Only CogVideoX1.5-5B-I2V can use this method. Need to Change
755-
def adjust_resolution_to_divisible(image_height, image_width, tgt_height, tgt_width, divisor=16):
756-
# Step 1: Compare image dimensions with target dimensions
757-
if image_height > tgt_height:
758-
image_height = tgt_height
759-
if image_width > tgt_width:
760-
image_width = tgt_width
761-
762-
# Step 2: Ensure height and width are divisible by the divisor
763-
image_height = (image_height // divisor) * divisor
764-
image_width = (image_width // divisor) * divisor
765-
return image_height, image_width
766-
767-
image_width, image_height = image.size[-2:]
768-
769-
height, width = adjust_resolution_to_divisible(image_height, image_width, height, width)
743+
latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
744+
745+
# For CogVideoX 1.5, the latent frames must be padded so that their count is divisible by patch_size_t
746+
patch_size_t = self.transformer.config.patch_size_t
747+
additional_frames = 0
748+
if patch_size_t is not None and latent_frames % patch_size_t != 0:
749+
additional_frames = patch_size_t - latent_frames % patch_size_t
750+
num_frames += additional_frames * self.vae_scale_factor_temporal
751+
770752
image = self.video_processor.preprocess(image, height=height, width=width).to(
771753
device, dtype=prompt_embeds.dtype
772754
)
@@ -863,13 +845,9 @@ def adjust_resolution_to_divisible(image_height, image_width, tgt_height, tgt_wi
863845
progress_bar.update()
864846

865847
if not output_type == "latent":
866-
# Calculate the number of start frames based on the size of the second dimension of latents
867-
num_latent_frames = latents.size(1) # Get the size of the second dimension
868-
# (81 - 1) / 4 + 1 = 21 and latents is 22, so the first frames will be 22 - 1 = 1, and we will skip frames 0
869-
start_frames = num_latent_frames - ((num_frames - 1) // self.vae_scale_factor_temporal + 1)
870-
871-
# Slice latents starting from start_frames
872-
video = self.decode_latents(latents[:, start_frames:])
848+
# Discard any padding frames that were added for CogVideoX 1.5
849+
latents = latents[:, additional_frames:]
850+
video = self.decode_latents(latents)
873851
video = self.video_processor.postprocess_video(video=video, output_type=output_type)
874852
else:
875853
video = latents

src/diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -438,7 +438,6 @@ def check_inputs(
438438
prompt,
439439
height,
440440
width,
441-
num_frames,
442441
strength,
443442
negative_prompt,
444443
callback_on_step_end_tensor_inputs,
@@ -450,15 +449,6 @@ def check_inputs(
450449
if height % 8 != 0 or width % 8 != 0:
451450
raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
452451

453-
latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
454-
if (
455-
self.transformer.config.patch_size_t is not None
456-
and latent_frames % self.transformer.config.patch_size_t != 0
457-
):
458-
raise ValueError(
459-
f"Number of latent frames must be divisible by `{self.transformer.config.patch_size_t}` but got {latent_frames=}."
460-
)
461-
462452
if strength < 0 or strength > 1:
463453
raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
464454

@@ -688,7 +678,6 @@ def __call__(
688678
prompt=prompt,
689679
height=height,
690680
width=width,
691-
num_frames=num_frames,
692681
strength=strength,
693682
negative_prompt=negative_prompt,
694683
callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
@@ -737,6 +726,16 @@ def __call__(
737726
self._num_timesteps = len(timesteps)
738727

739728
# 5. Prepare latents
729+
latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
730+
731+
# For CogVideoX 1.5, the number of latent frames must already be divisible by patch_size_t (no padding is applied here)
732+
patch_size_t = self.transformer.config.patch_size_t
733+
if patch_size_t is not None and latent_frames % patch_size_t != 0:
734+
raise ValueError(
735+
f"The number of latent frames must be divisible by `{patch_size_t=}` but the given video "
736+
f"contains {latent_frames=}, which is not divisible."
737+
)
738+
740739
if latents is None:
741740
video = self.video_processor.preprocess_video(video, height=height, width=width)
742741
video = video.to(device=device, dtype=prompt_embeds.dtype)

0 commit comments

Comments
 (0)