
Commit 048a5f0

update pipeline implementations

1 parent: b94c704

3 files changed (+40, -10 lines)

src/diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py

Lines changed: 15 additions & 6 deletions

```diff
@@ -412,6 +412,7 @@ def check_inputs(
         prompt,
         height,
         width,
+        num_frames,
         negative_prompt,
         callback_on_step_end_tensor_inputs,
         prompt_embeds=None,
@@ -421,6 +422,10 @@ def check_inputs(
     ):
         if height % 8 != 0 or width % 8 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+        if self.transformer.config.patch_size_t is not None and latent_frames % self.transformer.config.patch_size_t != 0:
+            raise ValueError(f"Number of latent frames must be divisible by `{self.transformer.config.patch_size_t}` but got {latent_frames=}.")

         if callback_on_step_end_tensor_inputs is not None and not all(
             k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
@@ -533,8 +538,8 @@ def __call__(
         prompt: Optional[Union[str, List[str]]] = None,
         negative_prompt: Optional[Union[str, List[str]]] = None,
         control_video: Optional[List[Image.Image]] = None,
-        height: int = 480,
-        width: int = 720,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
         num_inference_steps: int = 50,
         timesteps: Optional[List[int]] = None,
         guidance_scale: float = 6,
@@ -638,14 +643,22 @@ def __call__(

         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        if control_video is not None and isinstance(control_video[0], Image.Image):
+            control_video = [control_video]

+        height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial
+        width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial
+        num_frames = len(control_video[0]) if control_video is not None else control_video_latents.size(2)
+
         num_videos_per_prompt = 1

         # 1. Check inputs. Raise error if not correct
         self.check_inputs(
             prompt,
             height,
             width,
+            num_frames,
             negative_prompt,
             callback_on_step_end_tensor_inputs,
             prompt_embeds,
@@ -665,9 +678,6 @@ def __call__(
         else:
             batch_size = prompt_embeds.shape[0]

-        if control_video is not None and isinstance(control_video[0], Image.Image):
-            control_video = [control_video]
-
         device = self._execution_device

         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
@@ -695,7 +705,6 @@ def __call__(

         # 5. Prepare latents.
         latent_channels = self.transformer.config.in_channels // 2
-        num_frames = len(control_video[0]) if control_video is not None else control_video_latents.size(2)
         latents = self.prepare_latents(
             batch_size * num_videos_per_prompt,
             latent_channels,
```
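All three pipelines gain the same validation: `num_frames` is mapped to a latent frame count through the causal VAE's temporal compression, and that count must divide evenly by the transformer's temporal patch size when one is configured. In this file the `num_frames` computation also moves ahead of `check_inputs` so the new check can see it. Below is a minimal standalone sketch of the check; the concrete values (`vae_scale_factor_temporal=4`, `patch_size_t=2`) are illustrative assumptions, not read from this diff.

```python
from typing import Optional

def check_frame_count(
    num_frames: int,
    vae_scale_factor_temporal: int = 4,   # assumed value for illustration
    patch_size_t: Optional[int] = 2,      # assumed; None disables the check
) -> int:
    # The causal VAE keeps the first frame and compresses each following
    # group of `vae_scale_factor_temporal` frames into one latent frame.
    latent_frames = (num_frames - 1) // vae_scale_factor_temporal + 1
    # The transformer patchifies the time axis in chunks of `patch_size_t`,
    # so the latent frame count must divide evenly.
    if patch_size_t is not None and latent_frames % patch_size_t != 0:
        raise ValueError(
            f"Number of latent frames must be divisible by `{patch_size_t}` "
            f"but got {latent_frames=}."
        )
    return latent_frames

print(check_frame_count(53))  # (53 - 1) // 4 + 1 = 14 latent frames -> passes
check_frame_count(49)         # (49 - 1) // 4 + 1 = 13 latent frames -> raises
```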

src/diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py

Lines changed: 13 additions & 2 deletions

```diff
@@ -440,6 +440,7 @@ def check_inputs(
         prompt,
         height,
         width,
+        num_frames,
         negative_prompt,
         callback_on_step_end_tensor_inputs,
         latents=None,
@@ -459,6 +460,10 @@ def check_inputs(
         if height % 8 != 0 or width % 8 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

+        latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+        if self.transformer.config.patch_size_t is not None and latent_frames % self.transformer.config.patch_size_t != 0:
+            raise ValueError(f"Number of latent frames must be divisible by `{self.transformer.config.patch_size_t}` but got {latent_frames=}.")
+
         if callback_on_step_end_tensor_inputs is not None and not all(
             k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
         ):
@@ -567,8 +572,8 @@ def __call__(
         image: PipelineImageInput,
         prompt: Optional[Union[str, List[str]]] = None,
         negative_prompt: Optional[Union[str, List[str]]] = None,
-        height: int = 768,
-        width: int = 1360,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
         num_frames: int = 49,
         num_inference_steps: int = 50,
         timesteps: Optional[List[int]] = None,
@@ -674,12 +679,18 @@ def __call__(
         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs

+        height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial
+        width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial
+        num_frames = num_frames or self.transformer.config.sample_frames
+
         num_videos_per_prompt = 1
+
         # 1. Check inputs. Raise error if not correct
         self.check_inputs(
             image=image,
             prompt=prompt,
             height=height,
+            num_frames=num_frames,
             width=width,
             negative_prompt=negative_prompt,
             callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
```
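Across the three pipelines, the hardcoded resolution defaults (768x1360 here, 480x720 in the control and video-to-video variants) give way to values derived from the transformer config, so each checkpoint supplies its own native resolution. Here is a sketch of how the fallback resolves, using hypothetical config numbers chosen only so the arithmetic reproduces this file's old defaults.

```python
from types import SimpleNamespace

# Hypothetical config: sample_height/sample_width are latent-space sizes,
# and 8 is an assumed spatial VAE scale factor. Picked so the result
# matches the removed 768x1360 defaults of this pipeline.
config = SimpleNamespace(sample_height=96, sample_width=170, sample_frames=49)
vae_scale_factor_spatial = 8

height = None  # caller omitted it
width = None

# `or` falls through to the config-derived default when the caller passes
# None (or 0, a side effect of truthiness-based defaulting).
height = height or config.sample_height * vae_scale_factor_spatial  # 96 * 8 = 768
width = width or config.sample_width * vae_scale_factor_spatial     # 170 * 8 = 1360

print(height, width)  # 768 1360
```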

src/diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py

Lines changed: 12 additions & 2 deletions

```diff
@@ -438,6 +438,7 @@ def check_inputs(
         prompt,
         height,
         width,
+        num_frames,
         strength,
         negative_prompt,
         callback_on_step_end_tensor_inputs,
@@ -448,6 +449,10 @@ def check_inputs(
     ):
         if height % 8 != 0 or width % 8 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+        if self.transformer.config.patch_size_t is not None and latent_frames % self.transformer.config.patch_size_t != 0:
+            raise ValueError(f"Number of latent frames must be divisible by `{self.transformer.config.patch_size_t}` but got {latent_frames=}.")

         if strength < 0 or strength > 1:
             raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
@@ -563,8 +568,8 @@ def __call__(
         video: List[Image.Image] = None,
         prompt: Optional[Union[str, List[str]]] = None,
         negative_prompt: Optional[Union[str, List[str]]] = None,
-        height: int = 480,
-        width: int = 720,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
         num_inference_steps: int = 50,
         timesteps: Optional[List[int]] = None,
         strength: float = 0.8,
@@ -667,13 +672,18 @@ def __call__(
         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs

+        height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial
+        width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial
+        num_frames = len(video) if latents is None else latents.size(1)
+
         num_videos_per_prompt = 1

         # 1. Check inputs. Raise error if not correct
         self.check_inputs(
             prompt=prompt,
             height=height,
             width=width,
+            num_frames=num_frames,
             strength=strength,
             negative_prompt=negative_prompt,
             callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
```
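The net effect for callers: `height` and `width` can now be omitted in all three pipelines, and video-to-video infers `num_frames` from the input clip rather than requiring it up front. A usage sketch under those assumptions; the checkpoint id and file names are placeholders, not taken from this commit.

```python
import torch
from diffusers import CogVideoXVideoToVideoPipeline
from diffusers.utils import export_to_video, load_video

pipe = CogVideoXVideoToVideoPipeline.from_pretrained(
    "THUDM/CogVideoX-5b",  # placeholder checkpoint
    torch_dtype=torch.bfloat16,
).to("cuda")

# load_video returns a list of PIL images; len(video) now sets num_frames.
video = load_video("input.mp4")

# height/width are omitted: they default to
# transformer.config.sample_{height,width} * vae_scale_factor_spatial.
output = pipe(video=video, prompt="a panda playing guitar", strength=0.8)
export_to_video(output.frames[0], "output.mp4", fps=8)
```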
