
Commit 048a5f0

update pipeline implementations

1 parent: b94c704

3 files changed (+40, -10 lines)

src/diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py

Lines changed: 15 additions & 6 deletions

```diff
@@ -412,6 +412,7 @@ def check_inputs(
         prompt,
         height,
         width,
+        num_frames,
         negative_prompt,
         callback_on_step_end_tensor_inputs,
         prompt_embeds=None,
@@ -421,6 +422,10 @@ def check_inputs(
     ):
         if height % 8 != 0 or width % 8 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+        if self.transformer.config.patch_size_t is not None and latent_frames % self.transformer.config.patch_size_t != 0:
+            raise ValueError(f"Number of latent frames must be divisible by `{self.transformer.config.patch_size_t}` but got {latent_frames=}.")

         if callback_on_step_end_tensor_inputs is not None and not all(
             k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
@@ -533,8 +538,8 @@ def __call__(
         prompt: Optional[Union[str, List[str]]] = None,
         negative_prompt: Optional[Union[str, List[str]]] = None,
         control_video: Optional[List[Image.Image]] = None,
-        height: int = 480,
-        width: int = 720,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
         num_inference_steps: int = 50,
         timesteps: Optional[List[int]] = None,
         guidance_scale: float = 6,
@@ -638,14 +643,22 @@ def __call__(

         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        if control_video is not None and isinstance(control_video[0], Image.Image):
+            control_video = [control_video]

+        height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial
+        width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial
+        num_frames = len(control_video[0]) if control_video is not None else control_video_latents.size(2)
+
         num_videos_per_prompt = 1

         # 1. Check inputs. Raise error if not correct
         self.check_inputs(
             prompt,
             height,
             width,
+            num_frames,
             negative_prompt,
             callback_on_step_end_tensor_inputs,
             prompt_embeds,
@@ -665,9 +678,6 @@ def __call__(
         else:
             batch_size = prompt_embeds.shape[0]

-        if control_video is not None and isinstance(control_video[0], Image.Image):
-            control_video = [control_video]
-
         device = self._execution_device

         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
@@ -695,7 +705,6 @@ def __call__(

         # 5. Prepare latents.
         latent_channels = self.transformer.config.in_channels // 2
-        num_frames = len(control_video[0]) if control_video is not None else control_video_latents.size(2)
         latents = self.prepare_latents(
             batch_size * num_videos_per_prompt,
             latent_channels,
```
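All three pipelines gain the same validation: `num_frames` is mapped to a latent frame count through the causal VAE's temporal compression, and that count must divide evenly by the transformer's temporal patch size when one is configured. In this file the `num_frames` computation also moves ahead of `check_inputs` so the new check can see it. Below is a minimal standalone sketch of the check; the concrete values (`vae_scale_factor_temporal=4`, `patch_size_t=2`) are illustrative assumptions, not read from this diff.

```python
from typing import Optional

def check_frame_count(
    num_frames: int,
    vae_scale_factor_temporal: int = 4,   # assumed value for illustration
    patch_size_t: Optional[int] = 2,      # assumed; None disables the check
) -> int:
    # The causal VAE keeps the first frame and compresses each following
    # group of `vae_scale_factor_temporal` frames into one latent frame.
    latent_frames = (num_frames - 1) // vae_scale_factor_temporal + 1
    # The transformer patchifies the time axis in chunks of `patch_size_t`,
    # so the latent frame count must divide evenly.
    if patch_size_t is not None and latent_frames % patch_size_t != 0:
        raise ValueError(
            f"Number of latent frames must be divisible by `{patch_size_t}` "
            f"but got {latent_frames=}."
        )
    return latent_frames

print(check_frame_count(53))  # (53 - 1) // 4 + 1 = 14 latent frames -> passes
check_frame_count(49)         # (49 - 1) // 4 + 1 = 13 latent frames -> raises
```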

src/diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py

Lines changed: 13 additions & 2 deletions

```diff
@@ -440,6 +440,7 @@ def check_inputs(
         prompt,
         height,
         width,
+        num_frames,
         negative_prompt,
         callback_on_step_end_tensor_inputs,
         latents=None,
@@ -459,6 +460,10 @@ def check_inputs(
         if height % 8 != 0 or width % 8 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

+        latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+        if self.transformer.config.patch_size_t is not None and latent_frames % self.transformer.config.patch_size_t != 0:
+            raise ValueError(f"Number of latent frames must be divisible by `{self.transformer.config.patch_size_t}` but got {latent_frames=}.")
+
         if callback_on_step_end_tensor_inputs is not None and not all(
             k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
         ):
@@ -567,8 +572,8 @@ def __call__(
         image: PipelineImageInput,
         prompt: Optional[Union[str, List[str]]] = None,
         negative_prompt: Optional[Union[str, List[str]]] = None,
-        height: int = 768,
-        width: int = 1360,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
         num_frames: int = 49,
         num_inference_steps: int = 50,
         timesteps: Optional[List[int]] = None,
@@ -674,12 +679,18 @@ def __call__(
         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs

+        height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial
+        width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial
+        num_frames = num_frames or self.transformer.config.sample_frames
+
         num_videos_per_prompt = 1
+
         # 1. Check inputs. Raise error if not correct
         self.check_inputs(
             image=image,
             prompt=prompt,
             height=height,
+            num_frames=num_frames,
             width=width,
             negative_prompt=negative_prompt,
             callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
```
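Across the three pipelines, the hardcoded resolution defaults (768x1360 here, 480x720 in the control and video-to-video variants) give way to values derived from the transformer config, so each checkpoint supplies its own native resolution. Here is a sketch of how the fallback resolves, using hypothetical config numbers chosen only so the arithmetic reproduces this file's old defaults.

```python
from types import SimpleNamespace

# Hypothetical config: sample_height/sample_width are latent-space sizes,
# and 8 is an assumed spatial VAE scale factor. Picked so the result
# matches the removed 768x1360 defaults of this pipeline.
config = SimpleNamespace(sample_height=96, sample_width=170, sample_frames=49)
vae_scale_factor_spatial = 8

height = None  # caller omitted it
width = None

# `or` falls through to the config-derived default when the caller passes
# None (or 0, a side effect of truthiness-based defaulting).
height = height or config.sample_height * vae_scale_factor_spatial  # 96 * 8 = 768
width = width or config.sample_width * vae_scale_factor_spatial     # 170 * 8 = 1360

print(height, width)  # 768 1360
```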

src/diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py

Lines changed: 12 additions & 2 deletions

```diff
@@ -438,6 +438,7 @@ def check_inputs(
         prompt,
         height,
         width,
+        num_frames,
         strength,
         negative_prompt,
         callback_on_step_end_tensor_inputs,
@@ -448,6 +449,10 @@ def check_inputs(
     ):
         if height % 8 != 0 or width % 8 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+        if self.transformer.config.patch_size_t is not None and latent_frames % self.transformer.config.patch_size_t != 0:
+            raise ValueError(f"Number of latent frames must be divisible by `{self.transformer.config.patch_size_t}` but got {latent_frames=}.")

         if strength < 0 or strength > 1:
             raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
@@ -563,8 +568,8 @@ def __call__(
         video: List[Image.Image] = None,
         prompt: Optional[Union[str, List[str]]] = None,
         negative_prompt: Optional[Union[str, List[str]]] = None,
-        height: int = 480,
-        width: int = 720,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
         num_inference_steps: int = 50,
         timesteps: Optional[List[int]] = None,
         strength: float = 0.8,
@@ -667,13 +672,18 @@ def __call__(
         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs

+        height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial
+        width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial
+        num_frames = len(video) if latents is None else latents.size(1)
+
         num_videos_per_prompt = 1

         # 1. Check inputs. Raise error if not correct
         self.check_inputs(
             prompt=prompt,
             height=height,
             width=width,
+            num_frames=num_frames,
             strength=strength,
             negative_prompt=negative_prompt,
             callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
```
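The net effect for callers: `height` and `width` can now be omitted in all three pipelines, and video-to-video infers `num_frames` from the input clip rather than requiring it up front. A usage sketch under those assumptions; the checkpoint id and file names are placeholders, not taken from this commit.

```python
import torch
from diffusers import CogVideoXVideoToVideoPipeline
from diffusers.utils import export_to_video, load_video

pipe = CogVideoXVideoToVideoPipeline.from_pretrained(
    "THUDM/CogVideoX-5b",  # placeholder checkpoint
    torch_dtype=torch.bfloat16,
).to("cuda")

# load_video returns a list of PIL images; len(video) now sets num_frames.
video = load_video("input.mp4")

# height/width are omitted: they default to
# transformer.config.sample_{height,width} * vae_scale_factor_spatial.
output = pipe(video=video, prompt="a panda playing guitar", strength=0.8)
export_to_video(output.frames[0], "output.mp4", fps=8)
```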
