refactor

a-r-r-o-w · a-r-r-o-w · commit 7a15767657f0 · 2024-11-11T15:38:51.000+01:00
diff --git a/scripts/convert_cogvideox_to_diffusers.py b/scripts/convert_cogvideox_to_diffusers.py
@@ -213,7 +213,7 @@ def get_init_kwargs(version: str):
             "patch_bias": False,
             "sample_height": 768 // vae_scale_factor_spatial,
             "sample_width": 1360 // vae_scale_factor_spatial,
-            "sample_frames": 85,
+            "sample_frames": 81,
         }
     else:
         raise ValueError("Unsupported version of CogVideoX.")
diff --git a/src/diffusers/pipelines/cogvideo/pipeline_cogvideox.py b/src/diffusers/pipelines/cogvideo/pipeline_cogvideox.py
@@ -334,10 +334,6 @@ def prepare_latents(
             width // self.vae_scale_factor_spatial,
         )
 
-        # For CogVideoX1.5, the latent should add 1 for padding (Not use)
-        if self.transformer.config.patch_size_t is not None:
-            shape = shape[:1] + (shape[1] + shape[1] % self.transformer.config.patch_size_t,) + shape[2:]
-
         if latents is None:
             latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
         else:
@@ -648,7 +644,16 @@ def __call__(
         timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
         self._num_timesteps = len(timesteps)
 
-        # 5. Prepare latents.
+        # 5. Prepare latents
+        latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+
+        # For CogVideoX 1.5, the latent frames are padded so that their count is divisible by patch_size_t
+        patch_size_t = self.transformer.config.patch_size_t
+        additional_frames = 0
+        if patch_size_t is not None and latent_frames % patch_size_t != 0:
+            additional_frames = patch_size_t - latent_frames % patch_size_t
+            num_frames += additional_frames * self.vae_scale_factor_temporal
+
         latent_channels = self.transformer.config.in_channels
         latents = self.prepare_latents(
             batch_size * num_videos_per_prompt,
@@ -738,13 +743,9 @@ def __call__(
                     progress_bar.update()
 
         if not output_type == "latent":
-            # Calculate the number of start frames based on the size of the second dimension of latents
-            num_latent_frames = latents.size(1)  # Get the size of the second dimension
-            # (81 - 1) / 4 + 1 = 21 and latents is 22, so the first frames will be 22 - 1 = 1, and we will skip frames 0
-            start_frames = num_latent_frames - ((num_frames - 1) // self.vae_scale_factor_temporal + 1)
-
-            # Slice latents starting from start_frames
-            video = self.decode_latents(latents[:, start_frames:])
+            # Discard any padding frames that were added for CogVideoX 1.5
+            latents = latents[:, additional_frames:]
+            video = self.decode_latents(latents)
             video = self.video_processor.postprocess_video(video=video, output_type=output_type)
         else:
             video = latents
diff --git a/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py b/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py
@@ -412,7 +412,6 @@ def check_inputs(
         prompt,
         height,
         width,
-        num_frames,
         negative_prompt,
         callback_on_step_end_tensor_inputs,
         prompt_embeds=None,
@@ -423,15 +422,6 @@ def check_inputs(
         if height % 8 != 0 or width % 8 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
 
-        latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
-        if (
-            self.transformer.config.patch_size_t is not None
-            and latent_frames % self.transformer.config.patch_size_t != 0
-        ):
-            raise ValueError(
-                f"Number of latent frames must be divisible by `{self.transformer.config.patch_size_t}` but got {latent_frames=}."
-            )
-
         if callback_on_step_end_tensor_inputs is not None and not all(
             k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
         ):
@@ -663,7 +653,6 @@ def __call__(
             prompt,
             height,
             width,
-            num_frames,
             negative_prompt,
             callback_on_step_end_tensor_inputs,
             prompt_embeds,
@@ -708,7 +697,17 @@ def __call__(
         timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
         self._num_timesteps = len(timesteps)
 
-        # 5. Prepare latents.
+        # 5. Prepare latents
+        latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+
+        # For CogVideoX 1.5, the number of latent frames must be divisible by patch_size_t (no padding is done here)
+        patch_size_t = self.transformer.config.patch_size_t
+        if patch_size_t is not None and latent_frames % patch_size_t != 0:
+            raise ValueError(
+                f"The number of latent frames must be divisible by `{patch_size_t=}` but the given video "
+                f"contains {latent_frames=}, which is not divisible."
+            )
+        
         latent_channels = self.transformer.config.in_channels // 2
         latents = self.prepare_latents(
             batch_size * num_videos_per_prompt,
diff --git a/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py b/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py
@@ -450,7 +450,6 @@ def check_inputs(
         prompt,
         height,
         width,
-        num_frames,
         negative_prompt,
         callback_on_step_end_tensor_inputs,
         latents=None,
@@ -470,15 +469,6 @@ def check_inputs(
         if height % 8 != 0 or width % 8 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
 
-        # latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
-        # if (
-        #     self.transformer.config.patch_size_t is not None
-        #     and latent_frames % self.transformer.config.patch_size_t != 0
-        # ):
-        #     raise ValueError(
-        #         f"Number of latent frames must be divisible by `{self.transformer.config.patch_size_t}` but got {latent_frames=}."
-        #     )
-
         if callback_on_step_end_tensor_inputs is not None and not all(
             k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
         ):
@@ -705,7 +695,6 @@ def __call__(
             image=image,
             prompt=prompt,
             height=height,
-            num_frames=num_frames,
             width=width,
             negative_prompt=negative_prompt,
             callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
@@ -751,22 +740,15 @@ def __call__(
         self._num_timesteps = len(timesteps)
 
         # 5. Prepare latents
-        # TODO: Only CogVideoX1.5-5B-I2V can use this method. Need to Change
-        def adjust_resolution_to_divisible(image_height, image_width, tgt_height, tgt_width, divisor=16):
-            # Step 1: Compare image dimensions with target dimensions
-            if image_height > tgt_height:
-                image_height = tgt_height
-            if image_width > tgt_width:
-                image_width = tgt_width
-
-            # Step 2: Ensure height and width are divisible by the divisor
-            image_height = (image_height // divisor) * divisor
-            image_width = (image_width // divisor) * divisor
-            return image_height, image_width
-
-        image_width, image_height = image.size[-2:]
-
-        height, width = adjust_resolution_to_divisible(image_height, image_width, height, width)
+        latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+
+        # For CogVideoX 1.5, the latent frames are padded so that their count is divisible by patch_size_t
+        patch_size_t = self.transformer.config.patch_size_t
+        additional_frames = 0
+        if patch_size_t is not None and latent_frames % patch_size_t != 0:
+            additional_frames = patch_size_t - latent_frames % patch_size_t
+            num_frames += additional_frames * self.vae_scale_factor_temporal
+        
         image = self.video_processor.preprocess(image, height=height, width=width).to(
             device, dtype=prompt_embeds.dtype
         )
@@ -863,13 +845,9 @@ def adjust_resolution_to_divisible(image_height, image_width, tgt_height, tgt_wi
                     progress_bar.update()
 
         if not output_type == "latent":
-            # Calculate the number of start frames based on the size of the second dimension of latents
-            num_latent_frames = latents.size(1)  # Get the size of the second dimension
-            # (81 - 1) / 4 + 1 = 21 and latents is 22, so the first frames will be 22 - 1 = 1, and we will skip frames 0
-            start_frames = num_latent_frames - ((num_frames - 1) // self.vae_scale_factor_temporal + 1)
-
-            # Slice latents starting from start_frames
-            video = self.decode_latents(latents[:, start_frames:])
+            # Discard any padding frames that were added for CogVideoX 1.5
+            latents = latents[:, additional_frames:]
+            video = self.decode_latents(latents)
             video = self.video_processor.postprocess_video(video=video, output_type=output_type)
         else:
             video = latents
diff --git a/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py b/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py
@@ -438,7 +438,6 @@ def check_inputs(
         prompt,
         height,
         width,
-        num_frames,
         strength,
         negative_prompt,
         callback_on_step_end_tensor_inputs,
@@ -450,15 +449,6 @@ def check_inputs(
         if height % 8 != 0 or width % 8 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
 
-        latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
-        if (
-            self.transformer.config.patch_size_t is not None
-            and latent_frames % self.transformer.config.patch_size_t != 0
-        ):
-            raise ValueError(
-                f"Number of latent frames must be divisible by `{self.transformer.config.patch_size_t}` but got {latent_frames=}."
-            )
-
         if strength < 0 or strength > 1:
             raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
 
@@ -688,7 +678,6 @@ def __call__(
             prompt=prompt,
             height=height,
             width=width,
-            num_frames=num_frames,
             strength=strength,
             negative_prompt=negative_prompt,
             callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
@@ -737,6 +726,16 @@ def __call__(
         self._num_timesteps = len(timesteps)
 
         # 5. Prepare latents
+        latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+
+        # For CogVideoX 1.5, the number of latent frames must be divisible by patch_size_t (no padding is done here)
+        patch_size_t = self.transformer.config.patch_size_t
+        if patch_size_t is not None and latent_frames % patch_size_t != 0:
+            raise ValueError(
+                f"The number of latent frames must be divisible by `{patch_size_t=}` but the given video "
+                f"contains {latent_frames=}, which is not divisible."
+            )
+        
         if latents is None:
             video = self.video_processor.preprocess_video(video, height=height, width=width)
             video = video.to(device=device, dtype=prompt_embeds.dtype)

Original file line number	Diff line number	Diff line change
`@@ -213,7 +213,7 @@ def get_init_kwargs(version: str):`
`213`	`213`	`"patch_bias": False,`
`214`	`214`	`"sample_height": 768 // vae_scale_factor_spatial,`
`215`	`215`	`"sample_width": 1360 // vae_scale_factor_spatial,`
`216`		`- "sample_frames": 85,`
	`216`	`+ "sample_frames": 81,`
`217`	`217`	`}`
`218`	`218`	`else:`
`219`	`219`	`raise ValueError("Unsupported version of CogVideoX.")`