
Commit b94c704

use even number of latent frames only
1 parent be8aff7 commit b94c704

3 files changed: +10 −13 lines changed

scripts/convert_cogvideox_to_diffusers.py

Lines changed: 1 addition & 1 deletion
@@ -213,7 +213,7 @@ def get_init_kwargs(version: str):
             "patch_bias": False,
             "sample_height": 768 // vae_scale_factor_spatial,
             "sample_width": 1360 // vae_scale_factor_spatial,
-            "sample_frames": 81,  # TODO: Need Test with 161 for 10 seconds
+            "sample_frames": 85,
         }
     else:
         raise ValueError("Unsupported version of CogVideoX.")
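
Why 85 frames: with a temporal VAE compression of 4, pixel frames map to latent frames as (num_frames - 1) // 4 + 1, so the old default of 81 gave 21 latent frames (odd) while 85 gives 22, which divides evenly by the temporal patch size. A quick sanity check in Python, where the compression factor of 4 and a temporal patch size of 2 are assumed CogVideoX 1.5 values rather than read from a config:

vae_scale_factor_temporal = 4  # assumed temporal VAE compression
patch_size_t = 2               # assumed temporal patch size of the 1.5 transformer

for sample_frames in (81, 85):
    latent_frames = (sample_frames - 1) // vae_scale_factor_temporal + 1
    print(sample_frames, latent_frames, latent_frames % patch_size_t == 0)
# 81 -> 21 latent frames, not divisible by 2
# 85 -> 22 latent frames, divisible by 2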

src/diffusers/models/transformers/cogvideox_transformer_3d.py

Lines changed: 3 additions & 11 deletions
@@ -466,16 +466,6 @@ def forward(
             emb = emb + emb_ofs
 
         # 2. Patch embedding
-        p = self.config.patch_size
-        p_t = self.config.patch_size_t
-
-        # We know that the hidden states height and width will always be divisible by patch_size.
-        # But, the number of frames may not be divisible by patch_size_t. So, we pad with the beginning frames.
-        if p_t is not None:
-            remaining_frames = p_t - num_frames % p_t
-            first_frame = hidden_states[:, :1].repeat(1, 1 + remaining_frames, 1, 1, 1)
-            hidden_states = torch.cat([first_frame, hidden_states[:, 1:]], dim=1)
-
         hidden_states = self.patch_embed(encoder_hidden_states, hidden_states)
         hidden_states = self.embedding_dropout(hidden_states)

@@ -524,6 +514,9 @@ def custom_forward(*inputs):
         hidden_states = self.proj_out(hidden_states)
 
         # 5. Unpatchify
+        p = self.config.patch_size
+        p_t = self.config.patch_size_t
+
         if p_t is None:
             output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, -1, p, p)
             output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)

@@ -532,7 +525,6 @@ def custom_forward(*inputs):
                 batch_size, (num_frames + p_t - 1) // p_t, height // p, width // p, -1, p_t, p, p
             )
             output = output.permute(0, 1, 5, 4, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(1, 2)
-            output = output[:, remaining_frames:]
 
         if USE_PEFT_BACKEND:
             # remove `lora_scale` from each PEFT layer
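
The forward pass no longer pads the latent video with repeats of the first frame to make the frame count divisible by patch_size_t, and no longer crops those padded frames back out after unpatchify; p and p_t are now read only where the unpatchify reshape needs them. A minimal standalone sketch of that reshape with toy shapes (the batch size, channel count, and spatial sizes below are illustrative assumptions, not the model's real dimensions):

import torch

batch_size, num_frames, height, width = 1, 22, 8, 8  # 22 latent frames, divisible by p_t
channels, p, p_t = 16, 2, 2

# One token per (p_t, p, p) patch, as the sequence leaves proj_out.
seq_len = (num_frames // p_t) * (height // p) * (width // p)
hidden_states = torch.randn(batch_size, seq_len, channels * p_t * p * p)

# Unpatchify as in the diff: because num_frames % p_t == 0,
# (num_frames + p_t - 1) // p_t == num_frames // p_t and nothing needs cropping.
output = hidden_states.reshape(
    batch_size, (num_frames + p_t - 1) // p_t, height // p, width // p, -1, p_t, p, p
)
output = output.permute(0, 1, 5, 4, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(1, 2)
print(output.shape)  # torch.Size([1, 22, 16, 8, 8])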

src/diffusers/pipelines/cogvideo/pipeline_cogvideox.py

Lines changed: 6 additions & 1 deletion
@@ -368,12 +368,12 @@ def prepare_extra_step_kwargs(self, generator, eta):
             extra_step_kwargs["generator"] = generator
         return extra_step_kwargs
 
-    # Copied from diffusers.pipelines.latte.pipeline_latte.LattePipeline.check_inputs
     def check_inputs(
         self,
         prompt,
         height,
         width,
+        num_frames,
         negative_prompt,
         callback_on_step_end_tensor_inputs,
         prompt_embeds=None,

@@ -382,6 +382,10 @@ def check_inputs(
         if height % 8 != 0 or width % 8 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
 
+        latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+        if self.transformer.config.patch_size_t is not None and latent_frames % self.transformer.config.patch_size_t != 0:
+            raise ValueError(f"Number of latent frames must be divisible by `{self.transformer.config.patch_size_t}` but got {latent_frames=}.")
+
         if callback_on_step_end_tensor_inputs is not None and not all(
             k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
         ):

@@ -602,6 +606,7 @@ def __call__(
             prompt,
             height,
             width,
+            num_frames,
             negative_prompt,
             callback_on_step_end_tensor_inputs,
             prompt_embeds,
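
check_inputs now receives num_frames and fails fast when the derived latent frame count is not divisible by the transformer's temporal patch size, instead of the transformer silently padding and cropping frames. A rough sketch of which pixel-space frame counts pass the new check, assuming the CogVideoX values of 4 for vae_scale_factor_temporal and 2 for patch_size_t (in the real pipeline both come from the loaded components):

def valid_num_frames(max_frames=100, vae_scale_factor_temporal=4, patch_size_t=2):
    # Collect frame counts whose latent length (num_frames - 1) // scale + 1
    # is a multiple of the temporal patch size.
    valid = []
    for num_frames in range(1, max_frames + 1):
        latent_frames = (num_frames - 1) // vae_scale_factor_temporal + 1
        if latent_frames % patch_size_t == 0:
            valid.append(num_frames)
    return valid

print(valid_num_frames())
# [5, 6, 7, 8, 13, 14, 15, 16, 21, ..., 85, ..., 96]; 85 is the new sample_frames default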
