From 42c27dbfc0b5d44c4aafba118afb8fba1162c8dc Mon Sep 17 00:00:00 2001
From: Dhruv Nair
Date: Wed, 16 Apr 2025 10:26:26 +0200
Subject: [PATCH 1/2] update

---
 .../transformers/transformer_hunyuan_video.py |  1 +
 .../pipeline_hunyuan_video_image2video.py     |  2 +-
 .../hunyuan_video/test_hunyuan_image2video.py | 42 +++++++++++++++----
 3 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video.py b/src/diffusers/models/transformers/transformer_hunyuan_video.py
index 36f914f0b5c1..d0c991ba3a40 100644
--- a/src/diffusers/models/transformers/transformer_hunyuan_video.py
+++ b/src/diffusers/models/transformers/transformer_hunyuan_video.py
@@ -446,6 +446,7 @@ def forward(
         else:
             original_dtype = hidden_states.dtype
             mask_float = attention_mask.float().unsqueeze(-1)
+            __import__("ipdb").set_trace()
             pooled_projections = (hidden_states * mask_float).sum(dim=1) / mask_float.sum(dim=1)
             pooled_projections = pooled_projections.to(original_dtype)
 
diff --git a/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py b/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py
index d3c8a3539b98..18a0e970c610 100644
--- a/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py
+++ b/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py
@@ -344,7 +344,7 @@ def _get_llama_prompt_embeds(
         )
         prompt_embeds = self.text_encoder(
             **expanded_inputs,
-            pixel_value=image_embeds,
+            pixel_values=image_embeds,
             output_hidden_states=True,
         ).hidden_states[-(num_hidden_layers_to_skip + 1)]
         prompt_embeds = prompt_embeds.to(dtype=dtype)
diff --git a/tests/pipelines/hunyuan_video/test_hunyuan_image2video.py b/tests/pipelines/hunyuan_video/test_hunyuan_image2video.py
index 5802bde87a61..16b196929b70 100644
--- a/tests/pipelines/hunyuan_video/test_hunyuan_image2video.py
+++ b/tests/pipelines/hunyuan_video/test_hunyuan_image2video.py
@@ -23,10 +23,13 @@
     CLIPTextConfig,
     CLIPTextModel,
     CLIPTokenizer,
+    LlavaForConditionalGeneration,
+    LlavaConfig,
     LlamaConfig,
     LlamaModel,
     LlamaTokenizer,
 )
+from transformers.models.clip import CLIPVisionConfig
 
 from diffusers import (
     AutoencoderKLHunyuanVideo,
@@ -116,7 +119,7 @@ def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
         torch.manual_seed(0)
         scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)
 
-        llama_text_encoder_config = LlamaConfig(
+        text_config = LlamaConfig(
             bos_token_id=0,
             eos_token_id=2,
             hidden_size=16,
@@ -129,6 +132,18 @@ def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
             hidden_act="gelu",
             projection_dim=32,
         )
+        vision_config = CLIPVisionConfig(
+            hidden_size=8,
+            intermediate_size=37,
+            projection_dim=32,
+            num_attention_heads=4,
+            num_hidden_layers=2,
+            image_size=224,
+        )
+        llava_text_encoder_config = LlavaConfig(
+            vision_config, text_config, image_seq_length=7, pad_token_id=1, image_token_index=8
+        )
+
         clip_text_encoder_config = CLIPTextConfig(
             bos_token_id=0,
             eos_token_id=2,
@@ -144,7 +159,7 @@ def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
         )
 
         torch.manual_seed(0)
-        text_encoder = LlamaModel(llama_text_encoder_config)
+        text_encoder = LlavaForConditionalGeneration(llava_text_encoder_config)
         tokenizer = LlamaTokenizer.from_pretrained("finetrainers/dummy-hunyaunvideo", subfolder="tokenizer")
 
         torch.manual_seed(0)
@@ -153,14 +168,14 @@ def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
 
         torch.manual_seed(0)
         image_processor = CLIPImageProcessor(
-            crop_size=336,
+            crop_size=224,
             do_center_crop=True,
             do_normalize=True,
             do_resize=True,
             image_mean=[0.48145466, 0.4578275, 0.40821073],
             image_std=[0.26862954, 0.26130258, 0.27577711],
             resample=3,
-            size=336,
+            size=224,
         )
 
         components = {
@@ -188,8 +203,21 @@ def get_dummy_inputs(self, device, seed=0):
             "image": image,
             "prompt": "dance monkey",
             "prompt_template": {
-                "template": "{}",
-                "crop_start": 0,
+                "template": (
+                    "<|start_header_id|>system<|end_header_id|>\n\n\nDescribe the video by detailing the following aspects according to the reference image: "
+                    "1. The main content and theme of the video."
+                    "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
+                    "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
+                    "4. background environment, light, style and atmosphere."
+                    "5. camera angles, movements, and transitions used in the video:<|eot_id|>\n\n"
+                    "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
+                    "<|start_header_id|>assistant<|end_header_id|>\n\n"
+                ),
+                "crop_start": 5,
+                "image_emb_len": 49,
+                "image_emb_start": 5,
+                "image_emb_end": 54,
+                "double_return_token_id": 10,
             },
             "generator": generator,
             "num_inference_steps": 2,
@@ -197,7 +225,7 @@ def get_dummy_inputs(self, device, seed=0):
             "height": image_height,
             "width": image_width,
             "num_frames": 9,
-            "max_sequence_length": 16,
+            "max_sequence_length": 64,
             "output_type": "pt",
         }
         return inputs

From cb11196de5abd50234b8fb1e78b0a5958080601a Mon Sep 17 00:00:00 2001
From: Dhruv Nair
Date: Wed, 16 Apr 2025 13:36:10 +0200
Subject: [PATCH 2/2] update

---
 .../transformers/transformer_hunyuan_video.py |  1 -
 .../hunyuan_video/test_hunyuan_image2video.py | 30 ++++++-------------
 2 files changed, 9 insertions(+), 22 deletions(-)

diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video.py b/src/diffusers/models/transformers/transformer_hunyuan_video.py
index d0c991ba3a40..36f914f0b5c1 100644
--- a/src/diffusers/models/transformers/transformer_hunyuan_video.py
+++ b/src/diffusers/models/transformers/transformer_hunyuan_video.py
@@ -446,7 +446,6 @@ def forward(
         else:
             original_dtype = hidden_states.dtype
             mask_float = attention_mask.float().unsqueeze(-1)
-            __import__("ipdb").set_trace()
             pooled_projections = (hidden_states * mask_float).sum(dim=1) / mask_float.sum(dim=1)
             pooled_projections = pooled_projections.to(original_dtype)
 
diff --git a/tests/pipelines/hunyuan_video/test_hunyuan_image2video.py b/tests/pipelines/hunyuan_video/test_hunyuan_image2video.py
index 16b196929b70..37a4f418cc6d 100644
--- a/tests/pipelines/hunyuan_video/test_hunyuan_image2video.py
+++ b/tests/pipelines/hunyuan_video/test_hunyuan_image2video.py
@@ -23,11 +23,10 @@
     CLIPTextConfig,
     CLIPTextModel,
     CLIPTokenizer,
-    LlavaForConditionalGeneration,
-    LlavaConfig,
     LlamaConfig,
-    LlamaModel,
-    LlamaTokenizer,
+    LlamaTokenizerFast,
+    LlavaConfig,
+    LlavaForConditionalGeneration,
 )
 from transformers.models.clip import CLIPVisionConfig
 
@@ -127,7 +126,7 @@ def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
             layer_norm_eps=1e-05,
             num_attention_heads=4,
             num_hidden_layers=2,
-            pad_token_id=1,
+            pad_token_id=100,
             vocab_size=1000,
             hidden_act="gelu",
             projection_dim=32,
@@ -140,9 +139,7 @@ def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
             num_hidden_layers=2,
             image_size=224,
         )
-        llava_text_encoder_config = LlavaConfig(
-            vision_config, text_config, image_seq_length=7, pad_token_id=1, image_token_index=8
-        )
+        llava_text_encoder_config = LlavaConfig(vision_config, text_config, pad_token_id=100, image_token_index=101)
 
         clip_text_encoder_config = CLIPTextConfig(
             bos_token_id=0,
@@ -160,7 +157,7 @@ def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
 
         torch.manual_seed(0)
         text_encoder = LlavaForConditionalGeneration(llava_text_encoder_config)
-        tokenizer = LlamaTokenizer.from_pretrained("finetrainers/dummy-hunyaunvideo", subfolder="tokenizer")
+        tokenizer = LlamaTokenizerFast.from_pretrained("finetrainers/dummy-hunyaunvideo", subfolder="tokenizer")
 
         torch.manual_seed(0)
         text_encoder_2 = CLIPTextModel(clip_text_encoder_config)
@@ -203,21 +200,12 @@ def get_dummy_inputs(self, device, seed=0):
             "image": image,
             "prompt": "dance monkey",
             "prompt_template": {
-                "template": (
-                    "<|start_header_id|>system<|end_header_id|>\n\n\nDescribe the video by detailing the following aspects according to the reference image: "
-                    "1. The main content and theme of the video."
-                    "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
-                    "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
-                    "4. background environment, light, style and atmosphere."
-                    "5. camera angles, movements, and transitions used in the video:<|eot_id|>\n\n"
-                    "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
-                    "<|start_header_id|>assistant<|end_header_id|>\n\n"
-                ),
-                "crop_start": 5,
+                "template": "{}",
+                "crop_start": 0,
                 "image_emb_len": 49,
                 "image_emb_start": 5,
                 "image_emb_end": 54,
-                "double_return_token_id": 10,
+                "double_return_token_id": 0,
             },
             "generator": generator,
             "num_inference_steps": 2,