From 42c27dbfc0b5d44c4aafba118afb8fba1162c8dc Mon Sep 17 00:00:00 2001
From: Dhruv Nair
Date: Wed, 16 Apr 2025 10:26:26 +0200
Subject: [PATCH 1/2] update

---
 .../transformers/transformer_hunyuan_video.py |  1 +
 .../pipeline_hunyuan_video_image2video.py     |  2 +-
 .../hunyuan_video/test_hunyuan_image2video.py | 42 +++++++++++++++----
 3 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video.py b/src/diffusers/models/transformers/transformer_hunyuan_video.py
index 36f914f0b5c1..d0c991ba3a40 100644
--- a/src/diffusers/models/transformers/transformer_hunyuan_video.py
+++ b/src/diffusers/models/transformers/transformer_hunyuan_video.py
@@ -446,6 +446,7 @@ def forward(
         else:
             original_dtype = hidden_states.dtype
             mask_float = attention_mask.float().unsqueeze(-1)
+            __import__("ipdb").set_trace()
             pooled_projections = (hidden_states * mask_float).sum(dim=1) / mask_float.sum(dim=1)
             pooled_projections = pooled_projections.to(original_dtype)
 
diff --git a/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py b/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py
index d3c8a3539b98..18a0e970c610 100644
--- a/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py
+++ b/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py
@@ -344,7 +344,7 @@ def _get_llama_prompt_embeds(
         )
         prompt_embeds = self.text_encoder(
             **expanded_inputs,
-            pixel_value=image_embeds,
+            pixel_values=image_embeds,
             output_hidden_states=True,
         ).hidden_states[-(num_hidden_layers_to_skip + 1)]
         prompt_embeds = prompt_embeds.to(dtype=dtype)
diff --git a/tests/pipelines/hunyuan_video/test_hunyuan_image2video.py b/tests/pipelines/hunyuan_video/test_hunyuan_image2video.py
index 5802bde87a61..16b196929b70 100644
--- a/tests/pipelines/hunyuan_video/test_hunyuan_image2video.py
+++ b/tests/pipelines/hunyuan_video/test_hunyuan_image2video.py
@@ -23,10 +23,13 @@
     CLIPTextConfig,
     CLIPTextModel,
     CLIPTokenizer,
+    LlavaForConditionalGeneration,
+    LlavaConfig,
     LlamaConfig,
     LlamaModel,
     LlamaTokenizer,
 )
+from transformers.models.clip import CLIPVisionConfig
 
 from diffusers import (
     AutoencoderKLHunyuanVideo,
@@ -116,7 +119,7 @@ def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
         torch.manual_seed(0)
         scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)
 
-        llama_text_encoder_config = LlamaConfig(
+        text_config = LlamaConfig(
             bos_token_id=0,
             eos_token_id=2,
             hidden_size=16,
@@ -129,6 +132,18 @@ def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
             hidden_act="gelu",
             projection_dim=32,
         )
+        vision_config = CLIPVisionConfig(
+            hidden_size=8,
+            intermediate_size=37,
+            projection_dim=32,
+            num_attention_heads=4,
+            num_hidden_layers=2,
+            image_size=224,
+        )
+        llava_text_encoder_config = LlavaConfig(
+            vision_config, text_config, image_seq_length=7, pad_token_id=1, image_token_index=8
+        )
+
         clip_text_encoder_config = CLIPTextConfig(
             bos_token_id=0,
             eos_token_id=2,
@@ -144,7 +159,7 @@ def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
         )
 
         torch.manual_seed(0)
-        text_encoder = LlamaModel(llama_text_encoder_config)
+        text_encoder = LlavaForConditionalGeneration(llava_text_encoder_config)
         tokenizer = LlamaTokenizer.from_pretrained("finetrainers/dummy-hunyaunvideo", subfolder="tokenizer")
 
         torch.manual_seed(0)
@@ -153,14 +168,14 @@ def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
 
         torch.manual_seed(0)
         image_processor = CLIPImageProcessor(
-            crop_size=336,
+            crop_size=224,
             do_center_crop=True,
             do_normalize=True,
             do_resize=True,
             image_mean=[0.48145466, 0.4578275, 0.40821073],
             image_std=[0.26862954, 0.26130258, 0.27577711],
             resample=3,
-            size=336,
+            size=224,
         )
 
         components = {
@@ -188,8 +203,21 @@ def get_dummy_inputs(self, device, seed=0):
             "image": image,
             "prompt": "dance monkey",
             "prompt_template": {
-                "template": "{}",
-                "crop_start": 0,
+                "template": (
+                    "<|start_header_id|>system<|end_header_id|>\n\n\nDescribe the video by detailing the following aspects according to the reference image: "
+                    "1. The main content and theme of the video."
+                    "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
+                    "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
+                    "4. background environment, light, style and atmosphere."
+                    "5. camera angles, movements, and transitions used in the video:<|eot_id|>\n\n"
+                    "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
+                    "<|start_header_id|>assistant<|end_header_id|>\n\n"
+                ),
+                "crop_start": 5,
+                "image_emb_len": 49,
+                "image_emb_start": 5,
+                "image_emb_end": 54,
+                "double_return_token_id": 10,
             },
             "generator": generator,
             "num_inference_steps": 2,
@@ -197,7 +225,7 @@ def get_dummy_inputs(self, device, seed=0):
             "height": image_height,
             "width": image_width,
             "num_frames": 9,
-            "max_sequence_length": 16,
+            "max_sequence_length": 64,
             "output_type": "pt",
         }
         return inputs

From cb11196de5abd50234b8fb1e78b0a5958080601a Mon Sep 17 00:00:00 2001
From: Dhruv Nair
Date: Wed, 16 Apr 2025 13:36:10 +0200
Subject: [PATCH 2/2] update

---
 .../transformers/transformer_hunyuan_video.py |  1 -
 .../hunyuan_video/test_hunyuan_image2video.py | 30 ++++++-------------
 2 files changed, 9 insertions(+), 22 deletions(-)

diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video.py b/src/diffusers/models/transformers/transformer_hunyuan_video.py
index d0c991ba3a40..36f914f0b5c1 100644
--- a/src/diffusers/models/transformers/transformer_hunyuan_video.py
+++ b/src/diffusers/models/transformers/transformer_hunyuan_video.py
@@ -446,7 +446,6 @@ def forward(
         else:
             original_dtype = hidden_states.dtype
             mask_float = attention_mask.float().unsqueeze(-1)
-            __import__("ipdb").set_trace()
             pooled_projections = (hidden_states * mask_float).sum(dim=1) / mask_float.sum(dim=1)
             pooled_projections = pooled_projections.to(original_dtype)
 
diff --git a/tests/pipelines/hunyuan_video/test_hunyuan_image2video.py b/tests/pipelines/hunyuan_video/test_hunyuan_image2video.py
index 16b196929b70..37a4f418cc6d 100644
--- a/tests/pipelines/hunyuan_video/test_hunyuan_image2video.py
+++ b/tests/pipelines/hunyuan_video/test_hunyuan_image2video.py
@@ -23,11 +23,10 @@
     CLIPTextConfig,
     CLIPTextModel,
     CLIPTokenizer,
-    LlavaForConditionalGeneration,
-    LlavaConfig,
     LlamaConfig,
-    LlamaModel,
-    LlamaTokenizer,
+    LlamaTokenizerFast,
+    LlavaConfig,
+    LlavaForConditionalGeneration,
 )
 from transformers.models.clip import CLIPVisionConfig
 
@@ -127,7 +126,7 @@ def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
             layer_norm_eps=1e-05,
             num_attention_heads=4,
             num_hidden_layers=2,
-            pad_token_id=1,
+            pad_token_id=100,
             vocab_size=1000,
             hidden_act="gelu",
             projection_dim=32,
@@ -140,9 +139,7 @@ def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
             num_hidden_layers=2,
             image_size=224,
         )
-        llava_text_encoder_config = LlavaConfig(
-            vision_config, text_config, image_seq_length=7, pad_token_id=1, image_token_index=8
-        )
+        llava_text_encoder_config = LlavaConfig(vision_config, text_config, pad_token_id=100, image_token_index=101)
 
         clip_text_encoder_config = CLIPTextConfig(
             bos_token_id=0,
@@ -160,7 +157,7 @@ def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
 
         torch.manual_seed(0)
         text_encoder = LlavaForConditionalGeneration(llava_text_encoder_config)
-        tokenizer = LlamaTokenizer.from_pretrained("finetrainers/dummy-hunyaunvideo", subfolder="tokenizer")
+        tokenizer = LlamaTokenizerFast.from_pretrained("finetrainers/dummy-hunyaunvideo", subfolder="tokenizer")
 
         torch.manual_seed(0)
         text_encoder_2 = CLIPTextModel(clip_text_encoder_config)
@@ -203,21 +200,12 @@ def get_dummy_inputs(self, device, seed=0):
             "image": image,
             "prompt": "dance monkey",
             "prompt_template": {
-                "template": (
-                    "<|start_header_id|>system<|end_header_id|>\n\n\nDescribe the video by detailing the following aspects according to the reference image: "
-                    "1. The main content and theme of the video."
-                    "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
-                    "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
-                    "4. background environment, light, style and atmosphere."
-                    "5. camera angles, movements, and transitions used in the video:<|eot_id|>\n\n"
-                    "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
-                    "<|start_header_id|>assistant<|end_header_id|>\n\n"
-                ),
-                "crop_start": 5,
+                "template": "{}",
+                "crop_start": 0,
                 "image_emb_len": 49,
                 "image_emb_start": 5,
                 "image_emb_end": 54,
-                "double_return_token_id": 10,
+                "double_return_token_id": 0,
             },
             "generator": generator,
             "num_inference_steps": 2,