     CLIPTextModel,
     CLIPTokenizer,
     LlamaConfig,
-    LlamaModel,
-    LlamaTokenizer,
+    LlamaTokenizerFast,
+    LlavaConfig,
+    LlavaForConditionalGeneration,
 )
+from transformers.models.clip import CLIPVisionConfig
 
 from diffusers import (
     AutoencoderKLHunyuanVideo,
@@ -116,19 +118,29 @@ def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
         torch.manual_seed(0)
         scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)
 
-        llama_text_encoder_config = LlamaConfig(
+        text_config = LlamaConfig(
             bos_token_id=0,
             eos_token_id=2,
             hidden_size=16,
             intermediate_size=37,
             layer_norm_eps=1e-05,
             num_attention_heads=4,
             num_hidden_layers=2,
-            pad_token_id=1,
+            pad_token_id=100,
             vocab_size=1000,
             hidden_act="gelu",
             projection_dim=32,
         )
+        vision_config = CLIPVisionConfig(
+            hidden_size=8,
+            intermediate_size=37,
+            projection_dim=32,
+            num_attention_heads=4,
+            num_hidden_layers=2,
+            image_size=224,
+        )
+        llava_text_encoder_config = LlavaConfig(vision_config, text_config, pad_token_id=100, image_token_index=101)
+
         clip_text_encoder_config = CLIPTextConfig(
             bos_token_id=0,
             eos_token_id=2,
@@ -144,23 +156,23 @@ def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
         )
 
         torch.manual_seed(0)
-        text_encoder = LlamaModel(llama_text_encoder_config)
-        tokenizer = LlamaTokenizer.from_pretrained("finetrainers/dummy-hunyaunvideo", subfolder="tokenizer")
+        text_encoder = LlavaForConditionalGeneration(llava_text_encoder_config)
+        tokenizer = LlamaTokenizerFast.from_pretrained("finetrainers/dummy-hunyaunvideo", subfolder="tokenizer")
 
         torch.manual_seed(0)
         text_encoder_2 = CLIPTextModel(clip_text_encoder_config)
         tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
 
         torch.manual_seed(0)
         image_processor = CLIPImageProcessor(
-            crop_size=336,
+            crop_size=224,
             do_center_crop=True,
             do_normalize=True,
             do_resize=True,
             image_mean=[0.48145466, 0.4578275, 0.40821073],
             image_std=[0.26862954, 0.26130258, 0.27577711],
             resample=3,
-            size=336,
+            size=224,
         )
 
         components = {
@@ -190,14 +202,18 @@ def get_dummy_inputs(self, device, seed=0):
             "prompt_template": {
                 "template": "{}",
                 "crop_start": 0,
+                "image_emb_len": 49,
+                "image_emb_start": 5,
+                "image_emb_end": 54,
+                "double_return_token_id": 0,
             },
             "generator": generator,
             "num_inference_steps": 2,
             "guidance_scale": 4.5,
             "height": image_height,
             "width": image_width,
             "num_frames": 9,
-            "max_sequence_length": 16,
+            "max_sequence_length": 64,
             "output_type": "pt",
         }
         return inputs
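
The new text-encoder setup can be exercised on its own. Below is a minimal sketch that mirrors the dummy Llava configuration from `get_dummy_components` in this diff (same class names and sizes); it is illustrative rather than the test's actual code, and assumes a `transformers` version that ships `LlavaForConditionalGeneration`.

```python
# Minimal sketch of the Llava-based dummy text encoder configured in the diff above.
# Class names and sizes come from the diff; the standalone structure is illustrative.
import torch
from transformers import LlamaConfig, LlavaConfig, LlavaForConditionalGeneration
from transformers.models.clip import CLIPVisionConfig

text_config = LlamaConfig(
    bos_token_id=0,
    eos_token_id=2,
    hidden_size=16,
    intermediate_size=37,
    num_attention_heads=4,
    num_hidden_layers=2,
    pad_token_id=100,
    vocab_size=1000,
)
vision_config = CLIPVisionConfig(
    hidden_size=8,
    intermediate_size=37,
    projection_dim=32,
    num_attention_heads=4,
    num_hidden_layers=2,
    # With CLIPVisionConfig's default patch_size of 32, a 224x224 input yields
    # (224 // 32) ** 2 = 49 patch embeddings, matching "image_emb_len": 49 above.
    image_size=224,
)
llava_config = LlavaConfig(vision_config, text_config, pad_token_id=100, image_token_index=101)

torch.manual_seed(0)
text_encoder = LlavaForConditionalGeneration(llava_config)
print(sum(p.numel() for p in text_encoder.parameters()))  # tiny model, cheap to run in tests
```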