 
 import torch
 import torch.nn as nn
-from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoConfig
+from transformers import Qwen2_5_VLConfig, Qwen2_5_VLForConditionalGeneration
 
 from transformers.utils import ModelOutput
 
@@ -33,7 +33,7 @@ class TextEncoderModelOutput(ModelOutput):
 class HunyuanImage_TextEncoder_Qwen(nn.Module):
     def __init__(
         self,
-        model_path: str,
+        model_path: str = "Qwen/Qwen2.5-VL-7B-Instruct",
         apply_final_norm: bool = False,
         hidden_state_skip_layer: Optional[int] = 2,
         crop_start: int = 34,
@@ -43,9 +43,144 @@ def __init__(
         self.apply_final_norm = apply_final_norm
         self.hidden_state_skip_layer = hidden_state_skip_layer
         self.crop_start = crop_start
-
-        config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
-        self.model = AutoModelForVision2Seq.from_config(config, trust_remote_code=True)
+
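+        # Hard-coded config for Qwen/Qwen2.5-VL-7B-Instruct (the default
+        # model_path above); building it inline avoids the network fetch and
+        # trust_remote_code that AutoConfig.from_pretrained required.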
+        config = Qwen2_5_VLConfig(**{
+            "architectures": ["Qwen2_5_VLForConditionalGeneration"],
+            "attention_dropout": 0.0,
+            "bos_token_id": 151643,
+            "eos_token_id": 151645,
+            "hidden_act": "silu",
+            "hidden_size": 3584,
+            "image_token_id": 151655,
+            "initializer_range": 0.02,
+            "intermediate_size": 18944,
+            "max_position_embeddings": 128000,
+            "max_window_layers": 28,
+            "model_type": "qwen2_5_vl",
+            "num_attention_heads": 28,
+            "num_hidden_layers": 28,
+            "num_key_value_heads": 4,
+            "rms_norm_eps": 1e-06,
+            "rope_scaling": {
+                "mrope_section": [16, 24, 24],
+                "rope_type": "default",
+                "type": "default"
+            },
+            "rope_theta": 1000000.0,
+            "sliding_window": 32768,
+            "text_config": {
+                "architectures": ["Qwen2_5_VLForConditionalGeneration"],
+                "attention_dropout": 0.0,
+                "bos_token_id": 151643,
+                "eos_token_id": 151645,
+                "hidden_act": "silu",
+                "hidden_size": 3584,
+                "image_token_id": None,
+                "initializer_range": 0.02,
+                "intermediate_size": 18944,
+                "layer_types": ["full_attention"] * 28,
+                "max_position_embeddings": 128000,
+                "max_window_layers": 28,
+                "model_type": "qwen2_5_vl_text",
+                "num_attention_heads": 28,
+                "num_hidden_layers": 28,
+                "num_key_value_heads": 4,
+                "rms_norm_eps": 1e-06,
+                "rope_scaling": {
+                    "mrope_section": [16, 24, 24],
+                    "rope_type": "default",
+                    "type": "default"
+                },
+                "rope_theta": 1000000.0,
+                "sliding_window": None,
+                "torch_dtype": "float32",
+                "use_cache": True,
+                "use_sliding_window": False,
+                "video_token_id": None,
+                "vision_end_token_id": 151653,
+                "vision_start_token_id": 151652,
+                "vision_token_id": 151654,
+                "vocab_size": 152064
+            },
+            "tie_word_embeddings": False,
+            "torch_dtype": "float32",
+            "transformers_version": "4.54.0",
+            "use_cache": True,
+            "use_sliding_window": False,
+            "video_token_id": 151656,
+            "vision_config": {
+                "depth": 32,
+                "fullatt_block_indexes": [7, 15, 23, 31],
+                "hidden_act": "silu",
+                "hidden_size": 1280,
+                "in_channels": 3,
+                "in_chans": 3,
+                "initializer_range": 0.02,
+                "intermediate_size": 3420,
+                "model_type": "qwen2_5_vl",
+                "num_heads": 16,
+                "out_hidden_size": 3584,
+                "patch_size": 14,
+                "spatial_merge_size": 2,
+                "spatial_patch_size": 14,
+                "temporal_patch_size": 2,
+                "tokens_per_second": 2,
+                "torch_dtype": "float32",
+                "window_size": 112
+            },
+            "vision_end_token_id": 151653,
+            "vision_start_token_id": 151652,
+            "vision_token_id": 151654,
+            "vocab_size": 152064
+        })
+
+        self.model = Qwen2_5_VLForConditionalGeneration(config)
 
         self.output_key = "last_hidden_state"
 
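Since Qwen2_5_VLForConditionalGeneration(config) builds a randomly initialized skeleton rather than downloading weights, the caller still has to load a checkpoint into encoder.model. A minimal sketch of that step, assuming a single local safetensors file (the checkpoint path and the load_state_dict-based loading are assumptions, not part of this commit; only the constructor signature comes from the diff above):

import torch
from safetensors.torch import load_file

# Build the offline skeleton defined above (weights are random at this point).
encoder = HunyuanImage_TextEncoder_Qwen()

# Load locally stored Qwen2.5-VL weights; the checkpoint path is hypothetical,
# and a real 7B checkpoint is typically sharded across several files.
state_dict = load_file("checkpoints/qwen2_5_vl_7b_instruct.safetensors")
missing, unexpected = encoder.model.load_state_dict(state_dict, strict=False)
print(f"missing keys: {len(missing)}, unexpected keys: {len(unexpected)}")

encoder.model.to(dtype=torch.bfloat16).eval()

This keeps model construction fully offline and deterministic; only the weight file itself needs to be present locally.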