Commit 9aa5c16

fix: config load
1 parent 6b2b6c4 commit 9aa5c16
6 files changed: +255 -147 lines changed


diffsynth/configs/model_configs.py

Lines changed: 0 additions & 38 deletions
@@ -32,61 +32,23 @@
         "model_name": "hunyuan_dit",
         "model_class": "diffsynth.models.hunyuanimage_dit.HYImageDiffusionTransformer",
         "state_dict_converter": "diffsynth.utils.state_dict_converters.hunyuan_dit_converter.HunyuanDiTStateDictConverter",
-        "extra_kwargs": {
-            "in_channels": 64,
-            "out_channels": 64,
-            "mm_double_blocks_depth": 20,
-            "mm_single_blocks_depth": 40,
-            "rope_dim_list": [64, 64],
-            "hidden_size": 3584,
-            "heads_num": 28,
-            "mlp_width_ratio": 4,
-            "patch_size": [1, 1],
-            "text_states_dim": 3584,
-            "glyph_byT5_v2": True,
-            "guidance_embed": False,
-        }
     },
     {
         "model_hash": "17119adfcaec79e9045b50274d51c65e",
         "model_name": "hunyuan_vae",
         "model_class": "diffsynth.models.hunyuanimage_vae.HunyuanImageVAE2D",
         "state_dict_converter": "diffsynth.utils.state_dict_converters.hunyuan_vae_converter.HunyuanVAEStateDictConverter",
-        "extra_kwargs": {
-            "in_channels": 3,
-            "out_channels": 3,
-            "latent_channels": 64,
-            "block_out_channels": [ 128, 256, 512, 512, 1024, 1024 ],
-            "layers_per_block": 2,
-            "ffactor_spatial": 32,
-            "sample_size": 384,
-            "sample_tsize": 96,
-            "scaling_factor": 0.75289,
-            "downsample_match_channel": True,
-            "upsample_match_channel": True
-        }
     },
     {
         "model_hash": "8004730443f55db63092006dd9f7110e",
         "model_name": "hunyuan_text_encoder_qwen",
         "model_class": "diffsynth.models.hunyuanimage_text_encoder_qwen.HunyuanImage_TextEncoder_Qwen",
         "state_dict_converter": "diffsynth.utils.state_dict_converters.hunyuan_qwen_converter.HunyuanQwenStateDictConverter",
-        "extra_kwargs": {
-            "model_path": "Qwen/Qwen2.5-VL-7B-Instruct",
-            "apply_final_norm": False,
-            "hidden_state_skip_layer": 2,
-            "crop_start": 34
-        }
     },
     {
         "model_hash": "e47fee6f4928b305e2fd32bd45ef1950",
         "model_name": "hunyuan_text_encoder_t5",
         "model_class": "diffsynth.models.hunyuanimage_text_encoder_t5.HunyuanImage_ByT5",
         "state_dict_converter": "diffsynth.utils.state_dict_converters.hunyuan_t5_converter.HunyuanT5StateDictConverter",
-        "extra_kwargs": {
-            "model_path": "google/byt5-small",
-            "color_ann_path": "AI-ModelScope/Glyph-SDXL-v2",
-            "font_ann_path": "AI-ModelScope/Glyph-SDXL-v2"
-        }
     }
]
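
Note: this commit removes the hard-coded "extra_kwargs" blocks from the HunyuanImage entries; the same values are moved into the model classes' constructor defaults (see the hunyuanimage_dit.py and hunyuanimage_text_encoder_qwen.py hunks below), so the config loader only needs the "model_class" path and the state dict converter. A minimal sketch of how an entry can be resolved after this change (load_model_from_config is a hypothetical helper, not part of diffsynth):

import importlib

def load_model_from_config(entry: dict):
    # Resolve the dotted "model_class" path from the config entry.
    module_name, class_name = entry["model_class"].rsplit(".", 1)
    model_cls = getattr(importlib.import_module(module_name), class_name)
    # Architecture hyperparameters now come from the class's own defaults,
    # so no extra keyword arguments have to be forwarded.
    return model_cls()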

diffsynth/models/hunyuanimage_dit.py

Lines changed: 8 additions & 8 deletions
@@ -1474,16 +1474,16 @@ class HYImageDiffusionTransformer(torch.nn.Module):
     # @register_to_config
     def __init__(
         self,
-        patch_size: list = [1, 2, 2],
-        in_channels: int = 4,
-        out_channels: int = None,
-        hidden_size: int = 3072,
-        heads_num: int = 24,
+        patch_size: list = [1, 1],
+        in_channels: int = 64,
+        out_channels: int = 64,
+        hidden_size: int = 3584,
+        heads_num: int = 28,
         mlp_width_ratio: float = 4.0,
         mlp_act_type: str = "gelu_tanh",
         mm_double_blocks_depth: int = 20,
         mm_single_blocks_depth: int = 40,
-        rope_dim_list: List[int] = [16, 56, 56],
+        rope_dim_list: List[int] = [64, 64],
         qkv_bias: bool = True,
         qk_norm: bool = True,
         qk_norm_type: str = "rms",
@@ -1492,9 +1492,9 @@ def __init__(
         use_attention_mask: bool = True,
         dtype: Optional[torch.dtype] = None,
         device: Optional[torch.device] = None,
-        text_states_dim: int = 4096,
+        text_states_dim: int = 3584,
         rope_theta: int = 256,
-        glyph_byT5_v2: bool = False,
+        glyph_byT5_v2: bool = True,
         use_meanflow: bool = False,
     ):
factory_kwargs = {"device": device, "dtype": dtype}
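
Note: the new __init__ defaults mirror the extra_kwargs removed from model_configs.py (patch_size [1, 1], in_channels/out_channels 64, hidden_size 3584, 28 heads, rope_dim_list [64, 64], text_states_dim 3584, glyph_byT5_v2 True). A minimal usage sketch, assuming diffsynth is importable and the remaining parameters keep their existing defaults:

from diffsynth.models.hunyuanimage_dit import HYImageDiffusionTransformer

# With the HunyuanImage architecture baked into the defaults, the transformer
# can be constructed without per-model kwargs; dtype/device can still be passed.
model = HYImageDiffusionTransformer()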

diffsynth/models/hunyuanimage_text_encoder_qwen.py

Lines changed: 140 additions & 5 deletions
@@ -4,7 +4,7 @@
 
 import torch
 import torch.nn as nn
-from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoConfig
+from transformers import Qwen2_5_VLConfig, Qwen2_5_VLForConditionalGeneration
 
 from transformers.utils import ModelOutput
 
@@ -33,7 +33,7 @@ class TextEncoderModelOutput(ModelOutput):
 class HunyuanImage_TextEncoder_Qwen(nn.Module):
     def __init__(
         self,
-        model_path: str,
+        model_path: str = "Qwen/Qwen2.5-VL-7B-Instruct",
         apply_final_norm: bool = False,
         hidden_state_skip_layer: Optional[int] = 2,
         crop_start: int = 34,
@@ -43,9 +43,144 @@ def __init__(
         self.apply_final_norm = apply_final_norm
         self.hidden_state_skip_layer = hidden_state_skip_layer
         self.crop_start = crop_start
-
-        config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
-        self.model = AutoModelForVision2Seq.from_config(config, trust_remote_code=True)
+
+        config = Qwen2_5_VLConfig(**{
+            "architectures": ["Qwen2_5_VLForConditionalGeneration"],
+            "attention_dropout": 0.0,
+            "bos_token_id": 151643,
+            "eos_token_id": 151645,
+            "hidden_act": "silu",
+            "hidden_size": 3584,
+            "image_token_id": 151655,
+            "initializer_range": 0.02,
+            "intermediate_size": 18944,
+            "max_position_embeddings": 128000,
+            "max_window_layers": 28,
+            "model_type": "qwen2_5_vl",
+            "num_attention_heads": 28,
+            "num_hidden_layers": 28,
+            "num_key_value_heads": 4,
+            "rms_norm_eps": 1e-06,
+            "rope_scaling": {
+                "mrope_section": [16, 24, 24],
+                "rope_type": "default",
+                "type": "default"
+            },
+            "rope_theta": 1000000.0,
+            "sliding_window": 32768,
+            "text_config": {
+                "architectures": ["Qwen2_5_VLForConditionalGeneration"],
+                "attention_dropout": 0.0,
+                "bos_token_id": 151643,
+                "eos_token_id": 151645,
+                "hidden_act": "silu",
+                "hidden_size": 3584,
+                "image_token_id": None,
+                "initializer_range": 0.02,
+                "intermediate_size": 18944,
+                "layer_types": [
+                    "full_attention", "full_attention", "full_attention", "full_attention",
+                    "full_attention", "full_attention", "full_attention", "full_attention",
+                    "full_attention", "full_attention", "full_attention", "full_attention",
+                    "full_attention", "full_attention", "full_attention", "full_attention",
+                    "full_attention", "full_attention", "full_attention", "full_attention",
+                    "full_attention", "full_attention", "full_attention", "full_attention",
+                    "full_attention", "full_attention", "full_attention", "full_attention"
+                ],
+                "max_position_embeddings": 128000,
+                "max_window_layers": 28,
+                "model_type": "qwen2_5_vl_text",
+                "num_attention_heads": 28,
+                "num_hidden_layers": 28,
+                "num_key_value_heads": 4,
+                "rms_norm_eps": 1e-06,
+                "rope_scaling": {
+                    "mrope_section": [16, 24, 24],
+                    "rope_type": "default",
+                    "type": "default"
+                },
+                "rope_theta": 1000000.0,
+                "sliding_window": None,
+                "torch_dtype": "float32",
+                "use_cache": True,
+                "use_sliding_window": False,
+                "video_token_id": None,
+                "vision_end_token_id": 151653,
+                "vision_start_token_id": 151652,
+                "vision_token_id": 151654,
+                "vocab_size": 152064
+            },
+            "tie_word_embeddings": False,
+            "torch_dtype": "float32",
+            "transformers_version": "4.54.0",
+            "use_cache": True,
+            "use_sliding_window": False,
+            "video_token_id": 151656,
+            "vision_config": {
+                "depth": 32,
+                "fullatt_block_indexes": [7, 15, 23, 31],
+                "hidden_act": "silu",
+                "hidden_size": 1280,
+                "in_channels": 3,
+                "in_chans": 3,
+                "initializer_range": 0.02,
+                "intermediate_size": 3420,
+                "model_type": "qwen2_5_vl",
+                "num_heads": 16,
+                "out_hidden_size": 3584,
+                "patch_size": 14,
+                "spatial_merge_size": 2,
+                "spatial_patch_size": 14,
+                "temporal_patch_size": 2,
+                "tokens_per_second": 2,
+                "torch_dtype": "float32",
+                "window_size": 112
+            },
+            "vision_end_token_id": 151653,
+            "vision_start_token_id": 151652,
+            "vision_token_id": 151654,
+            "vocab_size": 152064
+        })
+
+        self.model = Qwen2_5_VLForConditionalGeneration(config)
 
self.output_key = "last_hidden_state"
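
Note: this hunk is the actual "config load" fix. Instead of fetching the Qwen2.5-VL configuration at runtime via AutoConfig.from_pretrained (which needs the Hugging Face repo cached on disk or network access), the encoder now builds a Qwen2_5_VLConfig from the inline dict and instantiates Qwen2_5_VLForConditionalGeneration directly; the weights are presumably loaded afterwards through the registered state dict converter. A minimal usage sketch, assuming diffsynth is importable and a transformers build that ships Qwen2.5-VL support (the dict above pins transformers_version "4.54.0"):

from diffsynth.models.hunyuanimage_text_encoder_qwen import HunyuanImage_TextEncoder_Qwen

# The architecture comes from the inline config, so constructing the encoder no
# longer requires the "Qwen/Qwen2.5-VL-7B-Instruct" config to be downloadable;
# actual weights still have to be loaded into encoder.model separately.
encoder = HunyuanImage_TextEncoder_Qwen()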
