
Commit 1f8d908

[model] support Tencent-Hunyuan/Hunyuan-A13B-Instruct (#4745)
1 parent 696fad6 commit 1f8d908

File tree

16 files changed: +142 −23 lines changed

docs/source/Instruction/支持的模型和数据集.md (Supported models and datasets, Chinese docs)

Lines changed: 1 addition & 0 deletions

@@ -559,6 +559,7 @@
 |[XiaomiMiMo/MiMo-7B-RL-0530](https://modelscope.cn/models/XiaomiMiMo/MiMo-7B-RL-0530)|mimo_rl|mimo_rl|transformers>=4.37|✔|-|[XiaomiMiMo/MiMo-7B-RL-0530](https://huggingface.co/XiaomiMiMo/MiMo-7B-RL-0530)|
 |[rednote-hilab/dots.llm1.base](https://modelscope.cn/models/rednote-hilab/dots.llm1.base)|dots1|dots1|transformers>=4.53.0.dev0|✔|-|[rednote-hilab/dots.llm1.base](https://huggingface.co/rednote-hilab/dots.llm1.base)|
 |[rednote-hilab/dots.llm1.inst](https://modelscope.cn/models/rednote-hilab/dots.llm1.inst)|dots1|dots1|transformers>=4.53.0.dev0|✔|-|[rednote-hilab/dots.llm1.inst](https://huggingface.co/rednote-hilab/dots.llm1.inst)|
+|[Tencent-Hunyuan/Hunyuan-A13B-Instruct](https://modelscope.cn/models/Tencent-Hunyuan/Hunyuan-A13B-Instruct)|hunyuan|hunyuan|-|✘|-|[tencent/Hunyuan-A13B-Instruct](https://huggingface.co/tencent/Hunyuan-A13B-Instruct)|
 |[answerdotai/ModernBERT-base](https://modelscope.cn/models/answerdotai/ModernBERT-base)|modern_bert|dummy|transformers>=4.48|✘|bert|[answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base)|
 |[answerdotai/ModernBERT-large](https://modelscope.cn/models/answerdotai/ModernBERT-large)|modern_bert|dummy|transformers>=4.48|✘|bert|[answerdotai/ModernBERT-large](https://huggingface.co/answerdotai/ModernBERT-large)|
 |[iic/gte-modernbert-base](https://modelscope.cn/models/iic/gte-modernbert-base)|modern_bert_gte|dummy|transformers>=4.48|✘|bert, embedding|[Alibaba-NLP/gte-modernbert-base](https://huggingface.co/Alibaba-NLP/gte-modernbert-base)|

docs/source_en/Instruction/Supported-models-and-datasets.md

Lines changed: 1 addition & 0 deletions

@@ -559,6 +559,7 @@ The table below introduces the models integrated with ms-swift:
 |[XiaomiMiMo/MiMo-7B-RL-0530](https://modelscope.cn/models/XiaomiMiMo/MiMo-7B-RL-0530)|mimo_rl|mimo_rl|transformers>=4.37|✔|-|[XiaomiMiMo/MiMo-7B-RL-0530](https://huggingface.co/XiaomiMiMo/MiMo-7B-RL-0530)|
 |[rednote-hilab/dots.llm1.base](https://modelscope.cn/models/rednote-hilab/dots.llm1.base)|dots1|dots1|transformers>=4.53.0.dev0|✔|-|[rednote-hilab/dots.llm1.base](https://huggingface.co/rednote-hilab/dots.llm1.base)|
 |[rednote-hilab/dots.llm1.inst](https://modelscope.cn/models/rednote-hilab/dots.llm1.inst)|dots1|dots1|transformers>=4.53.0.dev0|✔|-|[rednote-hilab/dots.llm1.inst](https://huggingface.co/rednote-hilab/dots.llm1.inst)|
+|[Tencent-Hunyuan/Hunyuan-A13B-Instruct](https://modelscope.cn/models/Tencent-Hunyuan/Hunyuan-A13B-Instruct)|hunyuan|hunyuan|-|✘|-|[tencent/Hunyuan-A13B-Instruct](https://huggingface.co/tencent/Hunyuan-A13B-Instruct)|
 |[answerdotai/ModernBERT-base](https://modelscope.cn/models/answerdotai/ModernBERT-base)|modern_bert|dummy|transformers>=4.48|✘|bert|[answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base)|
 |[answerdotai/ModernBERT-large](https://modelscope.cn/models/answerdotai/ModernBERT-large)|modern_bert|dummy|transformers>=4.48|✘|bert|[answerdotai/ModernBERT-large](https://huggingface.co/answerdotai/ModernBERT-large)|
 |[iic/gte-modernbert-base](https://modelscope.cn/models/iic/gte-modernbert-base)|modern_bert_gte|dummy|transformers>=4.48|✘|bert, embedding|[Alibaba-NLP/gte-modernbert-base](https://huggingface.co/Alibaba-NLP/gte-modernbert-base)|

swift/llm/model/constant.py

Lines changed: 1 addition & 0 deletions

@@ -119,6 +119,7 @@ class LLMModelType:
     mimo = 'mimo'
     mimo_rl = 'mimo_rl'
     dots1 = 'dots1'
+    hunyuan = 'hunyuan'
 
 
 class BertModelType:

swift/llm/model/model/llm.py

Lines changed: 11 additions & 0 deletions

@@ -343,3 +343,14 @@ def forward(self, **kwargs):
         architectures=['Dots1ForCausalLM'],
         requires=['transformers>=4.53.0.dev0'],
     ))
+
+register_model(
+    ModelMeta(
+        LLMModelType.hunyuan,
+        [ModelGroup([
+            Model('Tencent-Hunyuan/Hunyuan-A13B-Instruct', 'tencent/Hunyuan-A13B-Instruct'),
+        ])],
+        TemplateType.hunyuan,
+        get_model_tokenizer_with_flash_attn,
+        architectures=['HunYuanMoEV1ForCausalLM'],
+    ))
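Once this ModelMeta is registered, the ModelScope id resolves through ms-swift's normal loading path. A minimal sketch of exercising the registration, assuming the `load_model` keyword of `get_model_tokenizer` (the sketch is not part of this commit):

```python
from swift.llm import get_model_tokenizer

# Resolve 'Tencent-Hunyuan/Hunyuan-A13B-Instruct' against the ModelMeta
# registered above; load_model=False (an assumed kwarg) skips loading the
# weights, so only the model_type/template wiring and tokenizer are checked.
model, tokenizer = get_model_tokenizer('Tencent-Hunyuan/Hunyuan-A13B-Instruct', load_model=False)
print(tokenizer.__class__.__name__)  # model is None when load_model=False
```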

swift/llm/template/base.py

Lines changed: 4 additions & 1 deletion

@@ -1034,7 +1034,10 @@ def _swift_encode(self, inputs: StdTemplateInputs):
             res_context_list.append(bos_token)
             res_context_types.append(ContextType.OTHER)
 
-        prefix = template_meta.system_prefix if system else template_meta.prefix
+        if self.template_meta.is_post_system or not system:
+            prefix = template_meta.prefix
+        else:
+            prefix = template_meta.system_prefix
         self._concat_context_list(prefix, res_context_list, res_context_types, system=system)
 
         n_round = len(inputs.messages) // 2
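Previously the one-liner picked `system_prefix` whenever a system message was present. That breaks post-system templates, where `{{SYSTEM}}` is rendered inside the prompt itself and `system_prefix` may be undefined. A standalone restatement of the new branch, for illustration only:

```python
def choose_prefix(template_meta, system):
    # Post-system templates render {{SYSTEM}} inside the prompt itself,
    # so the bare prefix is correct even when a system message exists.
    if template_meta.is_post_system or not system:
        return template_meta.prefix
    return template_meta.system_prefix
```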

swift/llm/template/constant.py

Lines changed: 1 addition & 0 deletions

@@ -85,6 +85,7 @@ class LLMTemplateType:
     moonlight = 'moonlight'
     mimo_rl = 'mimo_rl'
     dots1 = 'dots1'
+    hunyuan = 'hunyuan'
 
     aya = 'aya'
     c4ai = 'c4ai'

swift/llm/template/template/llm.py

Lines changed: 10 additions & 0 deletions

@@ -289,3 +289,13 @@ class TeleChatTemplateMeta(TemplateMeta):
         suffix=['<|endofresponse|>'],
         default_system='You are a helpful assistant.',
     ))
+
+register_template(
+    TemplateMeta(
+        LLMTemplateType.hunyuan,
+        prefix=['<|startoftext|>'],
+        system_prefix=['<|startoftext|>{{SYSTEM}}<|extra_4|>'],
+        prompt=['{{QUERY}}<|extra_0|>'],
+        chat_sep=['<|eos|><|startoftext|>'],
+        suffix=['<|eos|>'],
+    ))
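Pieced together by hand from the fields above (illustrative, not output captured from the library), a system message followed by two chat rounds renders as:

```python
system = 'You are a helpful assistant.'
rendered = (
    '<|startoftext|>' + system + '<|extra_4|>'  # system_prefix
    + 'Hello<|extra_0|>'                        # prompt, round 1
    + 'Hi there!'                               # assistant reply
    + '<|eos|><|startoftext|>'                  # chat_sep
    + 'How are you?<|extra_0|>'                 # prompt, round 2
)
# The final assistant reply is then terminated with the suffix '<|eos|>'.
```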

swift/llm/template/template_meta.py

Lines changed: 1 addition & 1 deletion

@@ -82,8 +82,8 @@ def __post_init__(self):
 
         self.is_post_system = self._has_system(self.prompt)  # mistral_nemo
         if self.is_post_system:
-            self.prompt = [context for context in self.prompt if '{{SYSTEM}}' not in context]
             self.system_prompt = self.prompt
+            self.prompt = [context for context in self.prompt if '{{SYSTEM}}' not in context]
 
         if self.system_prefix is None and not self.is_post_system:
             self.support_system = False
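The swap fixes an ordering bug: `system_prompt` was assigned the already-filtered list, so post-system templates silently lost their `{{SYSTEM}}` placeholder. A minimal reproduction with a made-up post-system prompt:

```python
prompt = ['{{SYSTEM}}\n', '{{QUERY}}']  # made-up post-system prompt

# Old order: filter first, then alias -- the placeholder is gone from both.
old_system_prompt = [c for c in prompt if '{{SYSTEM}}' not in c]
assert old_system_prompt == ['{{QUERY}}']

# New order: alias first, then filter -- system_prompt keeps {{SYSTEM}}.
system_prompt = prompt
prompt = [c for c in prompt if '{{SYSTEM}}' not in c]
assert system_prompt == ['{{SYSTEM}}\n', '{{QUERY}}']
assert prompt == ['{{QUERY}}']
```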

swift/megatron/model/config.py

Lines changed: 5 additions & 2 deletions

@@ -24,7 +24,7 @@
     # moe
     'moe_ffn_hidden_size': ['moe_intermediate_size'],
     'moe_shared_expert_intermediate_size': ['shared_expert_intermediate_size'],
-    'moe_router_topk': ['num_experts_per_tok', 'n_group'],
+    'moe_router_topk': ['num_experts_per_tok', 'n_group', 'moe_topk'],
     'num_experts': ['num_experts', 'n_routed_experts'],
     'moe_router_pre_softmax': ['norm_topk_prob'],
     'moe_aux_loss_coeff': ['router_aux_loss_coef'],
@@ -35,11 +35,12 @@
     'qk_head_dim': ['qk_nope_head_dim'],
     'qk_pos_emb_head_dim': ['qk_rope_head_dim'],
     'moe_router_topk_scaling_factor': ['routed_scaling_factor'],
+    'qk_layernorm': ['use_qk_norm'],
     # other
     'original_max_position_embeddings': ['original_max_position_embeddings'],
     'partial_rotary_factor': ['partial_rotary_factor'],
     'first_k_dense_replace': ['first_k_dense_replace'],
-    'n_shared_experts': ['n_shared_experts']
+    'n_shared_experts': ['n_shared_experts', 'num_shared_expert'],
 }
@@ -49,6 +50,8 @@ def convert_hf_config(config) -> Dict[str, Any]:
         for hf_k in hf_keys:
             if hasattr(config, hf_k):
                 hf_v = getattr(config, hf_k)
+                if hf_v is None:
+                    continue
                 if k == 'rotary_base':
                     megatron_config[k] = int(hf_v)
                 elif k in {'untie_embeddings_and_output_weights', 'disable_bias_linear', 'moe_router_pre_softmax'}:
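The net effect for a HunYuan-style config is that `moe_topk`, `num_shared_expert`, and `use_qk_norm` now map onto Megatron keys, and attributes that exist but are set to None no longer override defaults. A sketch with made-up attribute values (not copied from the real config.json):

```python
from types import SimpleNamespace

hf_config = SimpleNamespace(
    moe_topk=8,           # -> moe_router_topk   (new alias)
    num_shared_expert=1,  # -> n_shared_experts  (new alias)
    use_qk_norm=True,     # -> qk_layernorm      (new alias)
    norm_topk_prob=None,  # present but None -> skipped by the new guard
)
# convert_hf_config(hf_config) would now be expected to yield, roughly:
#   {'moe_router_topk': 8, 'n_shared_experts': 1, 'qk_layernorm': True}
# with no 'moe_router_pre_softmax' entry, since its source value was None.
```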

swift/megatron/model/gpt/config.py

Lines changed: 10 additions & 0 deletions

@@ -30,4 +30,14 @@ def convert_gpt_hf_config(config) -> Dict[str, Any]:
     if res.get('moe_router_score_function', 'softmax') == 'sigmoid':
         res['moe_router_enable_expert_bias'] = True
     res['moe_layer_freq'] = f'[0]*{first_k_dense_replace}+[1]*{res["num_layers"] - first_k_dense_replace}'
+    if architectures == 'HunYuanMoEV1ForCausalLM':
+        # Since HunYuan's attention applies RoPE before q/k_layernorm,
+        # which is incompatible with megatron-core, support is not provided here.
+        res['qk_layernorm'] = False
+        for key in ['moe_ffn_hidden_size', 'n_shared_experts', 'moe_router_topk']:
+            val = res.get(key)
+            if isinstance(val, list) and val and min(val) == max(val):
+                res[key] = val[0]
+        n_shared_experts = res.pop('n_shared_experts')
+        res['moe_shared_expert_intermediate_size'] = n_shared_experts * res['moe_ffn_hidden_size']
     return res
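The HunYuan branch copes with per-layer list values in the HF config: a uniform list collapses to a scalar, and the shared-expert width Megatron expects is derived as `num_shared_expert * moe_intermediate_size`. A worked run with made-up uniform values:

```python
res = {
    'moe_ffn_hidden_size': [3072, 3072, 3072],
    'n_shared_experts': [1, 1, 1],
    'moe_router_topk': [8, 8, 8],
}
for key in ['moe_ffn_hidden_size', 'n_shared_experts', 'moe_router_topk']:
    val = res.get(key)
    if isinstance(val, list) and val and min(val) == max(val):
        res[key] = val[0]  # uniform per-layer list -> scalar
n_shared_experts = res.pop('n_shared_experts')
res['moe_shared_expert_intermediate_size'] = n_shared_experts * res['moe_ffn_hidden_size']
assert res == {'moe_ffn_hidden_size': 3072, 'moe_router_topk': 8,
               'moe_shared_expert_intermediate_size': 3072}
```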
