
Commit 1f8d908

[model] support Tencent-Hunyuan/Hunyuan-A13B-Instruct (#4745)
1 parent 696fad6 commit 1f8d908

File tree

16 files changed: +142 −23 lines changed

docs/source/Instruction/支持的模型和数据集.md (Supported models and datasets, Chinese docs)

Lines changed: 1 addition & 0 deletions

@@ -559,6 +559,7 @@
 |[XiaomiMiMo/MiMo-7B-RL-0530](https://modelscope.cn/models/XiaomiMiMo/MiMo-7B-RL-0530)|mimo_rl|mimo_rl|transformers>=4.37|✔|-|[XiaomiMiMo/MiMo-7B-RL-0530](https://huggingface.co/XiaomiMiMo/MiMo-7B-RL-0530)|
 |[rednote-hilab/dots.llm1.base](https://modelscope.cn/models/rednote-hilab/dots.llm1.base)|dots1|dots1|transformers>=4.53.0.dev0|✔|-|[rednote-hilab/dots.llm1.base](https://huggingface.co/rednote-hilab/dots.llm1.base)|
 |[rednote-hilab/dots.llm1.inst](https://modelscope.cn/models/rednote-hilab/dots.llm1.inst)|dots1|dots1|transformers>=4.53.0.dev0|✔|-|[rednote-hilab/dots.llm1.inst](https://huggingface.co/rednote-hilab/dots.llm1.inst)|
+|[Tencent-Hunyuan/Hunyuan-A13B-Instruct](https://modelscope.cn/models/Tencent-Hunyuan/Hunyuan-A13B-Instruct)|hunyuan|hunyuan|-|✘|-|[tencent/Hunyuan-A13B-Instruct](https://huggingface.co/tencent/Hunyuan-A13B-Instruct)|
 |[answerdotai/ModernBERT-base](https://modelscope.cn/models/answerdotai/ModernBERT-base)|modern_bert|dummy|transformers>=4.48|✘|bert|[answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base)|
 |[answerdotai/ModernBERT-large](https://modelscope.cn/models/answerdotai/ModernBERT-large)|modern_bert|dummy|transformers>=4.48|✘|bert|[answerdotai/ModernBERT-large](https://huggingface.co/answerdotai/ModernBERT-large)|
 |[iic/gte-modernbert-base](https://modelscope.cn/models/iic/gte-modernbert-base)|modern_bert_gte|dummy|transformers>=4.48|✘|bert, embedding|[Alibaba-NLP/gte-modernbert-base](https://huggingface.co/Alibaba-NLP/gte-modernbert-base)|

docs/source_en/Instruction/Supported-models-and-datasets.md

Lines changed: 1 addition & 0 deletions

@@ -559,6 +559,7 @@ The table below introduces the models integrated with ms-swift:
 |[XiaomiMiMo/MiMo-7B-RL-0530](https://modelscope.cn/models/XiaomiMiMo/MiMo-7B-RL-0530)|mimo_rl|mimo_rl|transformers>=4.37|✔|-|[XiaomiMiMo/MiMo-7B-RL-0530](https://huggingface.co/XiaomiMiMo/MiMo-7B-RL-0530)|
 |[rednote-hilab/dots.llm1.base](https://modelscope.cn/models/rednote-hilab/dots.llm1.base)|dots1|dots1|transformers>=4.53.0.dev0|✔|-|[rednote-hilab/dots.llm1.base](https://huggingface.co/rednote-hilab/dots.llm1.base)|
 |[rednote-hilab/dots.llm1.inst](https://modelscope.cn/models/rednote-hilab/dots.llm1.inst)|dots1|dots1|transformers>=4.53.0.dev0|✔|-|[rednote-hilab/dots.llm1.inst](https://huggingface.co/rednote-hilab/dots.llm1.inst)|
+|[Tencent-Hunyuan/Hunyuan-A13B-Instruct](https://modelscope.cn/models/Tencent-Hunyuan/Hunyuan-A13B-Instruct)|hunyuan|hunyuan|-|✘|-|[tencent/Hunyuan-A13B-Instruct](https://huggingface.co/tencent/Hunyuan-A13B-Instruct)|
 |[answerdotai/ModernBERT-base](https://modelscope.cn/models/answerdotai/ModernBERT-base)|modern_bert|dummy|transformers>=4.48|✘|bert|[answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base)|
 |[answerdotai/ModernBERT-large](https://modelscope.cn/models/answerdotai/ModernBERT-large)|modern_bert|dummy|transformers>=4.48|✘|bert|[answerdotai/ModernBERT-large](https://huggingface.co/answerdotai/ModernBERT-large)|
 |[iic/gte-modernbert-base](https://modelscope.cn/models/iic/gte-modernbert-base)|modern_bert_gte|dummy|transformers>=4.48|✘|bert, embedding|[Alibaba-NLP/gte-modernbert-base](https://huggingface.co/Alibaba-NLP/gte-modernbert-base)|

swift/llm/model/constant.py

Lines changed: 1 addition & 0 deletions

@@ -119,6 +119,7 @@ class LLMModelType:
     mimo = 'mimo'
     mimo_rl = 'mimo_rl'
     dots1 = 'dots1'
+    hunyuan = 'hunyuan'
 
 
 class BertModelType:

swift/llm/model/model/llm.py

Lines changed: 11 additions & 0 deletions

@@ -343,3 +343,14 @@ def forward(self, **kwargs):
         architectures=['Dots1ForCausalLM'],
         requires=['transformers>=4.53.0.dev0'],
     ))
+
+register_model(
+    ModelMeta(
+        LLMModelType.hunyuan,
+        [ModelGroup([
+            Model('Tencent-Hunyuan/Hunyuan-A13B-Instruct', 'tencent/Hunyuan-A13B-Instruct'),
+        ])],
+        TemplateType.hunyuan,
+        get_model_tokenizer_with_flash_attn,
+        architectures=['HunYuanMoEV1ForCausalLM'],
+    ))
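Once this ModelMeta is registered, the ModelScope id resolves through ms-swift's normal loading path. A minimal sketch of exercising the registration, assuming the `load_model` keyword of `get_model_tokenizer` (the sketch is not part of this commit):

```python
from swift.llm import get_model_tokenizer

# Resolve 'Tencent-Hunyuan/Hunyuan-A13B-Instruct' against the ModelMeta
# registered above; load_model=False (an assumed kwarg) skips loading the
# weights, so only the model_type/template wiring and tokenizer are checked.
model, tokenizer = get_model_tokenizer('Tencent-Hunyuan/Hunyuan-A13B-Instruct', load_model=False)
print(tokenizer.__class__.__name__)  # model is None when load_model=False
```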

swift/llm/template/base.py

Lines changed: 4 additions & 1 deletion

@@ -1034,7 +1034,10 @@ def _swift_encode(self, inputs: StdTemplateInputs):
             res_context_list.append(bos_token)
             res_context_types.append(ContextType.OTHER)
 
-        prefix = template_meta.system_prefix if system else template_meta.prefix
+        if self.template_meta.is_post_system or not system:
+            prefix = template_meta.prefix
+        else:
+            prefix = template_meta.system_prefix
         self._concat_context_list(prefix, res_context_list, res_context_types, system=system)
 
         n_round = len(inputs.messages) // 2
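Previously the one-liner picked `system_prefix` whenever a system message was present. That breaks post-system templates, where `{{SYSTEM}}` is rendered inside the prompt itself and `system_prefix` may be undefined. A standalone restatement of the new branch, for illustration only:

```python
def choose_prefix(template_meta, system):
    # Post-system templates render {{SYSTEM}} inside the prompt itself,
    # so the bare prefix is correct even when a system message exists.
    if template_meta.is_post_system or not system:
        return template_meta.prefix
    return template_meta.system_prefix
```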

swift/llm/template/constant.py

Lines changed: 1 addition & 0 deletions

@@ -85,6 +85,7 @@ class LLMTemplateType:
     moonlight = 'moonlight'
     mimo_rl = 'mimo_rl'
     dots1 = 'dots1'
+    hunyuan = 'hunyuan'
 
     aya = 'aya'
     c4ai = 'c4ai'

swift/llm/template/template/llm.py

Lines changed: 10 additions & 0 deletions

@@ -289,3 +289,13 @@ class TeleChatTemplateMeta(TemplateMeta):
         suffix=['<|endofresponse|>'],
         default_system='You are a helpful assistant.',
     ))
+
+register_template(
+    TemplateMeta(
+        LLMTemplateType.hunyuan,
+        prefix=['<|startoftext|>'],
+        system_prefix=['<|startoftext|>{{SYSTEM}}<|extra_4|>'],
+        prompt=['{{QUERY}}<|extra_0|>'],
+        chat_sep=['<|eos|><|startoftext|>'],
+        suffix=['<|eos|>'],
+    ))
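Pieced together by hand from the fields above (illustrative, not output captured from the library), a system message followed by two chat rounds renders as:

```python
system = 'You are a helpful assistant.'
rendered = (
    '<|startoftext|>' + system + '<|extra_4|>'  # system_prefix
    + 'Hello<|extra_0|>'                        # prompt, round 1
    + 'Hi there!'                               # assistant reply
    + '<|eos|><|startoftext|>'                  # chat_sep
    + 'How are you?<|extra_0|>'                 # prompt, round 2
)
# The final assistant reply is then terminated with the suffix '<|eos|>'.
```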

swift/llm/template/template_meta.py

Lines changed: 1 addition & 1 deletion

@@ -82,8 +82,8 @@ def __post_init__(self):
 
         self.is_post_system = self._has_system(self.prompt)  # mistral_nemo
         if self.is_post_system:
-            self.prompt = [context for context in self.prompt if '{{SYSTEM}}' not in context]
             self.system_prompt = self.prompt
+            self.prompt = [context for context in self.prompt if '{{SYSTEM}}' not in context]
 
         if self.system_prefix is None and not self.is_post_system:
             self.support_system = False
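The swap fixes an ordering bug: `system_prompt` was assigned the already-filtered list, so post-system templates silently lost their `{{SYSTEM}}` placeholder. A minimal reproduction with a made-up post-system prompt:

```python
prompt = ['{{SYSTEM}}\n', '{{QUERY}}']  # made-up post-system prompt

# Old order: filter first, then alias -- the placeholder is gone from both.
old_system_prompt = [c for c in prompt if '{{SYSTEM}}' not in c]
assert old_system_prompt == ['{{QUERY}}']

# New order: alias first, then filter -- system_prompt keeps {{SYSTEM}}.
system_prompt = prompt
prompt = [c for c in prompt if '{{SYSTEM}}' not in c]
assert system_prompt == ['{{SYSTEM}}\n', '{{QUERY}}']
assert prompt == ['{{QUERY}}']
```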

swift/megatron/model/config.py

Lines changed: 5 additions & 2 deletions

@@ -24,7 +24,7 @@
     # moe
     'moe_ffn_hidden_size': ['moe_intermediate_size'],
     'moe_shared_expert_intermediate_size': ['shared_expert_intermediate_size'],
-    'moe_router_topk': ['num_experts_per_tok', 'n_group'],
+    'moe_router_topk': ['num_experts_per_tok', 'n_group', 'moe_topk'],
     'num_experts': ['num_experts', 'n_routed_experts'],
     'moe_router_pre_softmax': ['norm_topk_prob'],
     'moe_aux_loss_coeff': ['router_aux_loss_coef'],
@@ -35,11 +35,12 @@
     'qk_head_dim': ['qk_nope_head_dim'],
     'qk_pos_emb_head_dim': ['qk_rope_head_dim'],
     'moe_router_topk_scaling_factor': ['routed_scaling_factor'],
+    'qk_layernorm': ['use_qk_norm'],
     # other
     'original_max_position_embeddings': ['original_max_position_embeddings'],
     'partial_rotary_factor': ['partial_rotary_factor'],
     'first_k_dense_replace': ['first_k_dense_replace'],
-    'n_shared_experts': ['n_shared_experts']
+    'n_shared_experts': ['n_shared_experts', 'num_shared_expert'],
 }
@@ -49,6 +50,8 @@ def convert_hf_config(config) -> Dict[str, Any]:
         for hf_k in hf_keys:
             if hasattr(config, hf_k):
                 hf_v = getattr(config, hf_k)
+                if hf_v is None:
+                    continue
                 if k == 'rotary_base':
                     megatron_config[k] = int(hf_v)
                 elif k in {'untie_embeddings_and_output_weights', 'disable_bias_linear', 'moe_router_pre_softmax'}:
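The net effect for a HunYuan-style config is that `moe_topk`, `num_shared_expert`, and `use_qk_norm` now map onto Megatron keys, and attributes that exist but are set to None no longer override defaults. A sketch with made-up attribute values (not copied from the real config.json):

```python
from types import SimpleNamespace

hf_config = SimpleNamespace(
    moe_topk=8,           # -> moe_router_topk   (new alias)
    num_shared_expert=1,  # -> n_shared_experts  (new alias)
    use_qk_norm=True,     # -> qk_layernorm      (new alias)
    norm_topk_prob=None,  # present but None -> skipped by the new guard
)
# convert_hf_config(hf_config) would now be expected to yield, roughly:
#   {'moe_router_topk': 8, 'n_shared_experts': 1, 'qk_layernorm': True}
# with no 'moe_router_pre_softmax' entry, since its source value was None.
```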

swift/megatron/model/gpt/config.py

Lines changed: 10 additions & 0 deletions

@@ -30,4 +30,14 @@ def convert_gpt_hf_config(config) -> Dict[str, Any]:
     if res.get('moe_router_score_function', 'softmax') == 'sigmoid':
         res['moe_router_enable_expert_bias'] = True
     res['moe_layer_freq'] = f'[0]*{first_k_dense_replace}+[1]*{res["num_layers"] - first_k_dense_replace}'
+    if architectures == 'HunYuanMoEV1ForCausalLM':
+        # Since HunYuan's attention applies RoPE before q/k_layernorm,
+        # which is incompatible with megatron-core, support is not provided here.
+        res['qk_layernorm'] = False
+        for key in ['moe_ffn_hidden_size', 'n_shared_experts', 'moe_router_topk']:
+            val = res.get(key)
+            if isinstance(val, list) and val and min(val) == max(val):
+                res[key] = val[0]
+        n_shared_experts = res.pop('n_shared_experts')
+        res['moe_shared_expert_intermediate_size'] = n_shared_experts * res['moe_ffn_hidden_size']
     return res
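The HunYuan branch copes with per-layer list values in the HF config: a uniform list collapses to a scalar, and the shared-expert width Megatron expects is derived as `num_shared_expert * moe_intermediate_size`. A worked run with made-up uniform values:

```python
res = {
    'moe_ffn_hidden_size': [3072, 3072, 3072],
    'n_shared_experts': [1, 1, 1],
    'moe_router_topk': [8, 8, 8],
}
for key in ['moe_ffn_hidden_size', 'n_shared_experts', 'moe_router_topk']:
    val = res.get(key)
    if isinstance(val, list) and val and min(val) == max(val):
        res[key] = val[0]  # uniform per-layer list -> scalar
n_shared_experts = res.pop('n_shared_experts')
res['moe_shared_expert_intermediate_size'] = n_shared_experts * res['moe_ffn_hidden_size']
assert res == {'moe_ffn_hidden_size': 3072, 'moe_router_topk': 8,
               'moe_shared_expert_intermediate_size': 3072}
```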
