from megatron.training.initialize import initialize_megatron
from megatron.training.utils import get_ltor_masks_and_position_ids

-from swift.llm import ExportArguments, HfConfigFactory, get_model_tokenizer, get_template, save_checkpoint
+from swift.llm import ExportArguments, HfConfigFactory, get_model_tokenizer, get_template, save_checkpoint, to_device
from swift.utils import get_logger, get_n_params_grads
from ..argument import MegatronArguments
from ..model import get_megatron_model_meta
@@ -87,21 +87,37 @@ def test_convert_precision(hf_model, mg_model, processor, torch_dtype=torch.floa
    _test_params_sum(mg_model)

    template = get_template(hf_model.model_meta.template, processor)
-    input_ids = template.encode({'messages': [{'role': 'user', 'content': 'who are you?'}]})['input_ids']
-    input_ids = torch.tensor(input_ids)[None].to('cuda')
+    template.set_mode('train')
+    inputs = template.encode({
+        'messages': [
+            {
+                'role': 'user',
+                'content': 'Introduction to ms-swift.'
+            },
+            {
+                'role':
+                'assistant',
+                'content':
+                'ms-swift is an official framework provided by the ModelScope community for fine-tuning '
+                'and deploying large language models and multi-modal large models.'
+            },
+        ]
+    })
+    inputs = to_device(template.data_collator([inputs]), 'cuda')

    HfConfigFactory.set_model_config_attr(hf_model, 'use_cache', False)
    share_embedding = mg_model.share_embeddings_and_output_weights
    hf_modules = _find_modules(hf_model)
    with torch.inference_mode(), _model_cpu_forward_context(hf_modules, torch_dtype, share_embedding=share_embedding):
-        hf_logits = hf_model(input_ids).logits
+        hf_logits = hf_model(**inputs).logits
    hf_model = hf_model.to('cpu')

+    input_ids = inputs['input_ids']
    attention_mask, _, position_ids = get_ltor_masks_and_position_ids(input_ids, -100, True, True, True)
    packed_seq_params = None
    mg_torch_dtype = torch_dtype
    # thd
-    # from ..train.utils import get_packed_seq_params
+    # from ..trainers.utils import get_packed_seq_params
    # mg_torch_dtype = None
    # packed_seq_params = get_packed_seq_params(position_ids)
    # attention_mask = None
@@ -115,8 +131,10 @@ def test_convert_precision(hf_model, mg_model, processor, torch_dtype=torch.floa
        position_ids=position_ids,
        packed_seq_params=packed_seq_params)

-    mean_diff = (mg_logits - hf_logits).abs().mean().item()
+    token_mean_diff = (mg_logits - hf_logits).abs().mean(dim=-1)
+    mean_diff = token_mean_diff.mean().item()
    max_diff = (mg_logits - hf_logits).abs().max().item()
+    print(f'token_mean_diff: {token_mean_diff}')
    print(f'mean_diff: {mean_diff}, max_diff: {max_diff} (Please check that mean_diff is less than 0.1).')
    hf_tokens = hf_logits.argmax(-1)
    mg_tokens = mg_logits.argmax(-1)
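For context, the new comparison reduces over the vocabulary dimension first, so each token position gets its own mean absolute difference before the global mean is taken; this makes it easy to see which positions (for example the first assistant tokens) diverge even when the overall mean stays under the 0.1 threshold. A minimal, self-contained sketch of that metric with dummy tensors (shapes and values are illustrative only, not taken from the test):

import torch

torch.manual_seed(0)
# stand-ins for the HF and Megatron forward outputs, shape [batch, seq_len, vocab_size]
hf_logits = torch.randn(1, 8, 32)
mg_logits = hf_logits + 1e-3 * torch.randn_like(hf_logits)

# per-token mean absolute difference, shape [batch, seq_len]
token_mean_diff = (mg_logits - hf_logits).abs().mean(dim=-1)
# scalar summaries used for the "< 0.1" sanity check
mean_diff = token_mean_diff.mean().item()
max_diff = (mg_logits - hf_logits).abs().max().item()
print(f'token_mean_diff: {token_mean_diff}')
print(f'mean_diff: {mean_diff}, max_diff: {max_diff}')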