support telechat2 (#2210)

Jintao-Huang · web-flow · commit 1658ccb62c8b · 2024-10-09T15:55:23.000+08:00
diff --git a/docs/source/Instruction/支持的模型和数据集.md b/docs/source/Instruction/支持的模型和数据集.md
@@ -405,8 +405,9 @@
 |mamba-2.8b|[AI-ModelScope/mamba-2.8b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-2.8b-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|&#x2718;|&#x2718;|&#x2718;|&#x2718;|transformers>=4.39.0|-|[state-spaces/mamba-2.8b-hf](https://huggingface.co/state-spaces/mamba-2.8b-hf)|
 |telechat-7b|[TeleAI/TeleChat-7B](https://modelscope.cn/models/TeleAI/TeleChat-7B/summary)|key_value, query|telechat|&#x2714;|&#x2718;|&#x2718;|&#x2718;||-|[Tele-AI/telechat-7B](https://huggingface.co/Tele-AI/telechat-7B)|
 |telechat-12b|[TeleAI/TeleChat-12B](https://modelscope.cn/models/TeleAI/TeleChat-12B/summary)|key_value, query|telechat|&#x2714;|&#x2718;|&#x2718;|&#x2718;||-|[Tele-AI/TeleChat-12B](https://huggingface.co/Tele-AI/TeleChat-12B)|
-|telechat-12b-v2|[TeleAI/TeleChat-12B-v2](https://modelscope.cn/models/TeleAI/TeleChat-12B-v2/summary)|key_value, query|telechat-v2|&#x2714;|&#x2718;|&#x2718;|&#x2718;||-|[Tele-AI/TeleChat-12B-v2](https://huggingface.co/Tele-AI/TeleChat-12B-v2)|
-|telechat-12b-v2-gptq-int4|[swift/TeleChat-12B-V2-GPTQ-Int4](https://modelscope.cn/models/swift/TeleChat-12B-V2-GPTQ-Int4/summary)|key_value, query|telechat-v2|&#x2714;|&#x2718;|&#x2718;|&#x2718;|auto_gptq>=0.5|-|-|
+|telechat-12b-v2|[TeleAI/TeleChat-12B-v2](https://modelscope.cn/models/TeleAI/TeleChat-12B-v2/summary)|key_value, query|telechat|&#x2714;|&#x2718;|&#x2718;|&#x2718;||-|[Tele-AI/TeleChat-12B-v2](https://huggingface.co/Tele-AI/TeleChat-12B-v2)|
+|telechat-12b-v2-gptq-int4|[swift/TeleChat-12B-V2-GPTQ-Int4](https://modelscope.cn/models/swift/TeleChat-12B-V2-GPTQ-Int4/summary)|key_value, query|telechat|&#x2714;|&#x2718;|&#x2718;|&#x2718;|auto_gptq>=0.5|-|-|
+|telechat2-115b|[TeleAI/TeleChat2-115B](https://modelscope.cn/models/TeleAI/TeleChat2-115B/summary)|key_value, query|telechat2|&#x2714;|&#x2718;|&#x2718;|&#x2718;||-|[Tele-AI/TeleChat2-115B](https://huggingface.co/Tele-AI/TeleChat2-115B)|
 |grok-1|[colossalai/grok-1-pytorch](https://modelscope.cn/models/colossalai/grok-1-pytorch/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;|&#x2718;|&#x2718;||-|[hpcai-tech/grok-1](https://huggingface.co/hpcai-tech/grok-1)|
 |dbrx-instruct|[AI-ModelScope/dbrx-instruct](https://modelscope.cn/models/AI-ModelScope/dbrx-instruct/summary)|attn.Wqkv|dbrx|&#x2714;|&#x2714;|&#x2718;|&#x2718;|transformers>=4.36|moe|[databricks/dbrx-instruct](https://huggingface.co/databricks/dbrx-instruct)|
 |dbrx-base|[AI-ModelScope/dbrx-base](https://modelscope.cn/models/AI-ModelScope/dbrx-base/summary)|attn.Wqkv|dbrx|&#x2714;|&#x2714;|&#x2718;|&#x2718;|transformers>=4.36|moe|[databricks/dbrx-base](https://huggingface.co/databricks/dbrx-base)|
diff --git a/docs/source_en/Instruction/Supported-models-datasets.md b/docs/source_en/Instruction/Supported-models-datasets.md
@@ -405,8 +405,9 @@ The table below introduces all models supported by SWIFT:
 |mamba-2.8b|[AI-ModelScope/mamba-2.8b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-2.8b-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|&#x2718;|&#x2718;|&#x2718;|&#x2718;|transformers>=4.39.0|-|[state-spaces/mamba-2.8b-hf](https://huggingface.co/state-spaces/mamba-2.8b-hf)|
 |telechat-7b|[TeleAI/TeleChat-7B](https://modelscope.cn/models/TeleAI/TeleChat-7B/summary)|key_value, query|telechat|&#x2714;|&#x2718;|&#x2718;|&#x2718;||-|[Tele-AI/telechat-7B](https://huggingface.co/Tele-AI/telechat-7B)|
 |telechat-12b|[TeleAI/TeleChat-12B](https://modelscope.cn/models/TeleAI/TeleChat-12B/summary)|key_value, query|telechat|&#x2714;|&#x2718;|&#x2718;|&#x2718;||-|[Tele-AI/TeleChat-12B](https://huggingface.co/Tele-AI/TeleChat-12B)|
-|telechat-12b-v2|[TeleAI/TeleChat-12B-v2](https://modelscope.cn/models/TeleAI/TeleChat-12B-v2/summary)|key_value, query|telechat-v2|&#x2714;|&#x2718;|&#x2718;|&#x2718;||-|[Tele-AI/TeleChat-12B-v2](https://huggingface.co/Tele-AI/TeleChat-12B-v2)|
-|telechat-12b-v2-gptq-int4|[swift/TeleChat-12B-V2-GPTQ-Int4](https://modelscope.cn/models/swift/TeleChat-12B-V2-GPTQ-Int4/summary)|key_value, query|telechat-v2|&#x2714;|&#x2718;|&#x2718;|&#x2718;|auto_gptq>=0.5|-|-|
+|telechat-12b-v2|[TeleAI/TeleChat-12B-v2](https://modelscope.cn/models/TeleAI/TeleChat-12B-v2/summary)|key_value, query|telechat|&#x2714;|&#x2718;|&#x2718;|&#x2718;||-|[Tele-AI/TeleChat-12B-v2](https://huggingface.co/Tele-AI/TeleChat-12B-v2)|
+|telechat-12b-v2-gptq-int4|[swift/TeleChat-12B-V2-GPTQ-Int4](https://modelscope.cn/models/swift/TeleChat-12B-V2-GPTQ-Int4/summary)|key_value, query|telechat|&#x2714;|&#x2718;|&#x2718;|&#x2718;|auto_gptq>=0.5|-|-|
+|telechat2-115b|[TeleAI/TeleChat2-115B](https://modelscope.cn/models/TeleAI/TeleChat2-115B/summary)|key_value, query|telechat2|&#x2714;|&#x2718;|&#x2718;|&#x2718;||-|[Tele-AI/TeleChat2-115B](https://huggingface.co/Tele-AI/TeleChat2-115B)|
 |grok-1|[colossalai/grok-1-pytorch](https://modelscope.cn/models/colossalai/grok-1-pytorch/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;|&#x2718;|&#x2718;||-|[hpcai-tech/grok-1](https://huggingface.co/hpcai-tech/grok-1)|
 |dbrx-instruct|[AI-ModelScope/dbrx-instruct](https://modelscope.cn/models/AI-ModelScope/dbrx-instruct/summary)|attn.Wqkv|dbrx|&#x2714;|&#x2714;|&#x2718;|&#x2718;|transformers>=4.36|moe|[databricks/dbrx-instruct](https://huggingface.co/databricks/dbrx-instruct)|
 |dbrx-base|[AI-ModelScope/dbrx-base](https://modelscope.cn/models/AI-ModelScope/dbrx-base/summary)|attn.Wqkv|dbrx|&#x2714;|&#x2714;|&#x2718;|&#x2718;|transformers>=4.36|moe|[databricks/dbrx-base](https://huggingface.co/databricks/dbrx-base)|
diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
@@ -599,6 +599,7 @@ class ModelType:
     telechat_12b = 'telechat-12b'
     telechat_12b_v2 = 'telechat-12b-v2'
     telechat_12b_v2_gptq_int4 = 'telechat-12b-v2-gptq-int4'
+    telechat2_115b = 'telechat2-115b'
     # grok-1
     grok_1 = 'grok-1'
     # dbrx
@@ -930,6 +931,14 @@ def _new_forward(self, x):
     support_vllm=True,
     support_flash_attn=True,
     hf_model_id='CohereForAI/c4ai-command-r-plus')
+@register_model(
+    ModelType.telechat2_115b,
+    'TeleAI/TeleChat2-115B',
+    LoRATM.telechat,
+    TemplateType.telechat2,
+    torch_dtype=torch.float16,
+    support_flash_attn=True,
+    hf_model_id='Tele-AI/TeleChat2-115B')
 def get_model_tokenizer_from_repo(model_dir: str,
                                   torch_dtype: Optional[torch.dtype],
                                   model_kwargs: Dict[str, Any],
@@ -5829,17 +5838,18 @@ def get_model_tokenizer_codellama(model_dir: str,
     ModelType.telechat_12b_v2,
     'TeleAI/TeleChat-12B-v2',
     LoRATM.telechat,
-    TemplateType.telechat_v2,
+    TemplateType.telechat,
     eos_token=2,
     support_flash_attn=True,
     hf_model_id='Tele-AI/TeleChat-12B-v2')
 @register_model(
     ModelType.telechat_12b_v2_gptq_int4,
     'swift/TeleChat-12B-V2-GPTQ-Int4',
     LoRATM.telechat,
-    TemplateType.telechat_v2,
+    TemplateType.telechat,
     eos_token=2,
     requires=['auto_gptq>=0.5'],
+    torch_dtype=torch.float16,
     support_flash_attn=True,
     function_kwargs={'gptq_bits': 4})
 def get_model_tokenizer_phi(model_dir: str,
diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
@@ -138,7 +138,7 @@ class TemplateType:
     phi3 = 'phi3'
     phi3_vl = 'phi3-vl'
     telechat = 'telechat'
-    telechat_v2 = 'telechat-v2'
+    telechat2 = 'telechat2'
     dbrx = 'dbrx'
     mengzi = 'mengzi'
     c4ai = 'c4ai'
@@ -3448,7 +3448,7 @@ class MiniCPMV2_5Template(Llama3TemplateMixin, MiniCPMVTemplate):
 
 register_template(TemplateType.telechat, Template([], ['<_user>{{QUERY}}<_bot>'], ['<_end>'], ['<_end>']))
 
-register_template(TemplateType.telechat_v2, Template([], ['<_user> {{QUERY}}<_bot>'], [], ['<_end>']))
+register_template(TemplateType.telechat2, Template(['<_start>'], [[4], '{{QUERY}}', [5]], ['<_end>'], ['<_end>']))
 
 DBRX_SYSTEM = (
     'You are DBRX, created by Databricks. You were last updated in December 2023. '