support more models (#971)

hjh0119 · web-flow · commit 14a528341169 · 2024-05-22T11:20:37.000+08:00
diff --git a/README.md b/README.md
@@ -45,6 +45,8 @@ Additionally, we are expanding capabilities for other modalities. Currently, we
 SWIFT has rich documentations for users, please check [here](https://github.com/modelscope/swift/tree/main/docs/source_en/LLM).
 
 ## 🎉 News
+- 2024.05.22: Support for the DeepSeek-V2-Lite series models. The model_type values are `deepseek-v2-lite` and `deepseek-v2-lite-chat`.
+- 2024.05.22: Support for the TeleChat-12B-v2 model and its quantized version. The model_type values are `telechat-12b-v2` and `telechat-12b-v2-gptq-int4`.
 - 🔥2024.05.21: Inference and fine-tuning support for MiniCPM-Llama3-V-2_5 are now available. For more details, please refer to [minicpm-v-2.5 Best Practice](docs/source/Multi-Modal/minicpm-v-2.5最佳实践.md).
 - 🔥2024.05.20: Support for inferencing and fine-tuning cogvlm2-llama3-chinese-chat-19B, cogvlm2-llama3-chat-19B. you can refer to [cogvlm2 Best Practice](docs/source_en/Multi-Modal/cogvlm2-best-practice.md).
 - 🔥2024.05.17: Support peft=0.11.0. Meanwhile support 3 new tuners: `BOFT`, `Vera` and `Pissa`. use `--sft_type boft/vera` to use BOFT or Vera, use `--init_lora_weights pissa` with `--sft_type lora` to use Pissa.
@@ -482,7 +484,7 @@ The complete list of supported models and datasets can be found at [Supported Mo
 | Mistral<br>Mixtral                            | [Mistral series models](https://github.com/mistralai/mistral-src)       | English            | 7B-22B     | base model<br>instruct model<br>MoE model                     |
 | Yi<br>Yi1.5                                      | [01AI's YI series models](https://github.com/01-ai)                     | Chinese<br>English    | 6B-34B<br>including quantized             | base model<br>chat model<br>long text model            |
 | InternLM<br>InternLM2<br>InternLM2-Math              | [Pujiang AI Lab InternLM series models](https://github.com/InternLM/InternLM) | Chinese<br>English | 1.8B-20B                            | base model<br>chat model<br>math model            |
-| DeepSeek<br>DeepSeek-MoE<br>DeepSeek-Coder<br>DeepSeek-Math          | [DeepSeek series models](https://github.com/deepseek-ai)       | Chinese<br>English    | 1.3B-236B                               | base model<br>chat model<br>MoE model<br>code model<br>math model |
+| DeepSeek<br>DeepSeek-MoE<br>DeepSeek-Coder<br>DeepSeek-Math<br>DeepSeek-V2          | [DeepSeek series models](https://github.com/deepseek-ai)       | Chinese<br>English    | 1.3B-236B                               | base model<br>chat model<br>MoE model<br>code model<br>math model |
 | MAMBA                                          | [MAMBA temporal convolution model](https://github.com/state-spaces/mamba) | English          | 130M-2.8B                              | base model                                 |
 | Gemma                                          | [Google Gemma series models](https://github.com/google/gemma_pytorch)   | English            | 2B-7B                                  | base model<br>instruct model                       |
 | MiniCPM                                        | [OpenBmB MiniCPM series models](https://github.com/OpenBMB/MiniCPM)     | Chinese<br>English    | 2B-3B                                  | chat model<br>MoE model                                 |
diff --git a/README_CN.md b/README_CN.md
@@ -46,6 +46,8 @@ SWIFT支持近**200种LLM和MLLM**（多模态大模型）的训练、推理、
 SWIFT具有丰富的文档体系，如有使用问题请请查看[这里](https://github.com/modelscope/swift/tree/main/docs/source/LLM).
 
 ## 🎉 新闻
+- 2024.05.22: 支持DeepSeek-V2-Lite系列模型, model_type为 `deepseek-v2-lite`和`deepseek-v2-lite-chat`
+- 2024.05.22: 支持TeleChat-12B-v2模型和量化版本, model_type为 `telechat-12b-v2`和`telechat-12b-v2-gptq-int4`
 - 🔥2024.05.21: 支持 MiniCPM-Llama3-V-2_5 的推理与微调, 可以查看[minicpm-v-2.5最佳实践](docs/source/Multi-Modal/minicpm-v-2.5最佳实践.md).
 - 🔥2024.05.20: 支持 cogvlm2-llama3-chinese-chat-19B, cogvlm2-llama3-chat-19B 的推理与微调, 可以查看[cogvlm2最佳实践](docs/source/Multi-Modal/cogvlm2最佳实践.md).
 - 🔥2024.05.17: 支持peft=0.11.0. 同时支持了三个新的tuner方法： `BOFT`, `Vera` 和 `Pissa`. 使用 `--sft_type boft/vera` 开启BOFT或者Vera, 使用 `--init_lora_weights pissa` 以及 `--sft_type lora` 来使用 Pissa.
@@ -481,7 +483,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \
 | Mistral<br>Mixtral                                 | [Mistral系列模型](https://github.com/mistralai/mistral-src)  | 英文       | 7B-8x22B | base模型<br>instruct模型<br>MoE模型             |
 | Yi<br>Yi1.5                                    | [01AI的YI系列模型](https://github.com/01-ai)                 | 中文<br>英文 | 6B-34B<br>包含量化版本          | base模型<br>chat模型<br>长文本模型                 |
 | InternLM<br>InternLM2<br>InternLM2-Math                   | [浦江实验室书生浦语系列模型](https://github.com/InternLM/InternLM) | 中文<br>英文 | 1.8B-20B                  | base模型<br>chat模型<br>数学模型                  |
-| DeepSeek<br>DeepSeek-MoE<br>DeepSeek-Coder<br>DeepSeek-Math               | [幻方系列模型](https://github.com/deepseek-ai)               | 中文<br>英文 | 1.3B-236B                  | base模型<br>chat模型<br>MoE模型<br>代码模型<br>数学模型 |
+| DeepSeek<br>DeepSeek-MoE<br>DeepSeek-Coder<br>DeepSeek-Math<br>DeepSeek-V2       | [幻方系列模型](https://github.com/deepseek-ai)               | 中文<br>英文 | 1.3B-236B                  | base模型<br>chat模型<br>MoE模型<br>代码模型<br>数学模型 |
 | MAMBA                                               | [MAMBA时序卷积模型](https://github.com/state-spaces/mamba)   | 英文       | 130M-2.8B                 | base模型                                    |
 | Gemma                                               | [Google Gemma系列模型](https://github.com/google/gemma_pytorch) | 英文       | 2B-7B                     | base模型<br>instruct模型                      |
 | MiniCPM                                             | [OpenBmB MiniCPM系列模型](https://github.com/OpenBMB/MiniCPM) | 中文<br>英文 | 2B-3B                     | chat模型<br>MoE模型                                    |
diff --git a/docs/source/LLM/支持的模型和数据集.md b/docs/source/LLM/支持的模型和数据集.md
@@ -194,6 +194,8 @@
 |deepseek-vl-1_3b-chat|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|&#x2714;|&#x2718;|attrdict|multi-modal, vision|[deepseek-ai/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-1.3b-chat)|
 |deepseek-vl-7b-chat|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|&#x2714;|&#x2718;|attrdict|multi-modal, vision|[deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)|
 |deepseek-v2-chat|[deepseek-ai/DeepSeek-V2-Chat](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2-Chat/summary)|q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj, o_proj|deepseek2|&#x2714;|&#x2714;|transformers>=4.39.3|-|[deepseek-ai/DeepSeek-V2-Chat](https://huggingface.co/deepseek-ai/DeepSeek-V2-Chat)|
+|deepseek-v2-lite|[deepseek-ai/DeepSeek-V2-Lite](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2-Lite/summary)|q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj, o_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.39.3|-|[deepseek-ai/DeepSeek-V2-Lite](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite)|
+|deepseek-v2-lite-chat|[deepseek-ai/DeepSeek-V2-Lite-Chat](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2-Lite-Chat/summary)|q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj, o_proj|deepseek2|&#x2714;|&#x2714;|transformers>=4.39.3|-|[deepseek-ai/DeepSeek-V2-Lite-Chat](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat)|
 |gemma-2b|[AI-ModelScope/gemma-2b](https://modelscope.cn/models/AI-ModelScope/gemma-2b/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.38|-|[google/gemma-2b](https://huggingface.co/google/gemma-2b)|
 |gemma-7b|[AI-ModelScope/gemma-7b](https://modelscope.cn/models/AI-ModelScope/gemma-7b/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.38|-|[google/gemma-7b](https://huggingface.co/google/gemma-7b)|
 |gemma-2b-instruct|[AI-ModelScope/gemma-2b-it](https://modelscope.cn/models/AI-ModelScope/gemma-2b-it/summary)|q_proj, k_proj, v_proj|gemma|&#x2714;|&#x2714;|transformers>=4.38|-|[google/gemma-2b-it](https://huggingface.co/google/gemma-2b-it)|
@@ -284,6 +286,8 @@
 |mamba-2.8b|[AI-ModelScope/mamba-2.8b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-2.8b-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|&#x2718;|&#x2718;|transformers>=4.39.0|-|[state-spaces/mamba-2.8b-hf](https://huggingface.co/state-spaces/mamba-2.8b-hf)|
 |telechat-7b|[TeleAI/TeleChat-7B](https://modelscope.cn/models/TeleAI/TeleChat-7B/summary)|key_value, query|telechat|&#x2714;|&#x2718;||-|[Tele-AI/telechat-7B](https://huggingface.co/Tele-AI/telechat-7B)|
 |telechat-12b|[TeleAI/TeleChat-12B](https://modelscope.cn/models/TeleAI/TeleChat-12B/summary)|key_value, query|telechat|&#x2714;|&#x2718;||-|[Tele-AI/TeleChat-12B](https://huggingface.co/Tele-AI/TeleChat-12B)|
+|telechat-12b-v2|[TeleAI/TeleChat-12B-v2](https://modelscope.cn/models/TeleAI/TeleChat-12B-v2/summary)|key_value, query|telechat-v2|&#x2714;|&#x2718;||-|[Tele-AI/TeleChat-12B-v2](https://huggingface.co/Tele-AI/TeleChat-12B-v2)|
+|telechat-12b-v2-gptq-int4|[swift/TeleChat-12B-V2-GPTQ-Int4](https://modelscope.cn/models/swift/TeleChat-12B-V2-GPTQ-Int4/summary)|key_value, query|telechat-v2|&#x2714;|&#x2718;|auto_gptq>=0.5|-|-|
 |grok-1|[colossalai/grok-1-pytorch](https://modelscope.cn/models/colossalai/grok-1-pytorch/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|[hpcai-tech/grok-1](https://huggingface.co/hpcai-tech/grok-1)|
 |dbrx-instruct|[AI-ModelScope/dbrx-instruct](https://modelscope.cn/models/AI-ModelScope/dbrx-instruct/summary)|attn.Wqkv|dbrx|&#x2714;|&#x2714;|transformers>=4.36|-|[databricks/dbrx-instruct](https://huggingface.co/databricks/dbrx-instruct)|
 |dbrx-base|[AI-ModelScope/dbrx-base](https://modelscope.cn/models/AI-ModelScope/dbrx-base/summary)|attn.Wqkv|dbrx|&#x2714;|&#x2714;|transformers>=4.36|-|[databricks/dbrx-base](https://huggingface.co/databricks/dbrx-base)|
diff --git a/docs/source_en/LLM/Supported-models-datasets.md b/docs/source_en/LLM/Supported-models-datasets.md
@@ -194,6 +194,8 @@ The table below introcudes all models supported by SWIFT:
 |deepseek-vl-1_3b-chat|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|&#x2714;|&#x2718;|attrdict|multi-modal, vision|[deepseek-ai/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-1.3b-chat)|
 |deepseek-vl-7b-chat|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|&#x2714;|&#x2718;|attrdict|multi-modal, vision|[deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)|
 |deepseek-v2-chat|[deepseek-ai/DeepSeek-V2-Chat](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2-Chat/summary)|q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj, o_proj|deepseek2|&#x2714;|&#x2714;|transformers>=4.39.3|-|[deepseek-ai/DeepSeek-V2-Chat](https://huggingface.co/deepseek-ai/DeepSeek-V2-Chat)|
+|deepseek-v2-lite|[deepseek-ai/DeepSeek-V2-Lite](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2-Lite/summary)|q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj, o_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.39.3|-|[deepseek-ai/DeepSeek-V2-Lite](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite)|
+|deepseek-v2-lite-chat|[deepseek-ai/DeepSeek-V2-Lite-Chat](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2-Lite-Chat/summary)|q_a_proj, q_b_proj, kv_a_proj_with_mqa, kv_b_proj, o_proj|deepseek2|&#x2714;|&#x2714;|transformers>=4.39.3|-|[deepseek-ai/DeepSeek-V2-Lite-Chat](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat)|
 |gemma-2b|[AI-ModelScope/gemma-2b](https://modelscope.cn/models/AI-ModelScope/gemma-2b/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.38|-|[google/gemma-2b](https://huggingface.co/google/gemma-2b)|
 |gemma-7b|[AI-ModelScope/gemma-7b](https://modelscope.cn/models/AI-ModelScope/gemma-7b/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.38|-|[google/gemma-7b](https://huggingface.co/google/gemma-7b)|
 |gemma-2b-instruct|[AI-ModelScope/gemma-2b-it](https://modelscope.cn/models/AI-ModelScope/gemma-2b-it/summary)|q_proj, k_proj, v_proj|gemma|&#x2714;|&#x2714;|transformers>=4.38|-|[google/gemma-2b-it](https://huggingface.co/google/gemma-2b-it)|
@@ -284,6 +286,8 @@ The table below introcudes all models supported by SWIFT:
 |mamba-2.8b|[AI-ModelScope/mamba-2.8b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-2.8b-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|&#x2718;|&#x2718;|transformers>=4.39.0|-|[state-spaces/mamba-2.8b-hf](https://huggingface.co/state-spaces/mamba-2.8b-hf)|
 |telechat-7b|[TeleAI/TeleChat-7B](https://modelscope.cn/models/TeleAI/TeleChat-7B/summary)|key_value, query|telechat|&#x2714;|&#x2718;||-|[Tele-AI/telechat-7B](https://huggingface.co/Tele-AI/telechat-7B)|
 |telechat-12b|[TeleAI/TeleChat-12B](https://modelscope.cn/models/TeleAI/TeleChat-12B/summary)|key_value, query|telechat|&#x2714;|&#x2718;||-|[Tele-AI/TeleChat-12B](https://huggingface.co/Tele-AI/TeleChat-12B)|
+|telechat-12b-v2|[TeleAI/TeleChat-12B-v2](https://modelscope.cn/models/TeleAI/TeleChat-12B-v2/summary)|key_value, query|telechat-v2|&#x2714;|&#x2718;||-|[Tele-AI/TeleChat-12B-v2](https://huggingface.co/Tele-AI/TeleChat-12B-v2)|
+|telechat-12b-v2-gptq-int4|[swift/TeleChat-12B-V2-GPTQ-Int4](https://modelscope.cn/models/swift/TeleChat-12B-V2-GPTQ-Int4/summary)|key_value, query|telechat-v2|&#x2714;|&#x2718;|auto_gptq>=0.5|-|-|
 |grok-1|[colossalai/grok-1-pytorch](https://modelscope.cn/models/colossalai/grok-1-pytorch/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|[hpcai-tech/grok-1](https://huggingface.co/hpcai-tech/grok-1)|
 |dbrx-instruct|[AI-ModelScope/dbrx-instruct](https://modelscope.cn/models/AI-ModelScope/dbrx-instruct/summary)|attn.Wqkv|dbrx|&#x2714;|&#x2714;|transformers>=4.36|-|[databricks/dbrx-instruct](https://huggingface.co/databricks/dbrx-instruct)|
 |dbrx-base|[AI-ModelScope/dbrx-base](https://modelscope.cn/models/AI-ModelScope/dbrx-base/summary)|attn.Wqkv|dbrx|&#x2714;|&#x2714;|transformers>=4.36|-|[databricks/dbrx-base](https://huggingface.co/databricks/dbrx-base)|
diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
@@ -244,6 +244,8 @@ class ModelType:
     deepseek_vl_7b_chat = 'deepseek-vl-7b-chat'
     # deepseek-v2
     deepseek_v2_chat = 'deepseek-v2-chat'
+    deepseek_v2_lite = 'deepseek-v2-lite'
+    deepseek_v2_lite_chat = 'deepseek-v2-lite-chat'
     # gemma
     gemma_2b = 'gemma-2b'
     gemma_7b = 'gemma-7b'
@@ -358,6 +360,8 @@ class ModelType:
     # teleAI
     telechat_7b = 'telechat-7b'
     telechat_12b = 'telechat-12b'
+    telechat_12b_v2 = 'telechat-12b-v2'
+    telechat_12b_v2_gptq_int4 = 'telechat-12b-v2-gptq-int4'
     # grok-1
     grok_1 = 'grok-1'
     # dbrx
@@ -2582,6 +2586,26 @@ def get_model_tokenizer_internlm2(model_dir: str,
     return model, tokenizer
 
 
+@register_model(
+    ModelType.deepseek_v2_lite,
+    'deepseek-ai/DeepSeek-V2-Lite',
+    LoRATM.deepseek2,
+    TemplateType.default_generation,
+    support_gradient_checkpointing=False,
+    support_flash_attn=True,
+    support_vllm=True,
+    requires=['transformers>=4.39.3'],
+    hf_model_id='deepseek-ai/DeepSeek-V2-Lite')
+@register_model(
+    ModelType.deepseek_v2_lite_chat,
+    'deepseek-ai/DeepSeek-V2-Lite-Chat',
+    LoRATM.deepseek2,
+    TemplateType.deepseek2,
+    support_gradient_checkpointing=False,
+    support_flash_attn=True,
+    support_vllm=True,
+    requires=['transformers>=4.39.3'],
+    hf_model_id='deepseek-ai/DeepSeek-V2-Lite-Chat')
 @register_model(
     ModelType.deepseek_v2_chat,
     'deepseek-ai/DeepSeek-V2-Chat',
@@ -3644,6 +3668,23 @@ def get_model_tokenizer_codellama(model_dir: str,
     TemplateType.telechat,
     support_flash_attn=True,
     hf_model_id='Tele-AI/TeleChat-12B')
+@register_model(
+    ModelType.telechat_12b_v2,
+    'TeleAI/TeleChat-12B-v2',
+    LoRATM.telechat,
+    TemplateType.telechat_v2,
+    eos_token=2,
+    support_flash_attn=True,
+    hf_model_id='Tele-AI/TeleChat-12B-v2')
+@register_model(
+    ModelType.telechat_12b_v2_gptq_int4,
+    'swift/TeleChat-12B-V2-GPTQ-Int4',
+    LoRATM.telechat,
+    TemplateType.telechat_v2,
+    eos_token=2,
+    requires=['auto_gptq>=0.5'],
+    support_flash_attn=True,
+    function_kwargs={'gptq_bits': 4})
 def get_model_tokenizer_phi(model_dir: str,
                             torch_dtype: Dtype,
                             model_kwargs: Dict[str, Any],
diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
@@ -76,6 +76,7 @@ class TemplateType:
     atom = 'atom'
     phi3 = 'phi3'
     telechat = 'telechat'
+    telechat_v2 = 'telechat-v2'
     dbrx = 'dbrx'
     mengzi = 'mengzi'
     c4ai = 'c4ai'
@@ -1393,6 +1394,8 @@ def get_generate_ids(generate_ids: Tensor, input_token_len: int) -> List[int]:
 
 register_template(TemplateType.telechat, Template([], ['<_user>{{QUERY}}<_bot>'], ['<_end>'], ['<_end>']))
 
+register_template(TemplateType.telechat_v2, Template([], ['<_user> {{QUERY}}<_bot>'], [], ['<_end>']))
+
 DBRX_SYSTEM = (
     'You are DBRX, created by Databricks. You were last updated in December 2023. '
     'You answer questions based on information available up to that point.\n'