support Mengzi-13b-base model (#646)

hjh0119 · jinghan · web-flow · commit 83246ea503c7 · 2024-04-02T21:50:22.000+08:00
Co-authored-by: jinghan <jinghan@U-Y092T109-2224.local>
diff --git a/README.md b/README.md
@@ -39,6 +39,7 @@ To facilitate use by users unfamiliar with deep learning, we provide a Gradio we
 Additionally, we are expanding capabilities for other modalities. Currently, we support full-parameter training and LoRA training for AnimateDiff.
 
 ## 🎉 News
+- 🔥2024.04.02: Support the fine-tuning and inference of Mengzi3-13B-Base model, use [this script](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/scripts/mengzi3_13b_base/lora_ddp_ds/sft.sh) to start training!
 - 🔥2024.04.01: Support **dbrx** series: dbrx-base and dbrx-instruct, use [this script](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/scripts/dbrx-instruct/lora_mp/sft.sh) to start training!
 - 🔥2024.03.29: Support **Qwen1.5-MoE** series: Qwen1.5-MoE-A2.7B, Qwen1.5-MoE-A2.7B-Chat, Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4.
 - 🔥2024.03.29: Support the fine-tuning and inference of **Grok-1** 300B MoE, please view details [here](https://github.com/modelscope/swift/tree/main/docs/source_en/LLM/Grok-1-best-practice.md).
@@ -398,6 +399,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \
 | Grok | [X-ai](https://github.com/xai-org/grok-1) | English | 300B | base model |
 | TeleChat | [Tele-AI](https://github.com/Tele-AI/Telechat) | Chinese<br>English | 7B-12B | chat model |
 | dbrx | [databricks](https://github.com/databricks/dbrx) | English | 132B | base model<br>chat model  |
+| mengzi3 | [Langboat](https://github.com/Langboat/Mengzi3) | Chinese<br>English | 13B | base model  |
 
 
 #### MLLMs
diff --git a/README_CN.md b/README_CN.md
@@ -40,6 +40,7 @@ SWIFT支持近**200种LLM和MLLM**（多模态大模型）的训练、推理、
 此外，我们也在拓展其他模态的能力，目前我们支持了AnimateDiff的全参数训练和LoRA训练。
 
 ## 🎉 新闻
+- 🔥2024.04.02: 支持Mengzi3-13B-Base模型的推理与微调, 使用[这个脚本](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/scripts/mengzi3_13b_base/lora_ddp_ds/sft.sh)来开始训练！
 - 🔥2024.04.01: 支持**dbrx**系列, dbrx-base和dbrx-instruct, 使用[这个脚本](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/scripts/dbrx-instruct/lora_mp/sft.sh)来开始训练！
 - 🔥2024.03.29: 支持**Qwen1.5-MoE**系列: Qwen1.5-MoE-A2.7B, Qwen1.5-MoE-A2.7B-Chat, Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4.
 - 🔥2024.03.29: 支持**Grok-1**300B MoE模型的推理与微调, 最佳实践可以查看[这里](https://github.com/modelscope/swift/tree/main/docs/source/LLM/Grok训练和推理.md).
@@ -397,6 +398,8 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \
 | Grok | [X-ai](https://github.com/xai-org/grok-1) | 英文       | 300B | base模型                                    |
 | TeleChat | [Tele-AI](https://github.com/Tele-AI/Telechat) | 中文<br>英文 | 7B-12B | chat模型                                    |
 | dbrx | [databricks](https://github.com/databricks/dbrx) | 英文 | 132B | base模型<br>chat模型  |
+| mengzi3 | [Langboat](https://github.com/Langboat/Mengzi3) | 中文<br>英文 | 13B | base模型  |
+
 
 #### 多模态大模型
 
diff --git a/docs/source/LLM/支持的模型和数据集.md b/docs/source/LLM/支持的模型和数据集.md
@@ -206,6 +206,7 @@
 |grok-1|[colossalai/grok-1-pytorch](https://modelscope.cn/models/colossalai/grok-1-pytorch/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|
 |dbrx-instruct|[AI-ModelScope/dbrx-instruct](https://modelscope.cn/models/AI-ModelScope/dbrx-instruct/summary)|attn.Wqkv|dbrx|&#x2714;|&#x2714;|transformers>=4.36|-|
 |dbrx-base|[AI-ModelScope/dbrx-base](https://modelscope.cn/models/AI-ModelScope/dbrx-base/summary)|attn.Wqkv|dbrx|&#x2714;|&#x2714;|transformers>=4.36|-|
+|mengzi3-13b-base|[langboat/Mengzi3-13B-Base](https://modelscope.cn/models/langboat/Mengzi3-13B-Base/summary)|q_proj, k_proj, v_proj|mengzi|&#x2714;|&#x2714;||-|
 
 
 ## 数据集
diff --git a/examples/pytorch/llm/scripts/mengzi3_13b_base/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/mengzi3_13b_base/lora_ddp_ds/infer.sh
@@ -0,0 +1,11 @@
+# Experimental environment: A100
+# 30GB GPU memory
+CUDA_VISIBLE_DEVICES=0 \
+swift infer \
+    --ckpt_dir "output/mengzi3-13b-base/vx-xxx/checkpoint-xxx" \
+    --load_dataset_config true \
+    --temperature 0.1 \
+    --top_p 0.7 \
+    --repetition_penalty 1. \
+    --do_sample true \
+    --merge_lora false
diff --git a/examples/pytorch/llm/scripts/mengzi3_13b_base/lora_ddp_ds/sft.sh b/examples/pytorch/llm/scripts/mengzi3_13b_base/lora_ddp_ds/sft.sh
@@ -0,0 +1,36 @@
+# Experimental environment: 2 * A100
+# 2 * 36GB GPU memory
+nproc_per_node=2
+
+CUDA_VISIBLE_DEVICES=0,1 \
+NPROC_PER_NODE=$nproc_per_node \
+MASTER_PORT=29500 \
+swift sft \
+    --model_id_or_path langboat/Mengzi3-13B-Base \
+    --model_revision master \
+    --sft_type lora \
+    --tuner_backend swift \
+    --dtype bf16 \
+    --output_dir output \
+    --ddp_backend nccl \
+    --dataset dureader-robust-zh \
+    --train_dataset_sample -1 \
+    --num_train_epochs 1 \
+    --max_length 2048 \
+    --check_dataset_strategy warning \
+    --lora_rank 8 \
+    --lora_alpha 32 \
+    --lora_dropout_p 0.05 \
+    --lora_target_modules DEFAULT \
+    --gradient_checkpointing true \
+    --batch_size 1 \
+    --weight_decay 0.1 \
+    --learning_rate 1e-4 \
+    --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
+    --max_grad_norm 0.5 \
+    --warmup_ratio 0.03 \
+    --eval_steps 100 \
+    --save_steps 100 \
+    --save_total_limit 2 \
+    --logging_steps 10 \
+    --deepspeed default-zero2
diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
@@ -275,6 +275,8 @@ class ModelType:
     # dbrx
     dbrx_instruct = 'dbrx-instruct'
     dbrx_base = 'dbrx-base'
+    # mengzi
+    mengzi3_13b_base = 'mengzi3-13b-base'
 
     @classmethod
     def get_model_name_list(cls) -> List[str]:
@@ -437,6 +439,13 @@ def _register_model(
     TemplateType.default_generation,
     requires=['transformers<4.34'],
     support_vllm=True)
+@register_model(
+    ModelType.mengzi3_13b_base,
+    'langboat/Mengzi3-13B-Base',
+    LoRATM.llama2,
+    TemplateType.mengzi,
+    support_vllm=True,
+    support_flash_attn=True)
 def get_model_tokenizer_from_repo(model_dir: str,
                                   torch_dtype: Optional[Dtype],
                                   model_kwargs: Dict[str, Any],
diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
@@ -60,6 +60,7 @@ class TemplateType:
     chatml = 'chatml'
     telechat = 'telechat'
     dbrx = 'dbrx'
+    mengzi = 'mengzi'
 
     @classmethod
     def get_template_name_list(cls) -> List[str]:
@@ -1220,6 +1221,11 @@ def get_generate_ids(generate_ids: Tensor,
         ['<|im_end|>\n'], ['<|im_end|>'], DBRX_SYSTEM,
         ['<|im_start|>system\n{{SYSTEM}}<|im_end|>\n']))
 
+register_template(
+    TemplateType.mengzi,  # Chinese labels below: 输入 = "Input", 输出 = "Output", 指令 = "Instruction"
+    Template([], ['输入：{{QUERY}}输出：\n'], [], [['eos_token_id']], None,
+             ['指令：{{SYSTEM}}']))
+
 
 def get_template(
     template_type: str,