Skip to content

Commit 83246ea

Browse files
hjh0119 and jinghan authored
support Mengzi-13b-base model (#646)
Co-authored-by: jinghan <[email protected]>
1 parent 190491c commit 83246ea

File tree

7 files changed

+68
-0
lines changed

7 files changed

+68
-0
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ To facilitate use by users unfamiliar with deep learning, we provide a Gradio we
3939
Additionally, we are expanding capabilities for other modalities. Currently, we support full-parameter training and LoRA training for AnimateDiff.
4040

4141
## 🎉 News
42+
- 🔥2024.04.02: Support fine-tuning and inference of the Mengzi3-13B-Base model; use [this script](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/scripts/mengzi3_13b_base/lora_ddp_ds/sft.sh) to start training!
4243
- 🔥2024.04.01: Support **dbrx** series: dbrx-base and dbrx-instruct, use [this script](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/scripts/dbrx-instruct/lora_mp/sft.sh) to start training!
4344
- 🔥2024.03.29: Support **Qwen1.5-MoE** series: Qwen1.5-MoE-A2.7B, Qwen1.5-MoE-A2.7B-Chat, Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4.
4445
- 🔥2024.03.29: Support the fine-tuning and inference of **Grok-1** 300B MoE, please view details [here](https://github.com/modelscope/swift/tree/main/docs/source_en/LLM/Grok-1-best-practice.md).
@@ -398,6 +399,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \
398399
| Grok | [X-ai](https://github.com/xai-org/grok-1) | English | 300B | base model |
399400
| TeleChat | [Tele-AI](https://github.com/Tele-AI/Telechat) | Chinese<br>English | 7B-12B | chat model |
400401
| dbrx | [databricks](https://github.com/databricks/dbrx) | English | 132B | base model<br>chat model |
402+
| mengzi3 | [Langboat](https://github.com/Langboat/Mengzi3) | Chinese<br>English | 13B | base model |
401403

402404

403405
#### MLLMs

README_CN.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ SWIFT支持近**200种LLM和MLLM**(多模态大模型)的训练、推理、
4040
此外,我们也在拓展其他模态的能力,目前我们支持了AnimateDiff的全参数训练和LoRA训练。
4141

4242
## 🎉 新闻
43+
- 🔥2024.04.02: 支持Mengzi3-13B-Base模型的推理与微调, 使用[这个脚本](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/scripts/mengzi3_13b_base/lora_ddp_ds/sft.sh)来开始训练!
4344
- 🔥2024.04.01: 支持**dbrx**系列, dbrx-base和dbrx-instruct, 使用[这个脚本](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/scripts/dbrx-instruct/lora_mp/sft.sh)来开始训练!
4445
- 🔥2024.03.29: 支持**Qwen1.5-MoE**系列: Qwen1.5-MoE-A2.7B, Qwen1.5-MoE-A2.7B-Chat, Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4.
4546
- 🔥2024.03.29: 支持**Grok-1**300B MoE模型的推理与微调, 最佳实践可以查看[这里](https://github.com/modelscope/swift/tree/main/docs/source/LLM/Grok训练和推理.md).
@@ -397,6 +398,8 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \
397398
| Grok | [X-ai](https://github.com/xai-org/grok-1) | 英文 | 300B | base模型 |
398399
| TeleChat | [Tele-AI](https://github.com/Tele-AI/Telechat) | 中文<br>英文 | 7B-12B | chat模型 |
399400
| dbrx | [databricks](https://github.com/databricks/dbrx) | 英文 | 132B | base模型<br>chat模型 |
401+
| mengzi3 | [Langboat](https://github.com/Langboat/Mengzi3) | 中文<br>英文 | 13B | base模型 |
402+
400403

401404
#### 多模态大模型
402405

docs/source/LLM/支持的模型和数据集.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,7 @@
206206
|grok-1|[colossalai/grok-1-pytorch](https://modelscope.cn/models/colossalai/grok-1-pytorch/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|
207207
|dbrx-instruct|[AI-ModelScope/dbrx-instruct](https://modelscope.cn/models/AI-ModelScope/dbrx-instruct/summary)|attn.Wqkv|dbrx|&#x2714;|&#x2714;|transformers>=4.36|-|
208208
|dbrx-base|[AI-ModelScope/dbrx-base](https://modelscope.cn/models/AI-ModelScope/dbrx-base/summary)|attn.Wqkv|dbrx|&#x2714;|&#x2714;|transformers>=4.36|-|
209+
|mengzi3-13b-base|[langboat/Mengzi3-13B-Base](https://modelscope.cn/models/langboat/Mengzi3-13B-Base/summary)|q_proj, k_proj, v_proj|mengzi|&#x2714;|&#x2714;||-|
209210

210211

211212
## 数据集
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Experimental environment: A100
2+
# 30GB GPU memory
3+
CUDA_VISIBLE_DEVICES=0 \
4+
swift infer \
5+
--ckpt_dir "output/mengzi3-13b-base/vx-xxx/checkpoint-xxx" \
6+
--load_dataset_config true \
7+
--temperature 0.1 \
8+
--top_p 0.7 \
9+
--repetition_penalty 1. \
10+
--do_sample true \
11+
--merge_lora false \
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Experimental environment: 2 * A100
2+
# 2 * 36GB GPU memory
3+
nproc_per_node=2
4+
5+
CUDA_VISIBLE_DEVICES=0,1 \
6+
NPROC_PER_NODE=$nproc_per_node \
7+
MASTER_PORT=29500 \
8+
swift sft \
9+
--model_id_or_path langboat/Mengzi3-13B-Base \
10+
--model_revision master \
11+
--sft_type lora \
12+
--tuner_backend swift \
13+
--dtype bf16 \
14+
--output_dir output \
15+
--ddp_backend nccl \
16+
--dataset dureader-robust-zh \
17+
--train_dataset_sample -1 \
18+
--num_train_epochs 1 \
19+
--max_length 2048 \
20+
--check_dataset_strategy warning \
21+
--lora_rank 8 \
22+
--lora_alpha 32 \
23+
--lora_dropout_p 0.05 \
24+
--lora_target_modules DEFAULT \
25+
--gradient_checkpointing true \
26+
--batch_size 1 \
27+
--weight_decay 0.1 \
28+
--learning_rate 1e-4 \
29+
--gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
30+
--max_grad_norm 0.5 \
31+
--warmup_ratio 0.03 \
32+
--eval_steps 100 \
33+
--save_steps 100 \
34+
--save_total_limit 2 \
35+
--logging_steps 10 \
36+
--deepspeed default-zero2 \

swift/llm/utils/model.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,8 @@ class ModelType:
275275
# dbrx
276276
dbrx_instruct = 'dbrx-instruct'
277277
dbrx_base = 'dbrx-base'
278+
# mengzi
279+
mengzi3_13b_base = 'mengzi3-13b-base'
278280

279281
@classmethod
280282
def get_model_name_list(cls) -> List[str]:
@@ -437,6 +439,13 @@ def _register_model(
437439
TemplateType.default_generation,
438440
requires=['transformers<4.34'],
439441
support_vllm=True)
442+
@register_model(
443+
ModelType.mengzi3_13b_base,
444+
'langboat/Mengzi3-13B-Base',
445+
LoRATM.llama2,
446+
TemplateType.mengzi,
447+
support_vllm=True,
448+
support_flash_attn=True)
440449
def get_model_tokenizer_from_repo(model_dir: str,
441450
torch_dtype: Optional[Dtype],
442451
model_kwargs: Dict[str, Any],

swift/llm/utils/template.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ class TemplateType:
6060
chatml = 'chatml'
6161
telechat = 'telechat'
6262
dbrx = 'dbrx'
63+
mengzi = 'mengzi'
6364

6465
@classmethod
6566
def get_template_name_list(cls) -> List[str]:
@@ -1220,6 +1221,11 @@ def get_generate_ids(generate_ids: Tensor,
12201221
['<|im_end|>\n'], ['<|im_end|>'], DBRX_SYSTEM,
12211222
['<|im_start|>system\n{{SYSTEM}}<|im_end|>\n']))
12221223

1224+
register_template(
1225+
TemplateType.mengzi,
1226+
Template([], ['输入:{{QUERY}}输出:\n'], [], [['eos_token_id']], None,
1227+
['指令:{{SYSTEM}}']))
1228+
12231229

12241230
def get_template(
12251231
template_type: str,

0 commit comments

Comments
 (0)