Commit 45256a0: update feat mp+ddp (#68)
Parent: 495a76c
File tree: 18 files changed, +187 -131 lines

README.md
Lines changed: 1 addition & 1 deletion

@@ -41,7 +41,7 @@ Key features:
 6. openbuddy-llama series: openbuddy-llama2-13b, openbuddy-llama-65b, openbuddy-llama2-70b
 7. internlm series: internlm-7b, internlm-7b-chat, internlm-7b-chat-8k
 8. other: polylm-13b, seqgpt-560m
-3. supported features: quantization, ddp, model parallelism(device map), gradient checkpointing, gradient accumulation, pushing to modelscope hub, custom datasets, multimodal and agent SFT, mutli-round chat, ...
+3. supported features: quantization, DDP, model parallelism(device map), gradient checkpointing, gradient accumulation, pushing to modelscope hub, custom datasets, multimodal and agent SFT, mutli-round chat, ...
 4. supported datasets:
 1. NLP: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en, cmnli-zh, jd-zh, dureader-robust-zh, medical-en, medical-zh, medical-mini-zh, sharegpt-en, sharegpt-zh
 2. agent: [damo-agent-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), damo-agent-mini-zh
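The feature list above names quantization among the supported features. As a hedged aside (this is not code from the commit), 4-bit loading with bitsandbytes via Hugging Face transformers looks roughly like the sketch below; the model id and exact flags are illustrative stand-ins for the scripts' `--quantization_bit 4` / `--bnb_4bit_comp_dtype bf16` options.

```python
# Hedged sketch: 4-bit quantized loading with bitsandbytes, analogous to
# --quantization_bit 4 / --bnb_4bit_comp_dtype bf16 in the training scripts.
# Assumes `pip install bitsandbytes -U` and a CUDA GPU; the hub id is an
# assumption, not taken from this commit.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # analogous to --quantization_bit 4
    bnb_4bit_compute_dtype=torch.bfloat16,  # analogous to --bnb_4bit_comp_dtype bf16
)
model = AutoModelForCausalLM.from_pretrained(
    "qwen/Qwen-7B-Chat",                    # illustrative model id (assumption)
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
```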

examples/pytorch/llm/README.md
Lines changed: 16 additions & 5 deletions

@@ -25,7 +25,7 @@
 6. openbuddy-llama series: openbuddy-llama2-13b, openbuddy-llama-65b, openbuddy-llama2-70b
 7. internlm series: internlm-7b, internlm-7b-chat, internlm-7b-chat-8k
 8. other: polylm-13b, seqgpt-560m
-3. supported features: quantization, ddp, model parallelism(device map), gradient checkpointing, gradient accumulation, pushing to modelscope hub, custom datasets, multimodal and agent SFT, mutli-round chat, ...
+3. supported features: quantization, DDP, model parallelism(device map), gradient checkpointing, gradient accumulation, pushing to modelscope hub, custom datasets, multimodal and agent SFT, mutli-round chat, ...
 4. supported datasets:
 1. NLP: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en, cmnli-zh, jd-zh, dureader-robust-zh, medical-en, medical-zh, medical-mini-zh, sharegpt-en, sharegpt-zh
 2. agent: [damo-agent-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), damo-agent-mini-zh

@@ -79,24 +79,35 @@ bash scripts/qwen_7b_chat/lora/sft.sh
 bash scripts/qwen_7b_chat/lora/infer.sh
 
 # sft(lora+ddp) and infer qwen-7b-chat, Requires 2*27GB GPU memory.
+# Recommended experimental environment: A100
 bash scripts/qwen_7b_chat/lora_ddp/sft.sh
 bash scripts/qwen_7b_chat/lora_ddp/infer.sh
 
+# sft(lora+mp+ddp) and infer qwen-7b-chat, Requires 4*14GB GPU memory.
+# Recommended experimental environment: V100, A10, 3090
+bash scripts/qwen_7b_chat/lora_mp_ddp/sft.sh
+bash scripts/qwen_7b_chat/lora_mp_ddp/infer.sh
+
 # sft(qlora) and infer qwen-7b-chat, Requires 13GB GPU memory.
 # If you want to use quantification, you need to `pip install bitsandbytes -U`
-# Recommended experimental environment: 3090
+# Recommended experimental environment: A10, 3090
 bash scripts/qwen_7b_chat/qlora/sft.sh
 bash scripts/qwen_7b_chat/qlora/infer.sh
 
 # sft(qlora+ddp) and infer qwen-7b-chat, Requires 2*13GB GPU memory.
+# Recommended experimental environment: A10, 3090
 bash scripts/qwen_7b_chat/qlora_ddp/sft.sh
 bash scripts/qwen_7b_chat/qlora_ddp/infer.sh
 
-# sft(full) and infer qwen-7b-chat, Requires 100GB GPU memory.
+# sft(full+mp) and infer qwen-7b-chat, Requires 2*50GB GPU memory.
 # Recommended experimental environment: A100
-bash scripts/qwen_7b_chat/full/sft.sh
-bash scripts/qwen_7b_chat/full/infer.sh
+bash scripts/qwen_7b_chat/full_mp/sft.sh
+bash scripts/qwen_7b_chat/full_mp/infer.sh
 
+# sft(full+mp+ddp) and infer qwen-7b-chat, Requires 4*50GB GPU memory.
+# Recommended experimental environment: A100
+bash scripts/qwen_7b_chat/full_mp_ddp/sft.sh
+bash scripts/qwen_7b_chat/full_mp_ddp/infer.sh
 # For more scripts, please see `scripts/` folder.
 ```
 
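The lora_mp_ddp and full_mp_ddp recipes above combine device-map model parallelism (mp) with DDP: torchrun starts fewer processes than there are visible GPUs, and each process shards its own model replica across a disjoint group of GPUs. Below is a minimal sketch of that rank-to-GPU assignment, assuming a transformers/accelerate-style device_map; it is not the repository's llm_sft.py, and the memory budget and model id are illustrative.

```python
# Hedged sketch of mp+ddp GPU assignment: with 4 visible GPUs and
# nproc_per_node=2, rank 0 shards its model copy over GPUs 0-1 and
# rank 1 over GPUs 2-3.
import os
import torch
from transformers import AutoModelForCausalLM

local_rank = int(os.environ.get("LOCAL_RANK", 0))  # set by torchrun
world_size = int(os.environ.get("WORLD_SIZE", 1))
n_gpu = torch.cuda.device_count()                  # e.g. 4
gpus_per_rank = n_gpu // world_size                # e.g. 4 // 2 = 2
my_gpus = range(local_rank * gpus_per_rank, (local_rank + 1) * gpus_per_rank)

# Give every GPU outside this rank's group a zero budget, so that
# device_map="auto" places layers only on the rank's own GPUs.
max_memory = {i: ("14GiB" if i in my_gpus else "0GiB") for i in range(n_gpu)}
model = AutoModelForCausalLM.from_pretrained(
    "qwen/Qwen-7B-Chat",       # illustrative model id (assumption)
    device_map="auto",
    max_memory=max_memory,
    trust_remote_code=True,
)
```

Each replica then participates in gradient all-reduce as an ordinary DDP worker, which is why two 2-GPU shards of a 7B model fit in 4*14GB while plain DDP would need the full model on every GPU.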
examples/pytorch/llm/README_CN.md
Lines changed: 16 additions & 4 deletions

@@ -81,23 +81,35 @@ bash scripts/qwen_7b_chat/lora/sft.sh
 bash scripts/qwen_7b_chat/lora/infer.sh
 
 # Fine-tuning (lora+ddp) + inference for qwen-7b-chat, requires 2 * 27GB GPU memory.
+# Recommended experimental environment: A100
 bash scripts/qwen_7b_chat/lora_ddp/sft.sh
 bash scripts/qwen_7b_chat/lora_ddp/infer.sh
 
+# Fine-tuning (lora+mp+ddp) + inference for qwen-7b-chat, requires 4 * 14GB GPU memory.
+# Recommended experimental environment: V100, 3090, A10
+bash scripts/qwen_7b_chat/lora_mp_ddp/sft.sh
+bash scripts/qwen_7b_chat/lora_mp_ddp/infer.sh
+
 # Fine-tuning (qlora) + inference for qwen-7b-chat, requires 13GB GPU memory.
 # If you want to use quantization, you need to `pip install bitsandbytes -U`
-# Recommended experimental environment: 3090
+# Recommended experimental environment: 3090, A10
 bash scripts/qwen_7b_chat/qlora/sft.sh
 bash scripts/qwen_7b_chat/qlora/infer.sh
 
 # Fine-tuning (qlora+ddp) + inference for qwen-7b-chat, requires 2 * 13GB GPU memory.
+# Recommended experimental environment: 3090, A10
 bash scripts/qwen_7b_chat/qlora_ddp/sft.sh
 bash scripts/qwen_7b_chat/qlora_ddp/infer.sh
 
-# Fine-tuning (full) + inference for qwen-7b-chat, requires 100GB GPU memory.
+# Fine-tuning (full+mp) + inference for qwen-7b-chat, requires 2 * 50GB GPU memory.
+# Recommended experimental environment: A100
+bash scripts/qwen_7b_chat/full_mp/sft.sh
+bash scripts/qwen_7b_chat/full_mp/infer.sh
+
+# Fine-tuning (full+mp+ddp) + inference for qwen-7b-chat, requires 4 * 50GB GPU memory.
 # Recommended experimental environment: A100
-bash scripts/qwen_7b_chat/full/sft.sh
-bash scripts/qwen_7b_chat/full/infer.sh
+bash scripts/qwen_7b_chat/full_mp_ddp/sft.sh
+bash scripts/qwen_7b_chat/full_mp_ddp/infer.sh
 
 # For more scripts, see the `scripts` folder.
 ```

examples/pytorch/llm/scripts/qwen_7b_chat/full/sft.sh renamed to examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/sft.sh
Lines changed: 1 addition & 1 deletion

@@ -1,5 +1,5 @@
 # Experimental environment: 2 * A100
-# 100GB GPU memory
+# 2 * 50GB GPU memory
 CUDA_VISIBLE_DEVICES=0,1 \
 python src/llm_sft.py \
     --model_type qwen-7b-chat \

examples/pytorch/llm/scripts/qwen_vl_chat/lora/infer.sh renamed to examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh
Lines changed: 5 additions & 8 deletions

@@ -1,16 +1,13 @@
 CUDA_VISIBLE_DEVICES=0 \
 python src/llm_infer.py \
-    --model_type qwen-vl-chat \
-    --sft_type lora \
+    --model_type qwen-7b-chat \
+    --sft_type full \
     --template_type chatml \
     --dtype bf16 \
-    --ckpt_dir "runs/qwen-vl-chat/vx_xxx/checkpoint-xxx" \
-    --eval_human false \
-    --dataset coco-en \
-    --dataset_sample 20000 \
-    --max_length 2048 \
-    --max_new_tokens 1024 \
+    --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \
+    --eval_human true \
     --use_flash_attn true \
+    --max_new_tokens 1024 \
     --temperature 0.9 \
     --top_k 50 \
     --top_p 0.9 \
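The sampling flags in these infer scripts (temperature, top_k, top_p, max_new_tokens) match the Hugging Face generate() arguments of the same names. A minimal, self-contained sketch of that mapping follows; it is not the repository's llm_infer.py, and gpt2 stands in for the fine-tuned checkpoint.

```python
# Hedged sketch: the infer script's sampling flags expressed as
# transformers generate() kwargs. gpt2 is a small stand-in model.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tokenizer("Hello, ", return_tensors="pt")
output_ids = model.generate(
    **inputs,
    do_sample=True,      # sampling, as implied by temperature/top_k/top_p
    max_new_tokens=32,   # the scripts use 1024; kept small here
    temperature=0.9,     # --temperature 0.9
    top_k=50,            # --top_k 50
    top_p=0.9,           # --top_p 0.9
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```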
examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/sft.sh (new file)
Lines changed: 34 additions & 0 deletions

@@ -0,0 +1,34 @@
+# Experimental environment: 4 * A100
+# 4 * 50GB GPU memory
+nproc_per_node=2
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
+torchrun \
+    --nproc_per_node=$nproc_per_node \
+    --master_port 29500 \
+    src/llm_sft.py \
+    --model_type qwen-7b-chat \
+    --sft_type full \
+    --template_type chatml \
+    --dtype bf16 \
+    --output_dir runs \
+    --dataset alpaca-en,alpaca-zh \
+    --dataset_sample -1 \
+    --num_train_epochs 1 \
+    --max_length 2048 \
+    --gradient_checkpointing false \
+    --batch_size 1 \
+    --weight_decay 0.01 \
+    --learning_rate 2e-5 \
+    --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
+    --max_grad_norm 1 \
+    --warmup_ratio 0.03 \
+    --eval_steps 100 \
+    --save_steps 100 \
+    --only_save_model true \
+    --save_total_limit 2 \
+    --logging_steps 10 \
+    --use_flash_attn true \
+    --push_to_hub false \
+    --hub_model_id qwen-7b-chat-full \
+    --hub_private_repo true \
+    --hub_token 'your-sdk-token' \
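Note the `--gradient_accumulation_steps $(expr 16 / $nproc_per_node)` line: it keeps the effective global batch size fixed at batch_size * nproc_per_node * accumulation_steps = 1 * 2 * 8 = 16, regardless of the DDP width. A quick check of that arithmetic:

```python
# The global batch size stays constant as the number of DDP workers varies.
batch_size = 1
for nproc_per_node in (1, 2, 4, 8):
    grad_accum = 16 // nproc_per_node  # mirrors $(expr 16 / $nproc_per_node)
    print(nproc_per_node, batch_size * nproc_per_node * grad_accum)  # 16 each time
```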

examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh
Lines changed: 1 addition & 0 deletions

@@ -6,6 +6,7 @@ python src/llm_infer.py \
     --dtype bf16 \
     --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \
     --eval_human true \
+    --use_flash_attn true \
     --max_new_tokens 1024 \
     --temperature 0.9 \
     --top_k 50 \

examples/pytorch/llm/scripts/qwen_vl_chat/qlora_ddp/infer.sh renamed to examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh
Lines changed: 5 additions & 9 deletions

@@ -1,18 +1,14 @@
 CUDA_VISIBLE_DEVICES=0 \
 python src/llm_infer.py \
-    --model_type qwen-vl-chat \
+    --model_type qwen-7b-chat \
     --sft_type lora \
     --template_type chatml \
-    --dtype bf16 \
-    --ckpt_dir "runs/qwen-vl-chat/vx_xxx/checkpoint-xxx" \
-    --eval_human false \
-    --dataset coco-en \
-    --dataset_sample 20000 \
+    --dtype fp16 \
+    --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \
+    --eval_human true \
     --max_length 2048 \
-    --quantization_bit 4 \
-    --bnb_4bit_comp_dtype bf16 \
-    --max_new_tokens 1024 \
     --use_flash_attn false \
+    --max_new_tokens 1024 \
     --temperature 0.9 \
     --top_k 50 \
     --top_p 0.9 \
examples/pytorch/llm/scripts/qwen_vl_chat/qlora_ddp/sft.sh renamed to examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh

@@ -1,27 +1,26 @@
-# Experimental environment: 3090
+# Experimental environment: 4 * V100(16GB)
+# 4 * 14GB GPU memory
 nproc_per_node=2
-CUDA_VISIBLE_DEVICES=0,1 \
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
 torchrun \
     --nproc_per_node=$nproc_per_node \
     --master_port 29500 \
     src/llm_sft.py \
-    --model_type qwen-vl-chat \
+    --model_type qwen-7b-chat \
     --sft_type lora \
     --template_type chatml \
-    --dtype bf16 \
+    --dtype fp16 \
     --output_dir runs \
     --ddp_backend nccl \
-    --dataset coco-en \
-    --dataset_sample 20000 \
+    --dataset alpaca-en,alpaca-zh \
+    --dataset_sample -1 \
     --num_train_epochs 1 \
     --max_length 2048 \
-    --quantization_bit 4 \
-    --bnb_4bit_comp_dtype bf16 \
     --lora_rank 8 \
     --lora_alpha 32 \
     --lora_dropout_p 0. \
-    --lora_target_modules c_attn attn.c_proj \
-    --gradient_checkpointing false \
+    --lora_target_modules c_attn c_proj \
+    --gradient_checkpointing true \
     --batch_size 1 \
     --weight_decay 0. \
     --learning_rate 1e-4 \

@@ -34,6 +33,6 @@ torchrun \
     --logging_steps 10 \
     --use_flash_attn false \
     --push_to_hub false \
-    --hub_model_id qwen-vl-chat-qlora \
+    --hub_model_id qwen-7b-chat-lora \
     --hub_private_repo true \
     --hub_token 'your-sdk-token' \
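Two changes stand out in this script: bf16 becomes fp16 (V100-class GPUs lack bfloat16 support) and gradient checkpointing is turned on to fit within 14GB per GPU. Below is a hedged sketch of what the LoRA flags correspond to in peft/transformers terms; the repository ships its own tuner code, so this is an analogy, not its implementation, and the model id is illustrative.

```python
# Hedged sketch: the script's LoRA flags expressed with peft/transformers.
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "qwen/Qwen-7B-Chat", trust_remote_code=True  # illustrative id (assumption)
)
model.gradient_checkpointing_enable()      # --gradient_checkpointing true
model.enable_input_require_grads()         # lets checkpointing backprop into the
                                           # LoRA layers while base weights stay frozen
lora_config = LoraConfig(
    r=8,                                   # --lora_rank 8
    lora_alpha=32,                         # --lora_alpha 32
    lora_dropout=0.0,                      # --lora_dropout_p 0.
    target_modules=["c_attn", "c_proj"],   # --lora_target_modules c_attn c_proj
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()         # only the LoRA adapters are trained
```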
