Commit 45256a0: update feat mp+ddp (#68)
Parent: 495a76c
File tree: 18 files changed, +187 -131 lines

README.md
Lines changed: 1 addition & 1 deletion

@@ -41,7 +41,7 @@ Key features:
 6. openbuddy-llama series: openbuddy-llama2-13b, openbuddy-llama-65b, openbuddy-llama2-70b
 7. internlm series: internlm-7b, internlm-7b-chat, internlm-7b-chat-8k
 8. other: polylm-13b, seqgpt-560m
-3. supported features: quantization, ddp, model parallelism(device map), gradient checkpointing, gradient accumulation, pushing to modelscope hub, custom datasets, multimodal and agent SFT, mutli-round chat, ...
+3. supported features: quantization, DDP, model parallelism(device map), gradient checkpointing, gradient accumulation, pushing to modelscope hub, custom datasets, multimodal and agent SFT, mutli-round chat, ...
 4. supported datasets:
 1. NLP: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en, cmnli-zh, jd-zh, dureader-robust-zh, medical-en, medical-zh, medical-mini-zh, sharegpt-en, sharegpt-zh
 2. agent: [damo-agent-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), damo-agent-mini-zh
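The feature list above names quantization among the supported features. As a hedged aside (this is not code from the commit), 4-bit loading with bitsandbytes via Hugging Face transformers looks roughly like the sketch below; the model id and exact flags are illustrative stand-ins for the scripts' `--quantization_bit 4` / `--bnb_4bit_comp_dtype bf16` options.

```python
# Hedged sketch: 4-bit quantized loading with bitsandbytes, analogous to
# --quantization_bit 4 / --bnb_4bit_comp_dtype bf16 in the training scripts.
# Assumes `pip install bitsandbytes -U` and a CUDA GPU; the hub id is an
# assumption, not taken from this commit.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # analogous to --quantization_bit 4
    bnb_4bit_compute_dtype=torch.bfloat16,  # analogous to --bnb_4bit_comp_dtype bf16
)
model = AutoModelForCausalLM.from_pretrained(
    "qwen/Qwen-7B-Chat",                    # illustrative model id (assumption)
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
```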

examples/pytorch/llm/README.md
Lines changed: 16 additions & 5 deletions

@@ -25,7 +25,7 @@
 6. openbuddy-llama series: openbuddy-llama2-13b, openbuddy-llama-65b, openbuddy-llama2-70b
 7. internlm series: internlm-7b, internlm-7b-chat, internlm-7b-chat-8k
 8. other: polylm-13b, seqgpt-560m
-3. supported features: quantization, ddp, model parallelism(device map), gradient checkpointing, gradient accumulation, pushing to modelscope hub, custom datasets, multimodal and agent SFT, mutli-round chat, ...
+3. supported features: quantization, DDP, model parallelism(device map), gradient checkpointing, gradient accumulation, pushing to modelscope hub, custom datasets, multimodal and agent SFT, mutli-round chat, ...
 4. supported datasets:
 1. NLP: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en, cmnli-zh, jd-zh, dureader-robust-zh, medical-en, medical-zh, medical-mini-zh, sharegpt-en, sharegpt-zh
 2. agent: [damo-agent-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), damo-agent-mini-zh

@@ -79,24 +79,35 @@ bash scripts/qwen_7b_chat/lora/sft.sh
 bash scripts/qwen_7b_chat/lora/infer.sh
 
 # sft(lora+ddp) and infer qwen-7b-chat, Requires 2*27GB GPU memory.
+# Recommended experimental environment: A100
 bash scripts/qwen_7b_chat/lora_ddp/sft.sh
 bash scripts/qwen_7b_chat/lora_ddp/infer.sh
 
+# sft(lora+mp+ddp) and infer qwen-7b-chat, Requires 4*14GB GPU memory.
+# Recommended experimental environment: V100, A10, 3090
+bash scripts/qwen_7b_chat/lora_mp_ddp/sft.sh
+bash scripts/qwen_7b_chat/lora_mp_ddp/infer.sh
+
 # sft(qlora) and infer qwen-7b-chat, Requires 13GB GPU memory.
 # If you want to use quantification, you need to `pip install bitsandbytes -U`
-# Recommended experimental environment: 3090
+# Recommended experimental environment: A10, 3090
 bash scripts/qwen_7b_chat/qlora/sft.sh
 bash scripts/qwen_7b_chat/qlora/infer.sh
 
 # sft(qlora+ddp) and infer qwen-7b-chat, Requires 2*13GB GPU memory.
+# Recommended experimental environment: A10, 3090
 bash scripts/qwen_7b_chat/qlora_ddp/sft.sh
 bash scripts/qwen_7b_chat/qlora_ddp/infer.sh
 
-# sft(full) and infer qwen-7b-chat, Requires 100GB GPU memory.
+# sft(full+mp) and infer qwen-7b-chat, Requires 2*50GB GPU memory.
 # Recommended experimental environment: A100
-bash scripts/qwen_7b_chat/full/sft.sh
-bash scripts/qwen_7b_chat/full/infer.sh
+bash scripts/qwen_7b_chat/full_mp/sft.sh
+bash scripts/qwen_7b_chat/full_mp/infer.sh
 
+# sft(full+mp+ddp) and infer qwen-7b-chat, Requires 4*50GB GPU memory.
+# Recommended experimental environment: A100
+bash scripts/qwen_7b_chat/full_mp_ddp/sft.sh
+bash scripts/qwen_7b_chat/full_mp_ddp/infer.sh
 # For more scripts, please see `scripts/` folder.
 ```
 
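The lora_mp_ddp and full_mp_ddp recipes above combine device-map model parallelism (mp) with DDP: torchrun starts fewer processes than there are visible GPUs, and each process shards its own model replica across a disjoint group of GPUs. Below is a minimal sketch of that rank-to-GPU assignment, assuming a transformers/accelerate-style device_map; it is not the repository's llm_sft.py, and the memory budget and model id are illustrative.

```python
# Hedged sketch of mp+ddp GPU assignment: with 4 visible GPUs and
# nproc_per_node=2, rank 0 shards its model copy over GPUs 0-1 and
# rank 1 over GPUs 2-3.
import os
import torch
from transformers import AutoModelForCausalLM

local_rank = int(os.environ.get("LOCAL_RANK", 0))  # set by torchrun
world_size = int(os.environ.get("WORLD_SIZE", 1))
n_gpu = torch.cuda.device_count()                  # e.g. 4
gpus_per_rank = n_gpu // world_size                # e.g. 4 // 2 = 2
my_gpus = range(local_rank * gpus_per_rank, (local_rank + 1) * gpus_per_rank)

# Give every GPU outside this rank's group a zero budget, so that
# device_map="auto" places layers only on the rank's own GPUs.
max_memory = {i: ("14GiB" if i in my_gpus else "0GiB") for i in range(n_gpu)}
model = AutoModelForCausalLM.from_pretrained(
    "qwen/Qwen-7B-Chat",       # illustrative model id (assumption)
    device_map="auto",
    max_memory=max_memory,
    trust_remote_code=True,
)
```

Each replica then participates in gradient all-reduce as an ordinary DDP worker, which is why two 2-GPU shards of a 7B model fit in 4*14GB while plain DDP would need the full model on every GPU.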
examples/pytorch/llm/README_CN.md
Lines changed: 16 additions & 4 deletions

@@ -81,23 +81,35 @@ bash scripts/qwen_7b_chat/lora/sft.sh
 bash scripts/qwen_7b_chat/lora/infer.sh
 
 # Fine-tuning (lora+ddp) + inference for qwen-7b-chat, requires 2 * 27GB GPU memory.
+# Recommended experimental environment: A100
 bash scripts/qwen_7b_chat/lora_ddp/sft.sh
 bash scripts/qwen_7b_chat/lora_ddp/infer.sh
 
+# Fine-tuning (lora+mp+ddp) + inference for qwen-7b-chat, requires 4 * 14GB GPU memory.
+# Recommended experimental environment: V100, 3090, A10
+bash scripts/qwen_7b_chat/lora_mp_ddp/sft.sh
+bash scripts/qwen_7b_chat/lora_mp_ddp/infer.sh
+
 # Fine-tuning (qlora) + inference for qwen-7b-chat, requires 13GB GPU memory.
 # If you want to use quantization, you need to `pip install bitsandbytes -U`
-# Recommended experimental environment: 3090
+# Recommended experimental environment: 3090, A10
 bash scripts/qwen_7b_chat/qlora/sft.sh
 bash scripts/qwen_7b_chat/qlora/infer.sh
 
 # Fine-tuning (qlora+ddp) + inference for qwen-7b-chat, requires 2 * 13GB GPU memory.
+# Recommended experimental environment: 3090, A10
 bash scripts/qwen_7b_chat/qlora_ddp/sft.sh
 bash scripts/qwen_7b_chat/qlora_ddp/infer.sh
 
-# Fine-tuning (full) + inference for qwen-7b-chat, requires 100GB GPU memory.
+# Fine-tuning (full+mp) + inference for qwen-7b-chat, requires 2 * 50GB GPU memory.
+# Recommended experimental environment: A100
+bash scripts/qwen_7b_chat/full_mp/sft.sh
+bash scripts/qwen_7b_chat/full_mp/infer.sh
+
+# Fine-tuning (full+mp+ddp) + inference for qwen-7b-chat, requires 4 * 50GB GPU memory.
 # Recommended experimental environment: A100
-bash scripts/qwen_7b_chat/full/sft.sh
-bash scripts/qwen_7b_chat/full/infer.sh
+bash scripts/qwen_7b_chat/full_mp_ddp/sft.sh
+bash scripts/qwen_7b_chat/full_mp_ddp/infer.sh
 
 # For more scripts, see the `scripts` folder.
 ```

examples/pytorch/llm/scripts/qwen_7b_chat/full/sft.sh renamed to examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/sft.sh
Lines changed: 1 addition & 1 deletion

@@ -1,5 +1,5 @@
 # Experimental environment: 2 * A100
-# 100GB GPU memory
+# 2 * 50GB GPU memory
 CUDA_VISIBLE_DEVICES=0,1 \
 python src/llm_sft.py \
     --model_type qwen-7b-chat \

examples/pytorch/llm/scripts/qwen_vl_chat/lora/infer.sh renamed to examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh
Lines changed: 5 additions & 8 deletions

@@ -1,16 +1,13 @@
 CUDA_VISIBLE_DEVICES=0 \
 python src/llm_infer.py \
-    --model_type qwen-vl-chat \
-    --sft_type lora \
+    --model_type qwen-7b-chat \
+    --sft_type full \
     --template_type chatml \
     --dtype bf16 \
-    --ckpt_dir "runs/qwen-vl-chat/vx_xxx/checkpoint-xxx" \
-    --eval_human false \
-    --dataset coco-en \
-    --dataset_sample 20000 \
-    --max_length 2048 \
-    --max_new_tokens 1024 \
+    --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \
+    --eval_human true \
     --use_flash_attn true \
+    --max_new_tokens 1024 \
     --temperature 0.9 \
     --top_k 50 \
     --top_p 0.9 \
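The sampling flags in these infer scripts (temperature, top_k, top_p, max_new_tokens) match the Hugging Face generate() arguments of the same names. A minimal, self-contained sketch of that mapping follows; it is not the repository's llm_infer.py, and gpt2 stands in for the fine-tuned checkpoint.

```python
# Hedged sketch: the infer script's sampling flags expressed as
# transformers generate() kwargs. gpt2 is a small stand-in model.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tokenizer("Hello, ", return_tensors="pt")
output_ids = model.generate(
    **inputs,
    do_sample=True,      # sampling, as implied by temperature/top_k/top_p
    max_new_tokens=32,   # the scripts use 1024; kept small here
    temperature=0.9,     # --temperature 0.9
    top_k=50,            # --top_k 50
    top_p=0.9,           # --top_p 0.9
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```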
examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/sft.sh (new file)
Lines changed: 34 additions & 0 deletions

@@ -0,0 +1,34 @@
+# Experimental environment: 4 * A100
+# 4 * 50GB GPU memory
+nproc_per_node=2
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
+torchrun \
+    --nproc_per_node=$nproc_per_node \
+    --master_port 29500 \
+    src/llm_sft.py \
+    --model_type qwen-7b-chat \
+    --sft_type full \
+    --template_type chatml \
+    --dtype bf16 \
+    --output_dir runs \
+    --dataset alpaca-en,alpaca-zh \
+    --dataset_sample -1 \
+    --num_train_epochs 1 \
+    --max_length 2048 \
+    --gradient_checkpointing false \
+    --batch_size 1 \
+    --weight_decay 0.01 \
+    --learning_rate 2e-5 \
+    --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
+    --max_grad_norm 1 \
+    --warmup_ratio 0.03 \
+    --eval_steps 100 \
+    --save_steps 100 \
+    --only_save_model true \
+    --save_total_limit 2 \
+    --logging_steps 10 \
+    --use_flash_attn true \
+    --push_to_hub false \
+    --hub_model_id qwen-7b-chat-full \
+    --hub_private_repo true \
+    --hub_token 'your-sdk-token' \
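Note the `--gradient_accumulation_steps $(expr 16 / $nproc_per_node)` line: it keeps the effective global batch size fixed at batch_size * nproc_per_node * accumulation_steps = 1 * 2 * 8 = 16, regardless of the DDP width. A quick check of that arithmetic:

```python
# The global batch size stays constant as the number of DDP workers varies.
batch_size = 1
for nproc_per_node in (1, 2, 4, 8):
    grad_accum = 16 // nproc_per_node  # mirrors $(expr 16 / $nproc_per_node)
    print(nproc_per_node, batch_size * nproc_per_node * grad_accum)  # 16 each time
```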

examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh
Lines changed: 1 addition & 0 deletions

@@ -6,6 +6,7 @@ python src/llm_infer.py \
     --dtype bf16 \
     --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \
     --eval_human true \
+    --use_flash_attn true \
     --max_new_tokens 1024 \
     --temperature 0.9 \
     --top_k 50 \

examples/pytorch/llm/scripts/qwen_vl_chat/qlora_ddp/infer.sh renamed to examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh
Lines changed: 5 additions & 9 deletions

@@ -1,18 +1,14 @@
 CUDA_VISIBLE_DEVICES=0 \
 python src/llm_infer.py \
-    --model_type qwen-vl-chat \
+    --model_type qwen-7b-chat \
     --sft_type lora \
     --template_type chatml \
-    --dtype bf16 \
-    --ckpt_dir "runs/qwen-vl-chat/vx_xxx/checkpoint-xxx" \
-    --eval_human false \
-    --dataset coco-en \
-    --dataset_sample 20000 \
+    --dtype fp16 \
+    --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \
+    --eval_human true \
     --max_length 2048 \
-    --quantization_bit 4 \
-    --bnb_4bit_comp_dtype bf16 \
-    --max_new_tokens 1024 \
     --use_flash_attn false \
+    --max_new_tokens 1024 \
     --temperature 0.9 \
     --top_k 50 \
     --top_p 0.9 \
examples/pytorch/llm/scripts/qwen_vl_chat/qlora_ddp/sft.sh renamed to examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh

@@ -1,27 +1,26 @@
-# Experimental environment: 3090
+# Experimental environment: 4 * V100(16GB)
+# 4 * 14GB GPU memory
 nproc_per_node=2
-CUDA_VISIBLE_DEVICES=0,1 \
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
 torchrun \
     --nproc_per_node=$nproc_per_node \
     --master_port 29500 \
     src/llm_sft.py \
-    --model_type qwen-vl-chat \
+    --model_type qwen-7b-chat \
     --sft_type lora \
     --template_type chatml \
-    --dtype bf16 \
+    --dtype fp16 \
     --output_dir runs \
     --ddp_backend nccl \
-    --dataset coco-en \
-    --dataset_sample 20000 \
+    --dataset alpaca-en,alpaca-zh \
+    --dataset_sample -1 \
     --num_train_epochs 1 \
     --max_length 2048 \
-    --quantization_bit 4 \
-    --bnb_4bit_comp_dtype bf16 \
     --lora_rank 8 \
     --lora_alpha 32 \
     --lora_dropout_p 0. \
-    --lora_target_modules c_attn attn.c_proj \
-    --gradient_checkpointing false \
+    --lora_target_modules c_attn c_proj \
+    --gradient_checkpointing true \
     --batch_size 1 \
     --weight_decay 0. \
     --learning_rate 1e-4 \

@@ -34,6 +33,6 @@ torchrun \
     --logging_steps 10 \
     --use_flash_attn false \
     --push_to_hub false \
-    --hub_model_id qwen-vl-chat-qlora \
+    --hub_model_id qwen-7b-chat-lora \
     --hub_private_repo true \
     --hub_token 'your-sdk-token' \
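Two changes stand out in this script: bf16 becomes fp16 (V100-class GPUs lack bfloat16 support) and gradient checkpointing is turned on to fit within 14GB per GPU. Below is a hedged sketch of what the LoRA flags correspond to in peft/transformers terms; the repository ships its own tuner code, so this is an analogy, not its implementation, and the model id is illustrative.

```python
# Hedged sketch: the script's LoRA flags expressed with peft/transformers.
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "qwen/Qwen-7B-Chat", trust_remote_code=True  # illustrative id (assumption)
)
model.gradient_checkpointing_enable()      # --gradient_checkpointing true
model.enable_input_require_grads()         # lets checkpointing backprop into the
                                           # LoRA layers while base weights stay frozen
lora_config = LoraConfig(
    r=8,                                   # --lora_rank 8
    lora_alpha=32,                         # --lora_alpha 32
    lora_dropout=0.0,                      # --lora_dropout_p 0.
    target_modules=["c_attn", "c_proj"],   # --lora_target_modules c_attn c_proj
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()         # only the LoRA adapters are trained
```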
