Commit 56b331f
Replace with loralib and add unload lora interface (#83)
1 parent e63669e commit 56b331f

File tree

30 files changed: +264 −548 lines changed

examples/pytorch/llm/README.md

Lines changed: 8 additions & 8 deletions
````diff
@@ -28,13 +28,13 @@
 3. supported features: quantization, DDP, model parallelism(device map), gradient checkpointing, gradient accumulation, pushing to modelscope hub, custom datasets, multimodal and agent SFT, multi-round chat, ...
 4. supported datasets:
    1. NLP: [alpaca-en](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-en/summary)(gpt4), [alpaca-zh](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-zh/summary)(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en, cmnli-zh, [jd-zh](https://modelscope.cn/datasets/DAMO_NLP/jd/summary), [dureader-robust-zh](https://modelscope.cn/datasets/modelscope/DuReader_robust-QG/summary), medical-en, medical-zh, medical-mini-zh, sharegpt-en, sharegpt-zh, [code-python-zh](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k/summary), [advertise-gen](https://modelscope.cn/datasets/lvjianjin/AdvertiseGen/summary)
-   2. agent: [damo-agent-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), damo-agent-mini-zh
-   3. multi-modal: [coco-en](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)
-   4. other: [cls-fudan-news-zh](https://modelscope.cn/datasets/damo/zh_cls_fudan-news/files), [ner-jave-zh](https://modelscope.cn/datasets/damo/zh_ner-JAVE/summary)
+   2. Agent: [damo-agent-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), damo-agent-mini-zh
+   3. Multi-Modal: [coco-en](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)
+   4. Other: [cls-fudan-news-zh](https://modelscope.cn/datasets/damo/zh_cls_fudan-news/files), [ner-jave-zh](https://modelscope.cn/datasets/damo/zh_ner-JAVE/summary)
 5. supported templates: chatml(qwen), baichuan, chatglm2, llama, openbuddy-llama, default, default-generation
 
 ## Prepare the Environment
-Experimental environment: V100, A10, 3090, A100, ... (V100 does not support bf16, quantization)
+Experimental environment: V100, A10, 3090, A100, ...
 ```bash
 # Installing miniconda
 wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
@@ -74,7 +74,7 @@ cd swift/examples/pytorch/llm
 # If you want to push weights into modelscope hub during training, you need to set '--push_to_hub true'.
 # Recommended experimental environment: A100
 bash scripts/qwen_7b_chat/lora/sft.sh
-bash scripts/qwen_7b_chat/lora/merge_lora_and_infer.sh
+bash scripts/qwen_7b_chat/lora/infer.sh
 
 # sft(lora+ddp) and infer qwen-7b-chat, requires 2*38GB GPU memory.
 # Recommended experimental environment: A100
@@ -88,12 +88,12 @@ bash scripts/qwen_7b_chat/lora_mp_ddp/infer.sh
 
 # sft(qlora) and infer qwen-7b-chat, requires 12GB GPU memory.
 # If you want to use quantization, you need to `pip install bitsandbytes -U`
-# Recommended experimental environment: A10, 3090
+# Recommended experimental environment: V100, A10, 3090
 bash scripts/qwen_7b_chat/qlora/sft.sh
-bash scripts/qwen_7b_chat/qlora/merge_lora_and_infer.sh
+bash scripts/qwen_7b_chat/qlora/infer.sh
 
 # sft(qlora+ddp) and infer qwen-7b-chat, requires 2*14GB GPU memory.
-# Recommended experimental environment: A10, 3090
+# Recommended experimental environment: V100, A10, 3090
 bash scripts/qwen_7b_chat/qlora_ddp/sft.sh
 bash scripts/qwen_7b_chat/qlora_ddp/infer.sh
 
````

examples/pytorch/llm/README_CN.md

Lines changed: 6 additions & 6 deletions
````diff
@@ -29,13 +29,13 @@
 3. 支持的特性: 模型量化, DDP, 模型并行(device_map), gradient checkpointing, 梯度累加, 支持推送ModelScope Hub, 自定义数据集, 多模态和Agent SFT, 多轮对话, ...
 4. 支持的数据集:
    1. NLP: [alpaca-en](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-en/summary)(gpt4), [alpaca-zh](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-zh/summary)(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en, cmnli-zh, [jd-zh](https://modelscope.cn/datasets/DAMO_NLP/jd/summary), [dureader-robust-zh](https://modelscope.cn/datasets/modelscope/DuReader_robust-QG/summary), medical-en, medical-zh, medical-mini-zh, sharegpt-en, sharegpt-zh, [code-python-zh](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k/summary), [advertise-gen](https://modelscope.cn/datasets/lvjianjin/AdvertiseGen/summary)
-   2. agent: [damo-agent-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), damo-agent-mini-zh
+   2. Agent: [damo-agent-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), damo-agent-mini-zh
    3. 多模态: [coco-en](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)
    4. 其他: [cls-fudan-news-zh](https://modelscope.cn/datasets/damo/zh_cls_fudan-news/files), [ner-jave-zh](https://modelscope.cn/datasets/damo/zh_ner-JAVE/summary)
 5. 支持的对话模板: chatml(qwen), baichuan, chatglm2, llama, openbuddy-llama, default, default-generation
 
 ## 准备实验环境
-实验环境: V100, A10, 3090, A100均可. (V100不支持bf16, 量化)
+实验环境: V100, A10, 3090, A100均可.
 ```bash
 # 安装miniconda
 wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
@@ -76,7 +76,7 @@ cd swift/examples/pytorch/llm
 # 如果你想在训练时, 将权重push到modelscope hub中, 你需要设置`--push_to_hub true`.
 # 推荐的实验环境: A100
 bash scripts/qwen_7b_chat/lora/sft.sh
-bash scripts/qwen_7b_chat/lora/merge_lora_and_infer.sh
+bash scripts/qwen_7b_chat/lora/infer.sh
 
 # 微调(lora+ddp)+推理 qwen-7b-chat, 需要2卡*38GB显存.
 # 推荐的实验环境: A100
@@ -90,12 +90,12 @@ bash scripts/qwen_7b_chat/lora_mp_ddp/infer.sh
 
 # 微调(qlora)+推理 qwen-7b-chat, 需要12GB显存.
 # 如果你想要使用量化, 你需要`pip install bitsandbytes -U`
-# 推荐的实验环境: 3090, A10
+# 推荐的实验环境: V100, 3090, A10
 bash scripts/qwen_7b_chat/qlora/sft.sh
-bash scripts/qwen_7b_chat/qlora/merge_lora_and_infer.sh
+bash scripts/qwen_7b_chat/qlora/infer.sh
 
 # 微调(qlora+ddp)+推理 qwen-7b-chat, 需要2卡*14GB显存.
-# 推荐的实验环境: 3090, A10
+# 推荐的实验环境: V100, 3090, A10
 bash scripts/qwen_7b_chat/qlora_ddp/sft.sh
 bash scripts/qwen_7b_chat/qlora_ddp/infer.sh
 
````

```diff
@@ -1,10 +1,10 @@
 CUDA_VISIBLE_DEVICES=0 \
 python src/llm_infer.py \
-    --model_type baichuan2-7b-chat \
+    --model_type baichuan2-7b \
     --sft_type lora \
-    --template_type baichuan \
+    --template_type default \
     --dtype bf16 \
-    --ckpt_dir "output/baichuan2-7b-chat/vx_xxx/checkpoint-xxx" \
+    --ckpt_dir "output/baichuan2-7b/vx_xxx/checkpoint-xxx" \
     --eval_human false \
     --dataset advertise-gen \
     --max_length 2048 \
@@ -15,3 +15,4 @@ python src/llm_infer.py \
     --top_k 20 \
     --top_p 0.9 \
     --do_sample true \
+    --merge_lora_and_save false \
```

examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora/sft.sh renamed to examples/pytorch/llm/scripts/baichuan2_7b/qlora/sft.sh

Lines changed: 3 additions & 3 deletions
```diff
@@ -2,9 +2,9 @@
 # 12GB GPU memory
 CUDA_VISIBLE_DEVICES=0 \
 python src/llm_sft.py \
-    --model_type baichuan2-7b-chat \
+    --model_type baichuan2-7b \
     --sft_type lora \
-    --template_type baichuan \
+    --template_type default \
     --dtype bf16 \
     --output_dir output \
     --dataset advertise-gen \
@@ -29,6 +29,6 @@ python src/llm_sft.py \
     --save_total_limit 2 \
     --logging_steps 10 \
     --push_to_hub false \
-    --hub_model_id baichuan2-7b-chat-qlora \
+    --hub_model_id baichuan2-7b-qlora \
     --hub_private_repo true \
     --hub_token 'your-sdk-token' \
```

examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh

Lines changed: 1 addition & 0 deletions
```diff
@@ -13,3 +13,4 @@ python src/llm_infer.py \
     --top_k 20 \
     --top_p 0.9 \
     --do_sample true \
+    --merge_lora_and_save false \
```

examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/sft.sh

Lines changed: 2 additions & 2 deletions
```diff
@@ -1,5 +1,5 @@
 # Experimental environment: 2 * A100
-# 2 * 44GB GPU memory
+# 2 * 30GB GPU memory
 nproc_per_node=2
 CUDA_VISIBLE_DEVICES=0,1 \
 torchrun \
@@ -19,7 +19,7 @@ torchrun \
     --lora_rank 8 \
     --lora_alpha 32 \
     --lora_dropout_p 0. \
-    --lora_target_modules W_pack o_proj \
+    --lora_target_modules ALL \
     --gradient_checkpointing false \
     --batch_size 1 \
     --weight_decay 0. \
```
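The change above swaps an explicit module list (`W_pack o_proj`) for the keyword `ALL`. As a hedged illustration only (not SWIFT's actual code; the function and module names here are hypothetical), a trainer could expand such a spec by matching it against the linear-layer names discovered in the model:

```python
def resolve_target_modules(spec, module_names):
    """Expand a --lora_target_modules spec into concrete module names.

    spec: the literal string "ALL", or an explicit list like ["W_pack", "o_proj"].
    module_names: linear-layer names found in the model (toy input here).
    """
    if spec == "ALL":
        # "ALL" selects every discovered linear layer.
        return sorted(module_names)
    # Otherwise keep only the explicitly requested names.
    return sorted(n for n in module_names if n in spec)

# Toy names loosely modeled on a Baichuan-style block (illustrative only).
names = {"W_pack", "o_proj", "gate_proj", "down_proj", "up_proj"}
print(resolve_target_modules(["W_pack", "o_proj"], names))
print(resolve_target_modules("ALL", names))
```

Applying LoRA to all linear layers trades a small amount of extra adapter memory for broader coverage, which is consistent with the memory-estimate comment changing in the same diff.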

examples/pytorch/llm/scripts/internlm_20b/lora_ddp/infer.sh

Lines changed: 1 addition & 0 deletions
```diff
@@ -13,3 +13,4 @@ python src/llm_infer.py \
     --top_k 20 \
     --top_p 0.9 \
     --do_sample true \
+    --merge_lora_and_save false \
```

examples/pytorch/llm/scripts/internlm_20b/qlora/infer.sh

Lines changed: 1 addition & 0 deletions
```diff
@@ -15,3 +15,4 @@ python src/llm_infer.py \
     --top_k 20 \
     --top_p 0.9 \
     --do_sample true \
+    --merge_lora_and_save false \
```

examples/pytorch/llm/scripts/internlm_20b_chat/lora_ddp/infer.sh

Lines changed: 1 addition & 0 deletions
```diff
@@ -13,3 +13,4 @@ python src/llm_infer.py \
     --top_k 20 \
     --top_p 0.9 \
     --do_sample true \
+    --merge_lora_and_save false \
```

examples/pytorch/llm/scripts/qwen_7b/lora_ddp/infer.sh

Lines changed: 2 additions & 0 deletions
```diff
@@ -1,3 +1,4 @@
+# If you want to merge LoRA weights, please set merge_lora_and_save to true.
 CUDA_VISIBLE_DEVICES=0 \
 python src/llm_infer.py \
     --model_type qwen-7b \
@@ -14,3 +15,4 @@ python src/llm_infer.py \
     --top_k 20 \
     --top_p 0.9 \
     --do_sample true \
+    --merge_lora_and_save false \
```
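The commit title mentions an "unload lora" interface, and the new `--merge_lora_and_save` flag controls whether the adapter is folded into the base weights. As a toy arithmetic sketch (not SWIFT's or loralib's actual implementation): a LoRA adapter contributes a low-rank delta `(alpha / r) * B @ A` to a weight matrix, so merging adds that delta and unloading subtracts the same delta, restoring the base weights:

```python
def matmul(X, Y):
    # Naive matrix multiply, enough for a toy-sized sketch.
    return [[sum(a * b for a, b in zip(row, col)) for col in zip(*Y)] for row in X]

def apply_delta(W, A, B, alpha, r, sign=+1):
    # LoRA delta is (alpha / r) * B @ A; sign=+1 merges it, sign=-1 unloads it.
    BA = matmul(B, A)
    s = sign * alpha / r
    return [[w + s * d for w, d in zip(rw, rd)] for rw, rd in zip(W, BA)]

# Toy shapes: base W is 2x3, rank-1 adapter with A (1x3) and B (2x1).
W = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
A = [[0.5, -1.0, 2.0]]
B = [[2.0], [-4.0]]
alpha, r = 32, 1

merged = apply_delta(W, A, B, alpha, r, sign=+1)        # merge adapter into base
restored = apply_delta(merged, A, B, alpha, r, sign=-1) # unload it again
assert restored == W  # exact for these values: the base weights come back
```

This is why a separate `merge_lora_and_infer.sh` script becomes unnecessary: the same `infer.sh` can run with the adapter attached, and merging becomes an opt-in flag rather than a distinct pipeline.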
