Commit 3baf4e5

update feat: merge lora (#82)
1 parent: c4c68ad

File tree

examples/pytorch/llm/README.md
examples/pytorch/llm/README_CN.md
examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh
examples/pytorch/llm/scripts/qwen_7b_chat/lora/merge_lora_and_infer.sh
examples/pytorch/llm/scripts/qwen_7b_chat/lora/sft.sh
examples/pytorch/llm/scripts/qwen_7b_chat/qlora/merge_lora_and_infer.sh
examples/pytorch/llm/src/merge_lora_and_infer.py
swift/utils/llm_utils.py

8 files changed: +98 -10 lines changed

examples/pytorch/llm/README.md

Lines changed: 2 additions & 2 deletions

@@ -74,7 +74,7 @@ cd swift/examples/pytorch/llm
 # If you want to push weights into the modelscope hub during training, you need to set '--push_to_hub true'.
 # Recommended experimental environment: A100
 bash scripts/qwen_7b_chat/lora/sft.sh
-bash scripts/qwen_7b_chat/lora/infer.sh
+bash scripts/qwen_7b_chat/lora/merge_lora_and_infer.sh
 
 # sft(lora+ddp) and infer qwen-7b-chat, requires 2*38GB GPU memory.
 # Recommended experimental environment: A100
@@ -90,7 +90,7 @@ bash scripts/qwen_7b_chat/lora_mp_ddp/infer.sh
 # If you want to use quantization, you need to `pip install bitsandbytes -U`
 # Recommended experimental environment: A10, 3090
 bash scripts/qwen_7b_chat/qlora/sft.sh
-bash scripts/qwen_7b_chat/qlora/infer.sh
+bash scripts/qwen_7b_chat/qlora/merge_lora_and_infer.sh
 
 # sft(qlora+ddp) and infer qwen-7b-chat, requires 2*14GB GPU memory.
 # Recommended experimental environment: A10, 3090

examples/pytorch/llm/README_CN.md

Lines changed: 2 additions & 2 deletions

@@ -76,7 +76,7 @@ cd swift/examples/pytorch/llm
 # If you want to push weights to the modelscope hub during training, you need to set `--push_to_hub true`.
 # Recommended experimental environment: A100
 bash scripts/qwen_7b_chat/lora/sft.sh
-bash scripts/qwen_7b_chat/lora/infer.sh
+bash scripts/qwen_7b_chat/lora/merge_lora_and_infer.sh
 
 # Fine-tune (lora+ddp) and infer qwen-7b-chat; requires 2 GPUs * 38GB of GPU memory.
 # Recommended experimental environment: A100
@@ -92,7 +92,7 @@ bash scripts/qwen_7b_chat/lora_mp_ddp/infer.sh
 # If you want to use quantization, you need to `pip install bitsandbytes -U`
 # Recommended experimental environment: 3090, A10
 bash scripts/qwen_7b_chat/qlora/sft.sh
-bash scripts/qwen_7b_chat/qlora/infer.sh
+bash scripts/qwen_7b_chat/qlora/merge_lora_and_infer.sh
 
 # Fine-tune (qlora+ddp) and infer qwen-7b-chat; requires 2 GPUs * 14GB of GPU memory.
 # Recommended experimental environment: 3090, A10

examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh

Lines changed: 3 additions & 3 deletions

@@ -6,10 +6,10 @@ python src/llm_infer.py \
     --dtype bf16 \
     --ckpt_dir "output/qwen-7b-chat/vx_xxx/checkpoint-xxx" \
     --eval_human false \
-    --dataset cot-en,cot-zh \
-    --max_length 2048 \
+    --dataset damo-agent-mini-zh \
+    --max_length 4096 \
     --use_flash_attn true \
-    --max_new_tokens 1024 \
+    --max_new_tokens 2048 \
     --temperature 0.9 \
     --top_k 20 \
     --top_p 0.9 \
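
Note: the sampling flags in this script map onto the standard transformers generation parameters. A minimal sketch of the rough correspondence (swift wires these up through its own `InferArguments`, so treat this as illustrative only):

```python
# Rough, illustrative mapping of the CLI flags above onto transformers'
# GenerationConfig; swift's own argument handling may differ in detail.
from transformers import GenerationConfig

generation_config = GenerationConfig(
    max_new_tokens=2048,  # --max_new_tokens 2048
    temperature=0.9,      # --temperature 0.9
    top_k=20,             # --top_k 20
    top_p=0.9,            # --top_p 0.9
    do_sample=True,       # --do_sample true
)
```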

examples/pytorch/llm/scripts/qwen_7b_chat/lora/merge_lora_and_infer.sh

Lines changed: 16 additions & 0 deletions

@@ -0,0 +1,16 @@
+CUDA_VISIBLE_DEVICES=0 \
+python src/merge_lora_and_infer.py \
+    --model_type qwen-7b-chat \
+    --sft_type lora \
+    --template_type chatml \
+    --dtype bf16 \
+    --ckpt_dir "output/qwen-7b-chat/vx_xxx/checkpoint-xxx" \
+    --eval_human false \
+    --dataset damo-agent-mini-zh \
+    --max_length 4096 \
+    --use_flash_attn true \
+    --max_new_tokens 2048 \
+    --temperature 0.9 \
+    --top_k 20 \
+    --top_p 0.9 \
+    --do_sample true \

examples/pytorch/llm/scripts/qwen_7b_chat/lora/sft.sh

Lines changed: 3 additions & 3 deletions

@@ -7,10 +7,10 @@ python src/llm_sft.py \
     --template_type chatml \
     --dtype bf16 \
     --output_dir output \
-    --dataset cot-en,cot-zh \
-    --train_dataset_sample 50000 \
+    --dataset damo-agent-mini-zh \
+    --train_dataset_sample -1 \
     --num_train_epochs 1 \
-    --max_length 2048 \
+    --max_length 4096 \
     --lora_rank 8 \
     --lora_alpha 32 \
     --lora_dropout_p 0. \
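
Note: the LoRA hyperparameters in this script have direct counterparts in the upstream peft `LoraConfig`; a minimal sketch under that assumption (swift.tuners ships its own LoRA implementation, so this is for orientation only, not the repo's API):

```python
# Illustrative peft equivalent of the LoRA flags above; swift's own LoRA
# config class differs, so treat this as a sketch rather than the real wiring.
from peft import LoraConfig

lora_config = LoraConfig(
    r=8,              # --lora_rank 8
    lora_alpha=32,    # --lora_alpha 32
    lora_dropout=0.,  # --lora_dropout_p 0.
)
```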

examples/pytorch/llm/scripts/qwen_7b_chat/qlora/merge_lora_and_infer.sh

Lines changed: 18 additions & 0 deletions

@@ -0,0 +1,18 @@
+CUDA_VISIBLE_DEVICES=0 \
+python src/merge_lora_and_infer.py \
+    --model_type qwen-7b-chat \
+    --sft_type lora \
+    --template_type chatml \
+    --dtype bf16 \
+    --ckpt_dir "output/qwen-7b-chat/vx_xxx/checkpoint-xxx" \
+    --eval_human false \
+    --dataset advertise-gen \
+    --max_length 2048 \
+    --quantization_bit 4 \
+    --bnb_4bit_comp_dtype bf16 \
+    --use_flash_attn false \
+    --max_new_tokens 1024 \
+    --temperature 0.9 \
+    --top_k 20 \
+    --top_p 0.9 \
+    --do_sample true \
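
Note: in the standard transformers API, the two quantization flags above correspond roughly to a 4-bit bitsandbytes configuration; a sketch under that assumption:

```python
# Rough mapping of --quantization_bit 4 / --bnb_4bit_comp_dtype bf16 onto
# transformers' BitsAndBytesConfig; swift's argument plumbing may differ.
import torch
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # --quantization_bit 4
    bnb_4bit_compute_dtype=torch.bfloat16,  # --bnb_4bit_comp_dtype bf16
)
```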

examples/pytorch/llm/src/merge_lora_and_infer.py

Lines changed: 53 additions & 0 deletions

@@ -0,0 +1,53 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+
+# os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+import torch
+from transformers import BitsAndBytesConfig, GenerationConfig, TextStreamer
+from utils import (InferArguments, get_dataset, get_model_tokenizer,
+                   get_preprocess)
+from llm_infer import llm_infer
+from swift import Swift, get_logger
+from swift.tuners import LoRA
+from swift.utils import inference, parse_args, seed_everything
+
+logger = get_logger()
+
+
+def merge_lora(args: InferArguments) -> None:
+    assert args.sft_type == 'lora'
+    args.init_argument()
+    logger.info(f'device_count: {torch.cuda.device_count()}')
+
+    # ### Loading Model and Tokenizer
+    model, tokenizer = get_model_tokenizer(
+        args.model_type, torch_dtype=args.torch_dtype, device_map='cpu')
+
+    # ### Preparing LoRA
+    model = Swift.from_pretrained(model, args.ckpt_dir, inference_mode=True)
+    if not hasattr(model, 'peft_type'):
+        LoRA.unpatch_lora(model, model.adapters['default'].config, 'default')
+    else:
+        model.merge_and_unload()
+
+    new_ckpt_dir = os.path.abspath(
+        os.path.join(args.ckpt_dir, '..', 'output_ckpt'))
+    logger.info(f'new_ckpt_dir: `{new_ckpt_dir}`')
+    logger.info("Setting args.sft_type: 'full'")
+    logger.info(f'Setting args.ckpt_dir: {new_ckpt_dir}')
+    args.ckpt_dir = new_ckpt_dir
+    args.sft_type = 'full'
+    if not os.path.exists(args.ckpt_dir):
+        model.model.save_pretrained(args.ckpt_dir)
+        tokenizer.save_pretrained(args.ckpt_dir)
+
+
+if __name__ == '__main__':
+    args, remaining_argv = parse_args(InferArguments)
+    if len(remaining_argv) > 0:
+        if args.ignore_args_error:
+            logger.warning(f'remaining_argv: {remaining_argv}')
+        else:
+            raise ValueError(f'remaining_argv: {remaining_argv}')
+    merge_lora(args)
+    llm_infer(args)
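
Note: for the peft branch above (`hasattr(model, 'peft_type')`), the merge step is the standard LoRA weight fold-in. A minimal standalone sketch with the upstream peft API, assuming the checkpoint is a plain peft LoRA adapter; the base-model ID is an assumption and the checkpoint path keeps the same placeholder as the scripts:

```python
# Standalone sketch of the merge step using upstream peft directly; assumes
# a plain peft LoRA adapter checkpoint. 'vx_xxx/checkpoint-xxx' is the same
# placeholder as in the scripts; substitute a real checkpoint path.
from peft import PeftModel
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained(
    'qwen/Qwen-7B-Chat', trust_remote_code=True)  # assumed base model ID
model = PeftModel.from_pretrained(
    base, 'output/qwen-7b-chat/vx_xxx/checkpoint-xxx')
merged = model.merge_and_unload()  # fold the LoRA deltas into the base weights
merged.save_pretrained('output/qwen-7b-chat/vx_xxx/output_ckpt')
```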

swift/utils/llm_utils.py

Lines changed: 1 addition & 0 deletions

@@ -115,6 +115,7 @@ def inference(input_ids: List[int],
               tokenizer,
               streamer: Optional[TextStreamer] = None) -> str:
     generation_config = getattr(model, 'generation_config', None)
+    streamer.skip_prompt = True
     print(f'[INFERENCE]{tokenizer.decode(input_ids)}', end='')
     input_ids = torch.tensor(input_ids)[None].cuda()
     attention_mask = torch.ones_like(input_ids)
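
Note: the added `streamer.skip_prompt = True` stops the streamer from echoing the prompt, which the `print` on the next line already emits. The standard transformers `TextStreamer` exposes the same switch at construction time; a minimal standalone sketch (the model name is illustrative, not what the repo uses):

```python
# Standalone illustration of TextStreamer's skip_prompt behavior;
# 'gpt2' is just a small illustrative model.
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2')
streamer = TextStreamer(tokenizer, skip_prompt=True)  # don't re-print the prompt
inputs = tokenizer('Hello, world', return_tensors='pt')
model.generate(**inputs, streamer=streamer, max_new_tokens=16)  # streams only new tokens
```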
