Commit 4d7aee6

bugfix, fsdp for lora
1 parent 78298c6 commit 4d7aee6

9 files changed, +74 -34 lines changed


mftcoder_accelerate/README.md

Lines changed: 8 additions & 2 deletions
@@ -280,10 +280,16 @@ However, this may slightly slow down the training speed.
 
 #### Q2: Install packages
 Please refer to init_env.sh and requirements.txt
-
+We highly recommend that you install Flash Attention 2 (flash_attn>=2.1.0; we used 2.3.6) first to get memory-efficient and fast training.
 
 #### Q3: How should I specify the GPUs for training?
 You can specify the visible GPUs as below:
 ```bash
-CUDA_VISIBLE_DEVICES=0,1 accelerate launch --config_file accelerate_ds_config.yaml mft_accelerate.py --train_config configs/xxx_train_config.json
+CUDA_VISIBLE_DEVICES=0,1 accelerate launch --config_file accelerate_ds_config.yaml pefts/mft_accelerate.py --train_config configs/xxx_train_config.json
 ```
+
+#### Q4: What is the recommended distributed training setup?
+For LoRA/QLoRA, we recommend DeepSpeed (ZeRO-2) as the underlying framework, because it is easy and stable to use and, moreover, more compatible across different settings.
+FSDP does not support quantization (integer dtypes in training).
+
+For full-parameter fine-tuning, FSDP is usually faster and may help you with very large models by sharding parameters and gradients.
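The new Q4 recommends DeepSpeed ZeRO-2 for LoRA/QLoRA and FSDP for full-parameter training, since FSDP cannot handle int-quantized weights. A minimal launcher-side guard could make the incompatible combination fail fast; this is only a sketch, and the `quantization` key is an assumed config field, not necessarily the repo's exact name:

```python
from accelerate import Accelerator
from accelerate.utils import DistributedType


def check_distributed_setup(train_config: dict) -> Accelerator:
    """Fail fast when the launch mode conflicts with the training mode (sketch)."""
    accelerator = Accelerator()
    quantization = train_config.get("quantization")  # assumed key, e.g. "4bit" for QLoRA
    if accelerator.distributed_type == DistributedType.FSDP and quantization in ("4bit", "8bit"):
        # Mirrors Q4 above: FSDP cannot train int-quantized (QLoRA) models.
        raise ValueError(
            "QLoRA (4/8-bit) training is not supported under FSDP; "
            "launch with DeepSpeed ZeRO-2 instead."
        )
    return accelerator
```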

mftcoder_accelerate/README_cn.md

Lines changed: 30 additions & 10 deletions
@@ -78,7 +78,7 @@
 
 
 ## 3. Model Training
-Currently, full-parameter instruction fine-tuning, QLoRA instruction fine-tuning, and LoRA instruction fine-tuning are supported.
+Currently, full-parameter (Full-parameters) instruction fine-tuning, QLoRA instruction fine-tuning, and LoRA instruction fine-tuning are supported.
 Some excellent pre-trained code model weights are listed below; in principle, any model open-sourced on HuggingFace can be trained with this project:
 
 🤗 [Latest code pre-training SOTA, CodeLlama](https://huggingface.co/codellama/CodeLlama-34b-Python-hf): code-llama-34b, code-llama-34b-python, the new SOTA base model.
@@ -109,18 +109,34 @@ cd mftcoder_accelerate/src
 This approach makes full use of the model's parallel computation, so training is more efficient; it also exploits the left-to-right attention of decoder-only models, letting every target segment of a multi-turn conversation participate in training in a single pass, which makes training more thorough and efficient.
 
 ### 3.2 LoRA/QLoRA Fine-tuning
+
+#### Introduction to LoRA/QLoRA Fine-tuning
 For a detailed introduction to LoRA, see the paper: [LORA: LOW-RANK ADAPTATION OF LARGE LANGUAGE MODELS](https://arxiv.org/pdf/2106.09685.pdf)
 
 For a detailed introduction to QLoRA, see the paper: [QLORA: Efficient Finetuning of Quantized LLMs](https://arxiv.org/pdf/2305.14314.pdf)
 
 QLoRA uses 4-bit NF4 quantization and adds more adapters, greatly reducing GPU memory consumption while approaching the quality of full-parameter fine-tuning.
 The QLoRA paper reports that the method can fine-tune a 33B model on a single V100 with performance close to full-parameter fine-tuning.
 
-Run the following command to start LoRA/QLoRA/full-parameter fine-tuning:
+Run the following command to start LoRA/QLoRA/full-parameter fine-tuning:
+#### Launch via DeepSpeed
+The DeepSpeed configuration is in accelerate_ds_config.yaml.
+```bash
+accelerate launch --config_file accelerate_ds_config.yaml pefts/mft_accelerate.py --train_config configs/xxx_train_config.json --distributed_type "deepspeed"
+```
+Or
+
+Modify and run the following shell script:
 
+The DeepSpeed configuration is passed via the command line inside the script.
+```bash
+sh ds_single_launch.sh
+```
+
+#### Launch via FSDP
 The FSDP configuration is in accelerate_fsdp_config.yaml.
 ```bash
-accelerate launch --config_file accelerate_ds_config.yaml pefts/mft_accelerate.py --train_config configs/xxx_train_config.json
+accelerate launch --config_file accelerate_fsdp_config.yaml pefts/mft_accelerate.py --train_config configs/xxx_train_config.json --distributed_type "fsdp"
 ```
 Or
 
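The QLoRA description above (4-bit NF4 quantization plus adapters) corresponds to the standard transformers + bitsandbytes setup. A minimal sketch of such a load, not the repo's exact code; the model name is just a placeholder:

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# 4-bit NF4 quantization with bf16 compute, as described in the QLoRA paper.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    "codellama/CodeLlama-34b-Python-hf",  # placeholder; any causal LM on HuggingFace
    quantization_config=bnb_config,
)
```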
@@ -226,19 +242,23 @@ print(gen_text)
 #### Q3: How do I train on specific GPUs?
 You can restrict training to GPUs 0 and 1 as follows:
 ```bash
-CUDA_VISIBLE_DEVICES=0,1 accelerate launch --config_file pefts/accelerate_ds_config.yaml mft_accelerate.py --train_config configs/xxx_train_config.json
+CUDA_VISIBLE_DEVICES=0,1 accelerate launch --config_file pefts/accelerate_ds_config.yaml pefts/mft_accelerate.py --train_config configs/xxx_train_config.json --distributed_type "deepspeed"
 ```
 
-#### Q4: How can I train if Flash Attention 2 cannot be installed?
-Setting the "attn_implementation" parameter to "eager" uses naive attention.
+#### Q4: How should Flash Attention be configured for training?
+First, we strongly recommend installing Flash Attention 2 (FA2) (>=2.1.0; 2.3.6 is more feature-complete).
+
+Setting the training parameter "attn_implementation" to "eager" uses naive attention, i.e. attention without acceleration.
 
-If you can set up your own environment and use torch>=2.1.1, you can try setting "attn_implementation" to "sdpa". This will try to use the transformers-compatible torch.nn.functional.scaled_dot_product_attention. Model coverage is incomplete.
+Setting the training parameter "attn_implementation" to "flash_attention_2" uses FA2, which is fast and saves GPU memory.
 
-#### Q5: In FSDP mode, what should I pay attention to when using LoRA + Flash Attention?
-In FSDP mode, because dtypes must be unified, FA requires adding query, key, and value to target_modules together; accommodating this does not affect the final result.
+If you can set up your own environment and use torch>=2.1.1, you can try setting "attn_implementation" to "sdpa". This will try to use the transformers-compatible torch.nn.functional.scaled_dot_product_attention. Model coverage is still incomplete.
 
-In FSDP mode, QLoRA is not supported, because support for int dtypes is not yet complete.
+#### Q5: Which distributed framework is recommended?
+For LoRA/QLoRA, we recommend DeepSpeed as the underlying distributed framework: it is easy to use, compatible with many settings, and fast.
+FSDP does not support QLoRA, because bitsandbytes does not yet support FSDP.
 
+For full-parameter fine-tuning, we recommend FSDP, because it can exploit full sharding of parameters during full training and achieve higher training speed.
 
 #### Q6: What are the differences among the currently supported models?
 Chinese models such as chatglm2, chatglm3, baichuan2, qwen, and aquila2 use the modeling_xxx.py released together with the model.
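The rewritten Q4 lists three attention backends; in transformers >= 4.36 they correspond to the `attn_implementation` argument of `from_pretrained`. A minimal sketch with a placeholder model name:

```python
import torch
from transformers import AutoModelForCausalLM

# "flash_attention_2" requires flash_attn >= 2.1.0; "sdpa" requires torch >= 2.1.1;
# "eager" is the plain, unaccelerated attention implementation.
model = AutoModelForCausalLM.from_pretrained(
    "codellama/CodeLlama-34b-Python-hf",  # placeholder
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
```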

mftcoder_accelerate/src/ds_single_launch.sh

Lines changed: 6 additions & 1 deletion
@@ -1,3 +1,8 @@
+#!/bin/sh
+# Author: Chaoyu Chen
+# Last Modified: 2024/12/11
+# Description: An alternative (command line) way to launch DeepSpeed training
+
 # Launch script on single node
 N_GPU_PER_NODE=8
 
@@ -26,5 +31,5 @@ accelerate launch \
     --machine_rank 0 \
     --rdzv_backend 'static' \
     pefts/mft_accelerate.py --train_config configs/"xxx_train_config.json" \
-    --distributed_type "DeepSpeed" \
+    --distributed_type "deepspeed" \
     > MFTCoder-training-"$TODAY".log 2>&1 &

mftcoder_accelerate/src/fsdp_single_launch.sh

Lines changed: 9 additions & 3 deletions
@@ -1,3 +1,8 @@
+#!/bin/sh
+# Author: Chaoyu Chen
+# Last Modified: 2024/12/11
+# Description: An alternative (command line) way to launch FSDP training
+
 # Launch script on single node
 N_GPU_PER_NODE=8
 
@@ -7,10 +12,11 @@ export TOKENIZERS_PARALLELISM=False
 
 TODAY=$(date +%Y-%m%d-%H%M)
 
-ccelerate launch \
+# accelerate launch --config_file accelerate_fsdp_config.yaml \
+accelerate launch \
     --use_fsdp \
     --num_machines=1 \
-    --num_processes=2 \
+    --num_processes=$N_GPU_PER_NODE \
     --fsdp_sharding_strategy=1 \
     --fsdp_auto_wrap_policy=TRANSFORMER_BASED_WRAP \
     --fsdp_state_dict_type=FULL_STATE_DICT \
@@ -24,6 +30,6 @@ ccelerate launch \
     --machine_rank=0 \
     --rdzv_backend=static \
     pefts/mft_accelerate.py --train_config configs/"xxx_train_config.json" \
-    --distributed_type "FSDP" \
+    --distributed_type "fsdp" \
     > MFTCoder-training-"$TODAY".log 2>&1 &
 

mftcoder_accelerate/src/pefts/merge_base_and_lora_to_hf.py

Lines changed: 2 additions & 1 deletion
@@ -1,7 +1,8 @@
 """
 # @author Chaoyu Chen
 # @date 2023/10/19
-Merge base and adaptor
+
+Merge base and lora adaptor
 """
 import os
 import sys
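The updated docstring describes merging a LoRA adapter back into its base model. With peft this is typically done via `merge_and_unload`; the sketch below uses placeholder paths and is not necessarily the exact logic of this script:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained("path/to/base_model", torch_dtype=torch.bfloat16)
model = PeftModel.from_pretrained(base, "path/to/lora_adapter")

merged = model.merge_and_unload()  # fold the LoRA weights into the base weights
merged.save_pretrained("path/to/merged_hf_model")
AutoTokenizer.from_pretrained("path/to/base_model").save_pretrained("path/to/merged_hf_model")
```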

mftcoder_accelerate/src/pefts/mft_accelerate.py

Lines changed: 3 additions & 4 deletions
@@ -1,5 +1,5 @@
 """
-# @author qumu
+# @author Chaoyu Chen
 # @date 2023/12/11
 # @module mft_accelerate.py
 
@@ -374,8 +374,7 @@ def main():
         # args.saving_limit = None
     else:
         model.gradient_checkpointing_enable()
-        assert (args.saving_limit is not None and isinstance(args.saving_limit,
-                int)), "saving_limit must be a integer in Full Training"
+        assert (args.saving_limit is not None and isinstance(args.saving_limit, int)), "saving_limit must be an integer in Full Training"
 
     # Potentially load in the lora from a previous save
     if args.peft_type:
@@ -412,7 +411,7 @@ def main():
         adam_optimizer = Adam
     elif accelerator.distributed_type == DistributedType.FSDP:
         accelerator.print("DISTRIBUTED TRAINING USING FSDP")
-        if getattr(accelerator.state, "fsdp_plugin", None) is not None:
+        if args.peft_type and getattr(accelerator.state, "fsdp_plugin", None) is not None:
             from peft.utils.other import fsdp_auto_wrap_policy
             accelerator.state.fsdp_plugin.auto_wrap_policy = fsdp_auto_wrap_policy(model)
         model = accelerator.prepare(model)
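The last hunk above is the core of the bugfix: peft's `fsdp_auto_wrap_policy` is now applied only when a PEFT method (e.g. LoRA) is active. A sketch of the fixed branch, factored into a standalone helper for illustration (the helper itself is not part of the repo):

```python
from accelerate import Accelerator
from accelerate.utils import DistributedType
from peft.utils.other import fsdp_auto_wrap_policy


def prepare_model_for_fsdp(accelerator: Accelerator, model, peft_type):
    """Apply peft's auto-wrap policy only for LoRA/PEFT runs, then prepare the model."""
    if accelerator.distributed_type == DistributedType.FSDP:
        if peft_type and getattr(accelerator.state, "fsdp_plugin", None) is not None:
            # peft's policy wraps the trainable adapter layers separately from the frozen
            # base layers, so each flattened FSDP unit has a uniform requires_grad flag.
            accelerator.state.fsdp_plugin.auto_wrap_policy = fsdp_auto_wrap_policy(model)
        model = accelerator.prepare(model)
    return model
```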

mftcoder_accelerate/src/pefts/model_mapping.py

Lines changed: 12 additions & 11 deletions
@@ -1,14 +1,15 @@
 """
 # @author Chaoyu Chen
-# @date 2023/10/11
+# @date 2023/12/11
+
 Manage supported models and their special token used in training.
 Default targeting modules for LoRA/QLora
 4.36 is stable now
 """
 # Models that Transformers support FA2
 from transformers import (
     AutoConfig,
-    AutoTokenizer,
+    AutoTokenizer,
     AutoModelForCausalLM,
     GPTNeoXForCausalLM,
     GPTBigCodeForCausalLM,
@@ -24,6 +25,7 @@
 from model.qwen.modeling_qwen import QWenLMHeadModel
 from model.chatglm2.modeling_chatglm import ChatGLMForConditionalGeneration as ChatGLMForConditionalGeneration2
 from model.chatglm3.modeling_chatglm import ChatGLMForConditionalGeneration as ChatGLMForConditionalGeneration3
+
 # from model.phi.modeling_mixformer_sequential import MixFormerSequentialForCausalLM
 
 MODEL_TYPES = {
@@ -43,22 +45,21 @@
 }
 
 FULL_LORA_TARGETING_MODULES = {
-    "aquila": ["q_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj"],
+    "aquila": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj"],
     "baichuan": ["W_pack", "o_proj", "gate_proj", "down_proj", "up_proj"],
     "chatglm2": ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],
     "chatglm3": ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],
-    "deepseek": ["q_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj"],
-    "code_llama": ["q_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj"],
+    "deepseek": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj"],
+    "code_llama": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj"],
     "gpt_neox": ["query_key_value", 'dense', 'dense_h_to_4h', 'dense_4h_to_h'],
-    "llama": ["q_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj"],
+    "llama": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj"],
     "mistral": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj"],
     "mixtral": ["q_proj", "k_proj", "v_proj", "o_proj"],
     "phi": ["query_key_value", 'dense', 'fc1', 'fc2'],
     "qwen": ["c_proj", "c_attn", "w1", "w2"],
-    "starcoder": ["c_proj", "c_attn", "q_attn", "c_fc"],
+    "starcoder": ["c_proj", "c_attn", "q_attn", "c_fc"],
 }
 
-
 MODEL_SPECIAL_TOKENS = {
     "gpt_neox": {
 
@@ -94,7 +95,7 @@
 
         "eos_token": "<|endoftext|>",
         "pad_token": "<|extra_1|>",
-
+
     },
     "chatglm2": {
 
@@ -125,7 +126,7 @@
         "pad_token": "<|end▁of▁sentence|>",
 
     },
-    "mixtral": {
+    "mixtral": {
 
         "eos_token": "</s>",
         "pad_token": "<unk>",
@@ -137,4 +138,4 @@
         "pad_token": "<unk>",
 
     },
-}
+}
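The `FULL_LORA_TARGETING_MODULES` change adds `k_proj` to the llama-family defaults; these lists are meant to be passed as `target_modules` to peft's `LoraConfig`. A minimal sketch with hypothetical rank/alpha values:

```python
from peft import LoraConfig, get_peft_model

llama_targets = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj"]

lora_config = LoraConfig(
    r=64,              # hypothetical rank
    lora_alpha=32,     # hypothetical scaling factor
    lora_dropout=0.05,
    target_modules=llama_targets,
    task_type="CAUSAL_LM",
)
# model = get_peft_model(base_model, lora_config)  # base_model loaded as in the sketches above
```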

mftcoder_accelerate/src/tokenizer/chat_template.py

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
-# @author qumu
+# @author Chaoyu Chen
 # @date 2023/12/25
-# @module chat_template
+
 # store possible chat_template for tokenizers to prepare input string
 # -------------------------------------------------- Import ------------------------------------------------------------
 from transformers import (
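chat_template.py stores possible chat templates that tokenizers use to build input strings. In transformers, such a template is consumed through `tokenizer.apply_chat_template`; the template string below is purely illustrative and not one of the repo's templates:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-34b-Python-hf")  # placeholder
# Illustrative Jinja template only; the real templates live in tokenizer/chat_template.py.
tokenizer.chat_template = (
    "{% for message in messages %}"
    "<|{{ message['role'] }}|> {{ message['content'] }}\n"
    "{% endfor %}"
    "{% if add_generation_prompt %}<|assistant|> {% endif %}"
)

messages = [{"role": "user", "content": "Write a function that reverses a string."}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
```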

mftcoder_accelerate/src/tokenizer/tokenizer.py

Lines changed: 2 additions & 0 deletions
@@ -1,6 +1,8 @@
 """
 # @author Chaoyu Chen
 # @date 2023/6/19
+
+Build tokenizer
 """
 
 
0 commit comments