diff --git a/examples/sft/README.md b/examples/sft/README.md
index 775a7da076..abbf957d9b 100644
--- a/examples/sft/README.md
+++ b/examples/sft/README.md
@@ -5,7 +5,7 @@ In this example, we'll see how to use [PEFT](https://github.com/huggingface/peft
 
 QLoRA uses 4-bit quantization of the base model to drastically reduce the GPU memory consumed by the base model while using LoRA for parameter-efficient fine-tuning. The command to use QLoRA is present at [run_peft.sh](https://github.com/huggingface/peft/blob/main/examples/sft/run_peft.sh).
 Note:
-1. At present, `use_reentrant` needs to be `True` when using gradient checkpointing with QLoRA else QLoRA leads to high GPU memory consumption.
+1. At present, `use_reentrant` needs to be `True` when using gradient checkpointing with QLoRA, or else QLoRA leads to high GPU memory consumption.
 
 ## Single GPU SFT with QLoRA using Unsloth
 
@@ -29,6 +29,8 @@ When you have access to multiple GPUs, it would be better to use normal LoRA wit
 
 ## Multi-GPU SFT with LoRA and FSDP
 When you have access to multiple GPUs, it would be better to use normal LoRA with DeepSpeed/FSDP. To use LoRA with FSDP, refer to the docs at [PEFT with FSDP](https://huggingface.co/docs/peft/accelerate/fsdp).
+Note: FSDP is currently not compatible with 8bit bitsandbytes quantization.
+
 ## Multi-GPU SFT with LoRA and FSDP for GPTQModel:
 As in [Multi-GPU SFT with LoRA and FSDP](https://github.com/huggingface/peft/blob/main/examples/sft/README.md#multi-gpu-sft-with-lora-and-fsdp), we also support other quantization methods like GPTQModel. You may need to install [GPTQModel](https://github.com/ModelCloud/GPTQModel) > v3.0.0 or from source. Here is the launch command for reference: [run_peft_fsdp_gptq.sh]. For the `--model_name_or_path` argument, it is important to pass a model that is already quantized with GPTQModel, like `"hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"`.
 
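The `use_reentrant` note above is easy to get wrong when wiring up a training script. Here is a minimal sketch, assuming TRL's `SFTConfig` (the `TrainingArguments` subclass this example parses into); the `output_dir` value is hypothetical:

```python
# Minimal sketch, assuming TRL's SFTConfig; output_dir is a hypothetical value.
from trl import SFTConfig

training_args = SFTConfig(
    output_dir="llama-sft-qlora",  # hypothetical
    gradient_checkpointing=True,
    # Per the README note: with QLoRA, reentrant checkpointing must be enabled,
    # or else GPU memory consumption stays high.
    gradient_checkpointing_kwargs={"use_reentrant": True},
)
```
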
diff --git a/examples/sft/run_peft.sh b/examples/sft/run_peft.sh
index 8aa48648d3..86d6ca2967 100644
--- a/examples/sft/run_peft.sh
+++ b/examples/sft/run_peft.sh
@@ -6,7 +6,7 @@ python train.py \
 --add_special_tokens False \
 --append_concat_token False \
 --splits "train,test" \
---max_seq_len 2048 \
+--max_length 2048 \
 --num_train_epochs 1 \
 --logging_steps 5 \
 --log_level "info" \
diff --git a/examples/sft/run_peft_deepspeed.sh b/examples/sft/run_peft_deepspeed.sh
index 95dbf08892..867ff85f78 100644
--- a/examples/sft/run_peft_deepspeed.sh
+++ b/examples/sft/run_peft_deepspeed.sh
@@ -6,7 +6,7 @@ accelerate launch --config_file "configs/deepspeed_config.yaml" train.py \
 --add_special_tokens False \
 --append_concat_token False \
 --splits "train,test" \
---max_seq_len 2048 \
+--max_length 2048 \
 --num_train_epochs 1 \
 --logging_steps 5 \
 --log_level "info" \
@@ -36,4 +36,4 @@ accelerate launch --config_file "configs/deepspeed_config.yaml" train.py \
 --lora_alpha 16 \
 --lora_dropout 0.1 \
 --lora_target_modules "all-linear" \
---use_4bit_quantization False
\ No newline at end of file
+--use_4bit_quantization False
diff --git a/examples/sft/run_peft_fsdp.sh b/examples/sft/run_peft_fsdp.sh
index 63dd475f44..797764cdcd 100644
--- a/examples/sft/run_peft_fsdp.sh
+++ b/examples/sft/run_peft_fsdp.sh
@@ -6,7 +6,7 @@ accelerate launch --config_file "configs/fsdp_config.yaml" train.py \
 --add_special_tokens False \
 --append_concat_token False \
 --splits "train,test" \
---max_seq_len 2048 \
+--max_length 2048 \
 --num_train_epochs 1 \
 --logging_steps 5 \
 --log_level "info" \
diff --git a/examples/sft/run_peft_fsdp_gptq.sh b/examples/sft/run_peft_fsdp_gptq.sh
index 479a7eac83..d6d569c0ec 100644
--- a/examples/sft/run_peft_fsdp_gptq.sh
+++ b/examples/sft/run_peft_fsdp_gptq.sh
@@ -6,7 +6,7 @@ accelerate launch --config_file "configs/fsdp_config.yaml" train.py \
 --add_special_tokens False \
 --append_concat_token False \
 --splits "train,test" \
---max_seq_len 2048 \
+--max_length 2048 \
 --num_train_epochs 1 \
 --logging_steps 5 \
 --log_level "info" \
diff --git a/examples/sft/run_peft_multigpu.sh b/examples/sft/run_peft_multigpu.sh
index dbd108d0e0..d6d0ae7d08 100644
--- a/examples/sft/run_peft_multigpu.sh
+++ b/examples/sft/run_peft_multigpu.sh
@@ -6,7 +6,7 @@ torchrun --nproc_per_node 8 --nnodes 1 train.py \
 --add_special_tokens False \
 --append_concat_token False \
 --splits "train,test" \
---max_seq_len 2048 \
+--max_length 2048 \
 --num_train_epochs 1 \
 --logging_steps 5 \
 --log_level "info" \
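The `--max_seq_len` → `--max_length` renames across these launch scripts track the current TRL `SFTConfig` field name, which the flag is parsed into. A minimal sketch of the equivalent in-code configuration, assuming a recent TRL where `SFTConfig.max_length` caps the tokenized sequence length (`output_dir` is hypothetical):

```python
# Sketch assuming a recent TRL release; output_dir is a hypothetical value.
from trl import SFTConfig

training_args = SFTConfig(
    output_dir="llama-sft",  # hypothetical
    max_length=2048,         # replaces the scripts' older --max_seq_len setting
)
```
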
diff --git a/examples/sft/run_peft_qlora_deepspeed_stage3.sh b/examples/sft/run_peft_qlora_deepspeed_stage3.sh
index 4bbc1bbcc4..a27980203e 100644
--- a/examples/sft/run_peft_qlora_deepspeed_stage3.sh
+++ b/examples/sft/run_peft_qlora_deepspeed_stage3.sh
@@ -6,7 +6,7 @@ accelerate launch --config_file "configs/deepspeed_config_z3_qlora.yaml" train.
 --add_special_tokens False \
 --append_concat_token False \
 --splits "train,test" \
---max_seq_len 2048 \
+--max_length 2048 \
 --num_train_epochs 1 \
 --logging_steps 5 \
 --log_level "info" \
@@ -39,4 +39,4 @@ accelerate launch --config_file "configs/deepspeed_config_z3_qlora.yaml" train.
 --use_4bit_quantization True \
 --use_nested_quant True \
 --bnb_4bit_compute_dtype "bfloat16" \
---bnb_4bit_quant_storage_dtype "bfloat16"
\ No newline at end of file
+--bnb_4bit_quant_storage_dtype "bfloat16"
diff --git a/examples/sft/run_peft_qlora_fsdp.sh b/examples/sft/run_peft_qlora_fsdp.sh
index 4ed3218c82..ad04a71afd 100644
--- a/examples/sft/run_peft_qlora_fsdp.sh
+++ b/examples/sft/run_peft_qlora_fsdp.sh
@@ -6,7 +6,7 @@ accelerate launch --config_file "configs/fsdp_config_qlora.yaml" train.py \
 --add_special_tokens False \
 --append_concat_token False \
 --splits "train,test" \
---max_seq_len 2048 \
+--max_length 2048 \
 --num_train_epochs 1 \
 --logging_steps 5 \
 --log_level "info" \
diff --git a/examples/sft/run_unsloth_peft.sh b/examples/sft/run_unsloth_peft.sh
index 97a4a6b520..941ed337fb 100644
--- a/examples/sft/run_unsloth_peft.sh
+++ b/examples/sft/run_unsloth_peft.sh
@@ -6,7 +6,7 @@ python train.py \
 --add_special_tokens False \
 --append_concat_token False \
 --splits "train,test" \
---max_seq_len 2048 \
+--max_length 2048 \
 --num_train_epochs 1 \
 --logging_steps 5 \
 --log_level "info" \
diff --git a/examples/sft/train.py b/examples/sft/train.py
index 5a34f69357..921ae99104 100644
--- a/examples/sft/train.py
+++ b/examples/sft/train.py
@@ -18,10 +18,6 @@ class ModelArguments:
     model_name_or_path: str = field(
         metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
     )
-    max_seq_length: Optional[int] = field(
-        default=512,
-        metadata={"help": "The maximum total input sequence length after tokenization."},
-    )
     chat_template_format: Optional[str] = field(
         default="none",
         metadata={
@@ -156,4 +152,5 @@ def main(model_args, data_args, training_args):
         model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
     else:
         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    model_args.max_length = training_args.max_length
     main(model_args, data_args, training_args)
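The train.py change keeps the sequence length in one place: the duplicated `ModelArguments.max_seq_length` field is dropped, and the parsed `SFTConfig.max_length` is copied onto `model_args` for helpers that read it from there. A trimmed sketch of that pattern (the dataclass is reduced to one field and its default value is hypothetical):

```python
# Trimmed sketch of the parse-then-propagate pattern from train.py above.
from dataclasses import dataclass, field

from transformers import HfArgumentParser
from trl import SFTConfig


@dataclass
class ModelArguments:
    # Reduced to one field for the sketch; the default is hypothetical.
    model_name_or_path: str = field(default="meta-llama/Meta-Llama-3-8B")


parser = HfArgumentParser((ModelArguments, SFTConfig))
model_args, training_args = parser.parse_args_into_dataclasses()
# Mirror the diff: the length lives on SFTConfig and is copied over exactly once.
model_args.max_length = training_args.max_length
```
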
diff --git a/examples/sft/utils.py b/examples/sft/utils.py
index 0e24796de4..2fcb8dabd4 100644
--- a/examples/sft/utils.py
+++ b/examples/sft/utils.py
@@ -97,7 +97,9 @@ def create_and_prepare_model(args, data_args, training_args):
     ):
         raise NotImplementedError("Unsloth is not supported in distributed training")
 
-    if args.use_4bit_quantization:
+    if args.use_4bit_quantization and args.use_8bit_quantization:
+        raise ValueError("You configured 4bit and 8bit quantization at the same time, please choose only one of them.")
+    elif args.use_4bit_quantization:
         compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)
         quant_storage_dtype = getattr(torch, args.bnb_4bit_quant_storage_dtype)
 
@@ -115,8 +117,8 @@ def create_and_prepare_model(args, data_args, training_args):
                 print("=" * 80)
                 print("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
                 print("=" * 80)
-        elif args.use_8bit_quantization:
-            bnb_config = BitsAndBytesConfig(load_in_8bit=args.use_8bit_quantization)
+    elif args.use_8bit_quantization:
+        bnb_config = BitsAndBytesConfig(load_in_8bit=args.use_8bit_quantization)
 
     if args.use_unsloth:
         if torch.xpu.is_available():
@@ -124,7 +126,7 @@ def create_and_prepare_model(args, data_args, training_args):
         # Load model
         model, _ = FastLanguageModel.from_pretrained(
             model_name=args.model_name_or_path,
-            max_seq_length=training_args.max_seq_length,
+            max_seq_length=training_args.max_length,
             dtype=None,
             load_in_4bit=args.use_4bit_quantization,
         )
@@ -211,7 +213,7 @@ def create_and_prepare_model(args, data_args, training_args):
             else args.lora_target_modules,
             use_gradient_checkpointing=training_args.gradient_checkpointing,
             random_state=training_args.seed,
-            max_seq_length=training_args.max_seq_length,
+            max_seq_length=training_args.max_length,
         )
 
     return model, peft_config, tokenizer
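The new guard in utils.py makes the two bitsandbytes modes mutually exclusive instead of silently preferring 4bit, and the dedented `elif` means the 8bit branch is now reachable from the top-level check rather than being nested inside the 4bit branch. A standalone sketch of the same decision logic (the helper name and argument plumbing are hypothetical; the `BitsAndBytesConfig` fields mirror what utils.py passes):

```python
# Hypothetical helper mirroring the validation utils.py now performs.
import torch
from transformers import BitsAndBytesConfig


def build_bnb_config(use_4bit: bool, use_8bit: bool):
    if use_4bit and use_8bit:
        raise ValueError(
            "You configured 4bit and 8bit quantization at the same time, "
            "please choose only one of them."
        )
    if use_4bit:
        return BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_quant_storage=torch.bfloat16,
        )
    if use_8bit:
        # Per the README note, 8bit bitsandbytes is currently not compatible with FSDP.
        return BitsAndBytesConfig(load_in_8bit=True)
    return None  # full-precision load, no quantization config
```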