4 changes: 3 additions & 1 deletion examples/sft/README.md
@@ -5,7 +5,7 @@ In this example, we'll see how to use [PEFT](https://github.com/huggingface/peft
QLoRA uses 4-bit quantization of the base model to drastically reduce the GPU memory consumed by the base model while using LoRA for parameter-efficient fine-tuning. The command to use QLoRA is present at [run_peft.sh](https://github.com/huggingface/peft/blob/main/examples/sft/run_peft.sh).

Note:
1. At present, `use_reentrant` needs to be `True` when using gradient checkpointing with QLoRA else QLoRA leads to high GPU memory consumption.
1. At present, `use_reentrant` needs to be `True` when using gradient checkpointing with QLoRA; otherwise, QLoRA leads to high GPU memory consumption.


## Single GPU SFT with QLoRA using Unsloth
@@ -29,6 +29,8 @@ When you have access to multiple GPUs, it would be better to use normal LoRA wit
## Multi-GPU SFT with LoRA and FSDP
When you have access to multiple GPUs, it would be better to use normal LoRA with DeepSpeed/FSDP. To use LoRA with FSDP, refer to the docs at [PEFT with FSDP](https://huggingface.co/docs/peft/accelerate/fsdp).

Note: FSDP is currently not compatible with 8-bit bitsandbytes quantization.


## Multi-GPU SFT with LoRA and FSDP for GPTQModel
As in [Multi-GPU SFT with LoRA and FSDP](https://github.com/huggingface/peft/blob/main/examples/sft/README.md#multi-gpu-sft-with-lora-and-fsdp), we also support other quantization methods like GPTQModel. You may need to install [GPTQModel](https://github.com/ModelCloud/GPTQModel) > v3.0.0 or from source. Here is the launch command for reference: [run_peft_fsdp_gptq.sh](https://github.com/huggingface/peft/blob/main/examples/sft/run_peft_fsdp_gptq.sh). For the `--model_name_or_path` argument, it is important to pass a model that is already quantized with GPTQModel, such as `"hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"`.
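The `use_reentrant` note in the README change above maps onto the gradient-checkpointing options exposed by `transformers` and PEFT. Below is a minimal sketch of that setup, not taken from this PR: the model id and LoRA settings are illustrative, and it assumes the installed PEFT version's `prepare_model_for_kbit_training` accepts `gradient_checkpointing_kwargs`.

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# 4-bit (QLoRA) base model.
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",  # illustrative checkpoint
    quantization_config=bnb_config,
)

# Per the note above: force the reentrant checkpointing implementation,
# otherwise QLoRA can consume much more GPU memory.
model = prepare_model_for_kbit_training(
    model,
    use_gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": True},
)
model = get_peft_model(model, LoraConfig(task_type="CAUSAL_LM"))
model.print_trainable_parameters()
```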
2 changes: 1 addition & 1 deletion examples/sft/run_peft.sh
@@ -6,7 +6,7 @@ python train.py \
--add_special_tokens False \
--append_concat_token False \
--splits "train,test" \
--max_seq_len 2048 \
--max_length 2048 \
--num_train_epochs 1 \
--logging_steps 5 \
--log_level "info" \
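The `--max_seq_len` to `--max_length` rename in these scripts follows the corresponding field rename in TRL's `SFTConfig`. A minimal sketch of where the value ends up, assuming a TRL version whose `SFTConfig` exposes `max_length` (older releases used `max_seq_length`); the other values are just the ones the script passes:

```python
from trl import SFTConfig

# The shell flag --max_length 2048 ultimately populates this field.
training_args = SFTConfig(
    output_dir="llama-sft-lora",   # illustrative
    max_length=2048,               # was max_seq_length in older trl versions
    num_train_epochs=1,
    logging_steps=5,
)
```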
4 changes: 2 additions & 2 deletions examples/sft/run_peft_deepspeed.sh
@@ -6,7 +6,7 @@ accelerate launch --config_file "configs/deepspeed_config.yaml" train.py \
--add_special_tokens False \
--append_concat_token False \
--splits "train,test" \
--max_seq_len 2048 \
--max_length 2048 \
--num_train_epochs 1 \
--logging_steps 5 \
--log_level "info" \
@@ -36,4 +36,4 @@ accelerate launch --config_file "configs/deepspeed_config.yaml" train.py \
--lora_alpha 16 \
--lora_dropout 0.1 \
--lora_target_modules "all-linear" \
--use_4bit_quantization False
--use_4bit_quantization False
2 changes: 1 addition & 1 deletion examples/sft/run_peft_fsdp.sh
@@ -6,7 +6,7 @@ accelerate launch --config_file "configs/fsdp_config.yaml" train.py \
--add_special_tokens False \
--append_concat_token False \
--splits "train,test" \
--max_seq_len 2048 \
--max_length 2048 \
--num_train_epochs 1 \
--logging_steps 5 \
--log_level "info" \
2 changes: 1 addition & 1 deletion examples/sft/run_peft_fsdp_gptq.sh
@@ -6,7 +6,7 @@ accelerate launch --config_file "configs/fsdp_config.yaml" train.py \
--add_special_tokens False \
--append_concat_token False \
--splits "train,test" \
--max_seq_len 2048 \
--max_length 2048 \
--num_train_epochs 1 \
--logging_steps 5 \
--log_level "info" \
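For the GPTQModel workflow, the base checkpoint is already quantized, so it is loaded directly and then wrapped with LoRA; the FSDP launch itself comes from the accelerate config referenced in this script. A minimal single-process sketch, assuming a GPTQ backend such as GPTQModel is installed; the LoRA rank is illustrative, while the other LoRA values mirror the flags visible in these scripts:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

# Already GPTQ-quantized checkpoint, as required for --model_name_or_path.
model_id = "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

lora_config = LoraConfig(
    r=8,                          # illustrative rank
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules="all-linear",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
```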
2 changes: 1 addition & 1 deletion examples/sft/run_peft_multigpu.sh
@@ -6,7 +6,7 @@ torchrun --nproc_per_node 8 --nnodes 1 train.py \
--add_special_tokens False \
--append_concat_token False \
--splits "train,test" \
--max_seq_len 2048 \
--max_length 2048 \
--num_train_epochs 1 \
--logging_steps 5 \
--log_level "info" \
4 changes: 2 additions & 2 deletions examples/sft/run_peft_qlora_deepspeed_stage3.sh
@@ -6,7 +6,7 @@ accelerate launch --config_file "configs/deepspeed_config_z3_qlora.yaml" train.
--add_special_tokens False \
--append_concat_token False \
--splits "train,test" \
--max_seq_len 2048 \
--max_length 2048 \
--num_train_epochs 1 \
--logging_steps 5 \
--log_level "info" \
@@ -39,4 +39,4 @@ accelerate launch --config_file "configs/deepspeed_config_z3_qlora.yaml" train.
--use_4bit_quantization True \
--use_nested_quant True \
--bnb_4bit_compute_dtype "bfloat16" \
--bnb_4bit_quant_storage_dtype "bfloat16"
--bnb_4bit_quant_storage_dtype "bfloat16"
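The last two flags in this script control how the 4-bit weights are computed and stored; keeping the quant storage dtype equal to the compute dtype is what lets DeepSpeed ZeRO-3 (and FSDP) shard the quantized parameters. A hedged sketch of the `BitsAndBytesConfig` these flags correspond to; the `nf4` quant type is assumed rather than shown in the script:

```python
import torch
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                        # --use_4bit_quantization True
    bnb_4bit_use_double_quant=True,           # --use_nested_quant True
    bnb_4bit_quant_type="nf4",                # assumed default
    bnb_4bit_compute_dtype=torch.bfloat16,    # --bnb_4bit_compute_dtype "bfloat16"
    bnb_4bit_quant_storage=torch.bfloat16,    # --bnb_4bit_quant_storage_dtype "bfloat16"
)
```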
2 changes: 1 addition & 1 deletion examples/sft/run_peft_qlora_fsdp.sh
@@ -6,7 +6,7 @@ accelerate launch --config_file "configs/fsdp_config_qlora.yaml" train.py \
--add_special_tokens False \
--append_concat_token False \
--splits "train,test" \
--max_seq_len 2048 \
--max_length 2048 \
--num_train_epochs 1 \
--logging_steps 5 \
--log_level "info" \
2 changes: 1 addition & 1 deletion examples/sft/run_unsloth_peft.sh
@@ -6,7 +6,7 @@ python train.py \
--add_special_tokens False \
--append_concat_token False \
--splits "train,test" \
--max_seq_len 2048 \
--max_length 2048 \
--num_train_epochs 1 \
--logging_steps 5 \
--log_level "info" \
5 changes: 1 addition & 4 deletions examples/sft/train.py
@@ -18,10 +18,6 @@ class ModelArguments:
model_name_or_path: str = field(
metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
)
max_seq_length: Optional[int] = field(
default=512,
metadata={"help": "The maximum total input sequence length after tokenization."},
)
chat_template_format: Optional[str] = field(
default="none",
metadata={
@@ -156,4 +152,5 @@ def main(model_args, data_args, training_args):
model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
else:
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
model_args.max_length = training_args.max_length
main(model_args, data_args, training_args)
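With `max_seq_length` removed from `ModelArguments`, the sequence length now lives only on the training config, and the added assignment copies it onto `model_args` so helpers that only receive `model_args` keep working. A small sketch of the pattern with simplified dataclasses; the defaults and dictionary values are illustrative, and it assumes `SFTConfig` exposes `max_length`:

```python
from dataclasses import dataclass, field

from transformers import HfArgumentParser
from trl import SFTConfig


@dataclass
class ModelArguments:
    # max_seq_length no longer lives here; it comes from SFTConfig instead.
    model_name_or_path: str = field(default="gpt2")  # illustrative default


parser = HfArgumentParser((ModelArguments, SFTConfig))
model_args, training_args = parser.parse_dict(
    {"model_name_or_path": "gpt2", "output_dir": "out", "max_length": 2048}
)

# Mirror the value onto model_args for downstream helpers such as
# create_and_prepare_model in utils.py.
model_args.max_length = training_args.max_length
```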
12 changes: 7 additions & 5 deletions examples/sft/utils.py
@@ -97,7 +97,9 @@ def create_and_prepare_model(args, data_args, training_args):
):
raise NotImplementedError("Unsloth is not supported in distributed training")

if args.use_4bit_quantization:
if args.use_4bit_quantization and args.use_8bit_quantization:
raise ValueError("You configured 4-bit and 8-bit quantization at the same time; please choose only one of them.")
elif args.use_4bit_quantization:
compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)
quant_storage_dtype = getattr(torch, args.bnb_4bit_quant_storage_dtype)

@@ -115,16 +117,16 @@
print("=" * 80)
print("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
print("=" * 80)
elif args.use_8bit_quantization:
bnb_config = BitsAndBytesConfig(load_in_8bit=args.use_8bit_quantization)
elif args.use_8bit_quantization:
bnb_config = BitsAndBytesConfig(load_in_8bit=args.use_8bit_quantization)

if args.use_unsloth:
if torch.xpu.is_available():
raise NotImplementedError("XPU hasn't supported unsloth yet")
# Load model
model, _ = FastLanguageModel.from_pretrained(
model_name=args.model_name_or_path,
max_seq_length=training_args.max_seq_length,
max_seq_length=training_args.max_length,
dtype=None,
load_in_4bit=args.use_4bit_quantization,
)
@@ -211,7 +213,7 @@
else args.lora_target_modules,
use_gradient_checkpointing=training_args.gradient_checkpointing,
random_state=training_args.seed,
max_seq_length=training_args.max_seq_length,
max_seq_length=training_args.max_length,
)

return model, peft_config, tokenizer
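The new guard simply makes the two bitsandbytes modes mutually exclusive before either config is built. A condensed sketch of the resulting branch, using the same argument names as the example's `ModelArguments`; the exact keyword mapping to `BitsAndBytesConfig` is an assumption:

```python
import torch
from transformers import BitsAndBytesConfig


def build_bnb_config(args):
    """Condensed version of the quantization branch in create_and_prepare_model."""
    if args.use_4bit_quantization and args.use_8bit_quantization:
        raise ValueError(
            "You configured 4-bit and 8-bit quantization at the same time; "
            "please choose only one of them."
        )
    elif args.use_4bit_quantization:
        return BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=getattr(torch, args.bnb_4bit_compute_dtype),
            bnb_4bit_quant_storage=getattr(torch, args.bnb_4bit_quant_storage_dtype),
        )
    elif args.use_8bit_quantization:
        return BitsAndBytesConfig(load_in_8bit=True)
    return None
```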