#!/bin/bash
set -x
# Fail the script if torchrun fails, even though its output is piped to tee below
set -o pipefail

# Set default values for required parameters
default_nproc=2
default_save_path="./output"
default_use_lora=true
default_data_prefix="sft/data/"

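# Note: data_prefix is expected to contain train.parquet and test.parquet,
# consumed via the data.train_files / data.val_files overrides further down.
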
# Display usage information before consuming any arguments
if [ "$1" = "help" ] || [ "$1" = "--help" ] || [ "$1" = "-h" ]; then
    echo "Usage: $(basename "$0") [nproc_per_node] [save_path] [use_lora] [data_prefix] [other_configs...]"
    echo "nproc_per_node: Number of processes per node (default: $default_nproc)"
    echo "save_path: Directory to save model and logs (default: $default_save_path)"
    echo "use_lora: Whether to use LoRA for fine-tuning (true/false, default: $default_use_lora)"
    echo "data_prefix: Path prefix for training data (default: $default_data_prefix)"
    exit 0
fi

# Parse command line arguments
nproc_per_node=${1:-$default_nproc}
save_path=${2:-$default_save_path}
use_lora=${3:-$default_use_lora}
data_prefix=${4:-$default_data_prefix}

# Drop the four positional arguments (or all of them, if fewer were given);
# anything left in "$@" is forwarded verbatim to the trainer
shift $(( $# > 3 ? 4 : $# ))

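# Example invocation (script name and argument values are illustrative):
#   bash sft.sh 4 ./checkpoints true sft/data/gsm8k optim.lr=5e-5
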
# Create save directory if it doesn't exist
if [ ! -d "$save_path" ]; then
    mkdir -p "$save_path"
    echo "Created directory: $save_path"
fi

# Setup LoRA parameters if enabled
lora_config=""
if [ "$use_lora" = true ]; then
    lora_config="model.lora_rank=64 model.lora_alpha=32 model.target_modules=all-linear"
fi
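# Note: rank 64 / alpha 32 are illustrative starting points rather than tuned
# values; adjust them for your model size and data.
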
# Extract environment type from the data path for naming (e.g. "sft/data/" -> "data")
env_type=$(basename "$data_prefix")

# Generate a unique experiment name with timestamp
timestamp=$(date +"%Y%m%d_%H%M%S")
experiment_name="finetune_${env_type}_${timestamp}"
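# Example result (timestamp is illustrative): finetune_data_20250101_120000
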
# Run the training process
echo "Starting fine-tuning"
echo "Processes per node: $nproc_per_node"
echo "Save path: $save_path"
echo "Data path: $data_prefix"
echo "LoRA enabled: $use_lora"

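# $lora_config is intentionally left unquoted below so that its space-separated
# overrides (when set) split into separate arguments; "$@" forwards any extra
# user-supplied configs to the trainer untouched.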
torchrun --standalone --nnodes=1 --nproc_per_node="$nproc_per_node" \
    -m verl.trainer.fsdp_sft_trainer \
    data.train_files="${data_prefix}/train.parquet" \
    data.val_files="${data_prefix}/test.parquet" \
    data.prompt_key=prompt \
    data.response_key=response \
    data.max_length=2048 \
    optim.lr=1e-4 \
    data.train_batch_size=128 \
    data.micro_batch_size=4 \
    model.partial_pretrain=Qwen/Qwen2.5-0.5B \
    trainer.default_local_dir="$save_path" \
    trainer.experiment_name="$experiment_name" \
    trainer.logger=['console','wandb'] \
    trainer.total_epochs=5 \
    trainer.default_hdfs_dir=null \
    trainer.validate_before_training=True \
    model.enable_gradient_checkpointing=False \
    $lora_config \
    "$@" \
    2>&1 | tee "$save_path/train.log"

echo "Training completed. Logs saved to $save_path/train.log"