Commit 39a75ee

Merge pull request #52 from bigcode-project/finetune-starcoder
Finetune StarCoder Megatron
2 parents b4efd14 + b291323 commit 39a75ee

File tree: 2 files changed, +285 −0 lines

Lines changed: 144 additions & 0 deletions
@@ -0,0 +1,144 @@
#!/bin/bash
#SBATCH --job-name=starcoderpy
#SBATCH --nodes=64
#SBATCH --ntasks-per-node=1
#SBATCH --exclusive
#SBATCH --gres=gpu:8
#SBATCH --partition=production-cluster
#SBATCH --output=/fsx/leandro/logs/starcoderpy/bcs-%x-%j.out

set -x -e
source /admin/home/leandro/.bashrc

conda activate megatron

echo "START TIME: $(date)"

# File Path setup
SCRIPT_REPO=/fsx/leandro/git/Megatron-LM-BC
pushd $SCRIPT_REPO

LOG_PATH=$SCRIPT_REPO/main_log.txt

# Training setup
GPUS_PER_NODE=8
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
MASTER_PORT=6000
NNODES=$SLURM_NNODES
NODE_RANK=$SLURM_PROCID
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

# File path setup
STARCODER_PATH=/fsx/boomcode/starcoder/
CHECKPOINT_PATH=/fsx/boomcode/starcoderpy/$SLURM_JOB_ID
TOKENIZER_FILE=/fsx/boomcode/tokenizer-starcoder/tokenizer.json
WEIGHTS_TRAIN=/fsx/boomcode/datamix_python/train_data_paths.txt.tmp
WEIGHTS_VALID=/fsx/boomcode/datamix_python/valid_data_paths.txt.tmp
DATA_PATH=/fsx/boomcode/tokenized/python/
mkdir -p $CHECKPOINT_PATH/tensorboard

GPT_ARGS="\
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 4 \
--sequence-parallel \
--num-layers 40 \
--hidden-size 6144 \
--num-attention-heads 48 \
--attention-head-type multiquery \
--init-method-std 0.01275 \
--seq-length 8192 \
--max-position-embeddings 8192 \
--attention-dropout 0.1 \
--hidden-dropout 0.1 \
--micro-batch-size 1 \
--global-batch-size 512 \
--lr 0.00005 \
--min-lr 0.000005 \
--train-iters 258500 \
--lr-decay-iters 8500 \
--lr-decay-style cosine \
--lr-warmup-iters 500 \
--weight-decay .1 \
--adam-beta2 .95 \
--clip-grad 1.0 \
--bf16 \
--use-flash-attn \
--fim-rate 0.5 \
--log-interval 10 \
--save-interval 2500 \
--eval-interval 100 \
--eval-iters 10 \
--valid-num-workers 0 \
--override-opt_param-scheduler \
--no-load-optim \
--no-load-rng \
--finetune \
"

# --dataloader-type cyclic\
TENSORBOARD_ARGS="--tensorboard-dir ${CHECKPOINT_PATH}/tensorboard"

CMD=" \
$SCRIPT_REPO/pretrain_gpt.py \
$GPT_ARGS \
--tokenizer-type TokenizerFromFile \
--tokenizer-file $TOKENIZER_FILE \
--save $CHECKPOINT_PATH \
--load $STARCODER_PATH \
--train-weighted-split-paths-path $WEIGHTS_TRAIN \
--valid-weighted-split-paths-path $WEIGHTS_VALID \
--structured-logs \
--structured-logs-dir $CHECKPOINT_PATH/logs \
$TENSORBOARD_ARGS \
--wandb-entity-name lvwerra \
--wandb-project-name starcoder-py \
"

# --data-path $DATA_PATH\gpt2-preprocessed_content_document

export LAUNCHER="python -u -m torch.distributed.run \
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
--rdzv_backend c10d \
--max_restarts 0 \
--tee 3 \
"

echo $CMD

# hide duplicated errors using this hack - will be properly fixed in pt-1.12
# export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json

# force crashing on nccl issues like hanging broadcast
export NCCL_ASYNC_ERROR_HANDLING=1
# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=COLL
# export NCCL_SOCKET_NTHREADS=1
# export NCCL_NSOCKS_PERTHREAD=1
# export CUDA_LAUNCH_BLOCKING=1

# AWS specific
export NCCL_PROTO=simple
export RDMAV_FORK_SAFE=1
export FI_EFA_FORK_SAFE=1
export FI_EFA_USE_DEVICE_RDMA=1
export FI_PROVIDER=efa
export FI_LOG_LEVEL=1
export NCCL_IB_DISABLE=1
export NCCL_SOCKET_IFNAME=ens

export CUDA_HOME=/usr/local/cuda-11.6

# srun error handling:
# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
SRUN_ARGS=" \
--wait=60 \
--kill-on-bad-exit=1 \
"

# py-spy top -s -i -n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD
clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID --role \$SLURMD_NODENAME: $CMD" 2>&1 | tee $LOG_PATH

echo "END TIME: $(date)"
Lines changed: 141 additions & 0 deletions
@@ -0,0 +1,141 @@
#!/bin/bash
#SBATCH --job-name=starcoderplus
#SBATCH --nodes=64
#SBATCH --ntasks-per-node=1
#SBATCH --exclusive
#SBATCH --gres=gpu:8
#SBATCH --partition=production-cluster
#SBATCH --output=/fsx/leandro/logs/starcoderplus/bcs-%x-%j.out

set -x -e
source /admin/home/leandro/.bashrc

conda activate megatron

echo "START TIME: $(date)"

# File Path setup
SCRIPT_REPO=/fsx/leandro/git/Megatron-LM-BC
pushd $SCRIPT_REPO

LOG_PATH=$SCRIPT_REPO/main_log.txt

# Training setup
GPUS_PER_NODE=8
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
MASTER_PORT=6000
NNODES=$SLURM_NNODES
NODE_RANK=$SLURM_PROCID
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

# File path setup
STARCODER_PATH=/fsx/boomcode/starcoder/
CHECKPOINT_PATH=/fsx/boomcode/starcoderplus/$SLURM_JOB_ID
TOKENIZER_FILE=/fsx/boomcode/tokenizer-starcoder/tokenizer.json
WEIGHTS_TRAIN=/fsx/boomcode/datamix/train_data_paths.txt.tmp
WEIGHTS_VALID=/fsx/boomcode/datamix/valid_data_paths.txt.tmp

mkdir -p $CHECKPOINT_PATH/tensorboard

GPT_ARGS="\
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 4 \
--sequence-parallel \
--num-layers 40 \
--hidden-size 6144 \
--num-attention-heads 48 \
--attention-head-type multiquery \
--init-method-std 0.01275 \
--seq-length 8192 \
--max-position-embeddings 8192 \
--attention-dropout 0.1 \
--hidden-dropout 0.1 \
--micro-batch-size 1 \
--global-batch-size 512 \
--lr 0.0001 \
--min-lr 0.00001 \
--train-iters 400000 \
--lr-decay-iters 150000 \
--lr-decay-style cosine \
--lr-warmup-iters 1000 \
--weight-decay .1 \
--adam-beta2 .95 \
--clip-grad 1.0 \
--bf16 \
--use-flash-attn \
--fim-rate 0.5 \
--log-interval 10 \
--save-interval 2500 \
--eval-interval 2500 \
--eval-iters 2 \
--valid-num-workers 0 \
--override-opt_param-scheduler \
--no-load-optim \
--no-load-rng \
--finetune \
"

TENSORBOARD_ARGS="--tensorboard-dir ${CHECKPOINT_PATH}/tensorboard"

CMD=" \
$SCRIPT_REPO/pretrain_gpt.py \
$GPT_ARGS \
--tokenizer-type TokenizerFromFile \
--tokenizer-file $TOKENIZER_FILE \
--save $CHECKPOINT_PATH \
--load $STARCODER_PATH \
--train-weighted-split-paths-path $WEIGHTS_TRAIN \
--valid-weighted-split-paths-path $WEIGHTS_VALID \
--structured-logs \
--structured-logs-dir $CHECKPOINT_PATH/logs \
$TENSORBOARD_ARGS \
--wandb-entity-name lvwerra \
--wandb-project-name starcoder-plus \
"

export LAUNCHER="python -u -m torch.distributed.run \
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
--rdzv_backend c10d \
--max_restarts 0 \
--tee 3 \
"

echo $CMD

# hide duplicated errors using this hack - will be properly fixed in pt-1.12
# export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json

# force crashing on nccl issues like hanging broadcast
export NCCL_ASYNC_ERROR_HANDLING=1
# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=COLL
# export NCCL_SOCKET_NTHREADS=1
# export NCCL_NSOCKS_PERTHREAD=1
# export CUDA_LAUNCH_BLOCKING=1

# AWS specific
export NCCL_PROTO=simple
export RDMAV_FORK_SAFE=1
export FI_EFA_FORK_SAFE=1
export FI_EFA_USE_DEVICE_RDMA=1
export FI_PROVIDER=efa
export FI_LOG_LEVEL=1
export NCCL_IB_DISABLE=1
export NCCL_SOCKET_IFNAME=ens

export CUDA_HOME=/usr/local/cuda-11.6

# srun error handling:
# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
SRUN_ARGS=" \
--wait=60 \
--kill-on-bad-exit=1 \
"

# py-spy top -s -i -n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD
clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID --role \$SLURMD_NODENAME: $CMD" 2>&1 | tee $LOG_PATH

echo "END TIME: $(date)"
