Commit bd0aaba

Merge pull request #70 from bigcode-project/script_7b-starcoder
[WIP] Add training scripts
2 parents ebd38e9 + 5a9c239 commit bd0aaba

File tree: 3 files changed (+428, -0 lines)
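
To reproduce this summary or browse the per-file diffs locally, a small sketch (an assumption, not part of the page: it presumes a clone of the repository that contains this merge commit, and that bd0aaba~1, i.e. parent ebd38e9, is the mainline side of the merge):

git diff --stat bd0aaba~1 bd0aaba                                  # 3 files changed, 428 insertions(+)
git diff bd0aaba~1 bd0aaba -- examples/pretrain_bigcode_1b.slurm   # full diff of one of the added scripts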

examples/pretrain_bigcode_1b.slurm

Lines changed: 142 additions & 0 deletions
@@ -0,0 +1,142 @@
#!/bin/bash
#SBATCH --job-name=1b-starcoder
#SBATCH --nodes=16
#SBATCH --ntasks-per-node=1          # crucial - only 1 task per node for the distributed launcher!
#SBATCH --cpus-per-task=38
#SBATCH --gres=gpu:8
#SBATCH --partition=production-cluster
#SBATCH --output=/fsx/bigcode/bigcode-training/logs/1b/%x-%j.out

set -x -e
source /admin/home/loubna/.bashrc

conda activate megatron

echo "START TIME: $(date)"

# File path setup
SCRIPT_REPO=/fsx/loubna/code/Megatron-LM
pushd $SCRIPT_REPO

LOG_PATH=$SCRIPT_REPO/main_log.txt

# Training setup
GPUS_PER_NODE=8
# so processes know who to talk to
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
MASTER_PORT=6000
NNODES=$SLURM_NNODES
NODE_RANK=$SLURM_PROCID
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

# Checkpoint, tokenizer and data paths
CHECKPOINT_PATH=/fsx/bigcode/experiments/pretraining/1b  # Adjust: directory to store the checkpoints
# StarCoder tokenizer and data paths in /fsx/bigcode
TOKENIZER_FILE=/fsx/loubna/starcoder-tokenizer/15b/tokenizer.json
WEIGHTS_TRAIN=/fsx/bigcode/bigcode-training/code/bigcode-data-mix/data/train_data_paths.txt.tmp
WEIGHTS_VALID=/fsx/bigcode/bigcode-training/code/bigcode-data-mix/data/valid_data_paths.txt.tmp

mkdir -p $CHECKPOINT_PATH/tensorboard

GPT_ARGS="\
    --tensor-model-parallel-size 1 \
    --pipeline-model-parallel-size 1 \
    --num-layers 24 \
    --hidden-size 2048 \
    --num-attention-heads 16 \
    --attention-head-type multiquery \
    --init-method-std 0.02209 \
    --seq-length 8192 \
    --max-position-embeddings 8192 \
    --attention-dropout 0.1 \
    --hidden-dropout 0.1 \
    --micro-batch-size 1 \
    --global-batch-size 128 \
    --lr 0.0004 \
    --min-lr 0.00004 \
    --train-iters 1000000 \
    --lr-decay-iters 1000000 \
    --lr-decay-style cosine \
    --lr-warmup-iters 2000 \
    --weight-decay .1 \
    --adam-beta2 .95 \
    --clip-grad 1.0 \
    --bf16 \
    --use-flash-attn \
    --fim-rate 0.5 \
    --log-interval 10 \
    --save-interval 10000 \
    --eval-interval 10000 \
    --eval-iters 2 \
    --valid-num-workers 0 \
"

TENSORBOARD_ARGS="--tensorboard-dir /fsx/bigcode/experiments/pretraining/1b/tensorboard"

CMD=" \
    /fsx/loubna/code/Megatron-LM/pretrain_gpt.py \
    $GPT_ARGS \
    --tokenizer-type TokenizerFromFile \
    --tokenizer-file $TOKENIZER_FILE \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH \
    --train-weighted-split-paths-path $WEIGHTS_TRAIN \
    --valid-weighted-split-paths-path $WEIGHTS_VALID \
    --structured-logs \
    --structured-logs-dir $CHECKPOINT_PATH/logs \
    $TENSORBOARD_ARGS \
    --wandb-entity-name loubnabnl \
    --wandb-project-name bigcode-pretraining \
    "

export LAUNCHER="python -u -m torch.distributed.run \
    --nproc_per_node $GPUS_PER_NODE \
    --nnodes $NNODES \
    --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
    --rdzv_backend c10d \
    --max_restarts 0 \
    --tee 3 \
    "

echo $CMD

# hide duplicated errors using this hack - will be properly fixed in pt-1.12
# export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json

# force crashing on NCCL issues like a hanging broadcast
export NCCL_ASYNC_ERROR_HANDLING=1
# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=COLL
# export NCCL_SOCKET_NTHREADS=1
# export NCCL_NSOCKS_PERTHREAD=1
# export CUDA_LAUNCH_BLOCKING=1

# AWS specific
export NCCL_PROTO=simple
export RDMAV_FORK_SAFE=1
export FI_EFA_FORK_SAFE=1
export FI_EFA_USE_DEVICE_RDMA=1
export FI_PROVIDER=efa
export FI_LOG_LEVEL=1
export NCCL_IB_DISABLE=1
export NCCL_SOCKET_IFNAME=ens

export CUDA_HOME=/usr/local/cuda-11.6
# This is needed for torch 1.12.1, otherwise it doesn't link correctly; not sure what the issue was.
#export PATH="/usr/local/cuda-11.6/bin:$PATH"
#export LD_LIBRARY_PATH="/usr/local/cuda-11.6/lib64:$LD_LIBRARY_PATH"
#export LD_PRELOAD=$CUDA_HOME/lib/libnccl.so
#export LD_LIBRARY_PATH=$CUDA_HOME/efa/lib:$CUDA_HOME/lib:$CUDA_HOME/lib64:$LD_LIBRARY_PATH

# srun error handling:
# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
SRUN_ARGS=" \
    --wait=60 \
    --kill-on-bad-exit=1 \
    "

# py-spy top -s -i -n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD
clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID --role \$SLURMD_NODENAME: $CMD" 2>&1 | tee $LOG_PATH

echo "END TIME: $(date)"

examples/pretrain_bigcode_3b.slurm

Lines changed: 143 additions & 0 deletions
@@ -0,0 +1,143 @@
#!/bin/bash
#SBATCH --job-name=3b-bigcode
#SBATCH --nodes=32
#SBATCH --ntasks-per-node=1          # crucial - only 1 task per node for the distributed launcher!
#SBATCH --cpus-per-task=40
#SBATCH --gres=gpu:8
#SBATCH --partition=production-cluster
#SBATCH --output=/fsx/bigcode/bigcode-training/logs/3b/%x-%j.out

set -x -e
source /admin/home/loubna/.bashrc

conda activate megatron

echo "START TIME: $(date)"

# File path setup
SCRIPT_REPO=/fsx/loubna/code/Megatron-LM
pushd $SCRIPT_REPO

LOG_PATH=$SCRIPT_REPO/main_log.txt

# Training setup
GPUS_PER_NODE=8
# so processes know who to talk to
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
MASTER_PORT=6000
NNODES=$SLURM_NNODES
NODE_RANK=$SLURM_PROCID
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

# Checkpoint, tokenizer and data paths
CHECKPOINT_PATH=/fsx/bigcode/experiments/pretraining/3b  # Adjust: directory to store the checkpoints
# StarCoder tokenizer and data paths in /fsx/bigcode
TOKENIZER_FILE=/fsx/bigcode/bigcode-training/tokenizer-starcoder/tokenizer.json
WEIGHTS_TRAIN=/fsx/bigcode/bigcode-training/code/bigcode-data-mix/data/train_data_paths.txt.tmp
WEIGHTS_VALID=/fsx/bigcode/bigcode-training/code/bigcode-data-mix/data/valid_data_paths.txt.tmp

mkdir -p $CHECKPOINT_PATH/tensorboard

GPT_ARGS="\
    --tensor-model-parallel-size 1 \
    --pipeline-model-parallel-size 1 \
    --num-layers 36 \
    --hidden-size 2816 \
    --num-attention-heads 22 \
    --attention-head-type multiquery \
    --init-method-std 0.01884 \
    --seq-length 8192 \
    --max-position-embeddings 8192 \
    --attention-dropout 0.1 \
    --hidden-dropout 0.1 \
    --micro-batch-size 1 \
    --global-batch-size 256 \
    --lr 0.0005 \
    --min-lr 0.00005 \
    --train-iters 500000 \
    --lr-decay-iters 500000 \
    --lr-decay-style cosine \
    --lr-warmup-iters 2000 \
    --weight-decay .1 \
    --adam-beta2 .95 \
    --clip-grad 1.0 \
    --bf16 \
    --use-flash-attn \
    --fim-rate 0.5 \
    --log-interval 10 \
    --save-interval 5000 \
    --eval-interval 5000 \
    --eval-iters 2 \
    --use-distributed-optimizer \
    --valid-num-workers 0 \
"

TENSORBOARD_ARGS="--tensorboard-dir ${CHECKPOINT_PATH}/tensorboard"

CMD=" \
    /fsx/loubna/code/Megatron-LM/pretrain_gpt.py \
    $GPT_ARGS \
    --tokenizer-type TokenizerFromFile \
    --tokenizer-file $TOKENIZER_FILE \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH \
    --train-weighted-split-paths-path $WEIGHTS_TRAIN \
    --valid-weighted-split-paths-path $WEIGHTS_VALID \
    --structured-logs \
    --structured-logs-dir $CHECKPOINT_PATH/logs \
    $TENSORBOARD_ARGS \
    --wandb-entity-name loubnabnl \
    --wandb-project-name bigcode-3b \
    "

export LAUNCHER="python -u -m torch.distributed.run \
    --nproc_per_node $GPUS_PER_NODE \
    --nnodes $NNODES \
    --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
    --rdzv_backend c10d \
    --max_restarts 0 \
    --tee 3 \
    "

echo $CMD

# hide duplicated errors using this hack - will be properly fixed in pt-1.12
# export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json

# force crashing on NCCL issues like a hanging broadcast
export NCCL_ASYNC_ERROR_HANDLING=1
# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=COLL
# export NCCL_SOCKET_NTHREADS=1
# export NCCL_NSOCKS_PERTHREAD=1
# export CUDA_LAUNCH_BLOCKING=1

# AWS specific
export NCCL_PROTO=simple
export RDMAV_FORK_SAFE=1
export FI_EFA_FORK_SAFE=1
export FI_EFA_USE_DEVICE_RDMA=1
export FI_PROVIDER=efa
export FI_LOG_LEVEL=1
export NCCL_IB_DISABLE=1
export NCCL_SOCKET_IFNAME=ens

export CUDA_HOME=/usr/local/cuda-11.6
# This is needed for torch 1.12.1, otherwise it doesn't link correctly; not sure what the issue was.
#export PATH="/usr/local/cuda-11.6/bin:$PATH"
#export LD_LIBRARY_PATH="/usr/local/cuda-11.6/lib64:$LD_LIBRARY_PATH"
#export LD_PRELOAD=$CUDA_HOME/lib/libnccl.so
#export LD_LIBRARY_PATH=$CUDA_HOME/efa/lib:$CUDA_HOME/lib:$CUDA_HOME/lib64:$LD_LIBRARY_PATH

# srun error handling:
# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
SRUN_ARGS=" \
    --wait=60 \
    --kill-on-bad-exit=1 \
    "

# py-spy top -s -i -n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD
clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID --role \$SLURMD_NODENAME: $CMD" 2>&1 | tee $LOG_PATH

echo "END TIME: $(date)"
