Commit 22b8611

loubnabnl committed
add bigcode model slurm script
1 parent 86ba4c0 commit 22b8611

1 file changed: 140 additions, 0 deletions
@@ -0,0 +1,140 @@
#!/bin/bash
#SBATCH --job-name=bigcode-training
#SBATCH --nodes=64
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=96
#SBATCH --gres=gpu:8
#SBATCH --exclusive
#SBATCH --partition=production-cluster
#SBATCH --output=/fsx/bigcode/bigcode-training/logs/run-%x-%j.out

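# Job shape (from the directives above): 64 nodes x 8 GPUs = 512 GPUs total,
# one srun task per node; torchrun (launched below) spawns the 8 per-GPU
# worker processes on each node.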
set -x -e
source /admin/home/loubna/.bashrc

conda activate megatron

echo "START TIME: $(date)"
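# assumes the "megatron" conda env provides PyTorch, apex and flash-attn builds
# matching the CUDA 11.6 toolkit exported further down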

# File path setup
SCRIPT_REPO=/fsx/loubna/code/Megatron-LM
pushd $SCRIPT_REPO

LOG_PATH=$SCRIPT_REPO/main_log.txt

# Training setup
GPUS_PER_NODE=8
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
MASTER_PORT=6000
NNODES=$SLURM_NNODES
NODE_RANK=$SLURM_PROCID
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
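# 64 nodes x 8 GPUs => WORLD_SIZE=512. MASTER_ADDR is the first node in the
# allocation; note that torchrun below derives world size from --nnodes and
# --nproc_per_node itself, so WORLD_SIZE is informational here.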

# Checkpoint, tokenizer and data-mix paths
CHECKPOINT_PATH=/fsx/bigcode/experiments/pretraining/6672
TOKENIZER_FILE=/fsx/loubna/data/tokenizer/tokenizer-the-stack-march-sample-v3-no-prefix-spaces/tokenizer.json
WEIGHTS_TRAIN=/fsx/loubna/code/bigcode-data-mix/data/train_data_paths.txt.tmp
WEIGHTS_VALID=/fsx/loubna/code/bigcode-data-mix/data/valid_data_paths.txt.tmp

mkdir -p $CHECKPOINT_PATH/tensorboard
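# the *.txt.tmp files presumably list per-dataset paths and sampling weights,
# consumed by the --*-weighted-split-paths-path arguments below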

GPT_ARGS="\
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 4 \
--sequence-parallel \
--num-layers 40 \
--hidden-size 6144 \
--num-attention-heads 48 \
--attention-head-type multiquery \
--init-method-std 0.01275 \
--seq-length 8192 \
--max-position-embeddings 8192 \
--attention-dropout 0.1 \
--hidden-dropout 0.1 \
--micro-batch-size 1 \
--global-batch-size 512 \
--lr 0.0003 \
--min-lr 0.00003 \
--train-iters 250000 \
--lr-decay-iters 250000 \
--lr-decay-style cosine \
--lr-warmup-iters 2000 \
--weight-decay .1 \
--adam-beta2 .95 \
--clip-grad 1.0 \
--bf16 \
--use-flash-attn \
--fim-rate 0.5 \
--log-interval 10 \
--save-interval 2500 \
--eval-interval 2500 \
--eval-iters 2 \
--use-distributed-optimizer \
--valid-num-workers 0 \
"
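# Rough sizing (editor's estimate, not from the script): 40 layers x ~10*h^2
# params/layer at h=6144 (8h^2 MLP + ~2h^2 multi-query attention) ~= 15B params,
# sharded TP=4 x PP=4 = 16 GPUs per model replica. 512 GPUs / 16 = 32
# data-parallel replicas, so global batch 512 = micro batch 1 x 32 replicas x
# 16 accumulation steps. 512 seqs x 8192 tokens x 250k iters ~= 1T tokens.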

TENSORBOARD_ARGS="--tensorboard-dir ${CHECKPOINT_PATH}/tensorboard"

CMD=" \
/fsx/loubna/code/Megatron-LM/pretrain_gpt.py \
$GPT_ARGS \
--tokenizer-type TokenizerFromFile \
--tokenizer-file $TOKENIZER_FILE \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--train-weighted-split-paths-path $WEIGHTS_TRAIN \
--valid-weighted-split-paths-path $WEIGHTS_VALID \
--structured-logs \
--structured-logs-dir $CHECKPOINT_PATH/logs \
$TENSORBOARD_ARGS \
--wandb-entity-name loubnabnl \
--wandb-project-name bigcode-pretraining \
"
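# --save and --load point at the same directory, so a requeued job should
# resume from the latest checkpoint found there (fresh start if the dir is empty)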

export LAUNCHER="python -u -m torch.distributed.run \
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
--rdzv_backend c10d \
--max_restarts 0 \
--tee 3 \
"
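# one torchrun per node: c10d rendezvous at $MASTER_ADDR:$MASTER_PORT, no
# elastic restarts (--max_restarts 0), and --tee 3 mirrors each worker's
# stdout/stderr into the main log with a rank prefix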

echo $CMD

# hide duplicated errors using this hack - will be properly fixed in pt-1.12
# export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json

# force crashing on nccl issues like hanging broadcast
export NCCL_ASYNC_ERROR_HANDLING=1
# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=COLL
# export NCCL_SOCKET_NTHREADS=1
# export NCCL_NSOCKS_PERTHREAD=1
# export CUDA_LAUNCH_BLOCKING=1

# AWS specific: route NCCL over EFA via libfabric and disable the InfiniBand
# verbs transport
export NCCL_PROTO=simple
export RDMAV_FORK_SAFE=1
export FI_EFA_FORK_SAFE=1
export FI_EFA_USE_DEVICE_RDMA=1
export FI_PROVIDER=efa
export FI_LOG_LEVEL=1
export NCCL_IB_DISABLE=1
export NCCL_SOCKET_IFNAME=ens

export CUDA_HOME=/usr/local/cuda-11.6

# srun error handling:
# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
SRUN_ARGS=" \
--wait=60 \
--kill-on-bad-exit=1 \
"

# py-spy top -s -i -n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD
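# SLURM_PROCID and SLURMD_NODENAME are escaped (\$) below so they expand inside
# each srun task, giving every node its own node_rank and role rather than the
# submission shell's values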
clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID --role \$SLURMD_NODENAME: $CMD" 2>&1 | tee $LOG_PATH

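# NOTE: this deletes every checkpoint the run just wrote; presumably intentional
# for a test run, but worth double-checking before a long training job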
rm -rf $CHECKPOINT_PATH

echo "END TIME: $(date)"
