#!/bin/bash
#SBATCH --job-name=bigcode-training
#SBATCH --nodes=64
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=96
#SBATCH --gres=gpu:8
#SBATCH --exclusive
#SBATCH --partition=production-cluster
#SBATCH --output=/fsx/bigcode/bigcode-training/logs/run-%x-%j.out
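# The directives above request 64 exclusive nodes with 8 GPUs each (512 GPUs total)
# and one srun task per node; torchrun below spawns the 8 training workers per node.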

set -x -e
source /admin/home/loubna/.bashrc

conda activate megatron

echo "START TIME: $(date)"

# Code repo and log paths
SCRIPT_REPO=/fsx/loubna/code/Megatron-LM
pushd $SCRIPT_REPO

LOG_PATH=$SCRIPT_REPO/main_log.txt

# Training setup
GPUS_PER_NODE=8
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
MASTER_PORT=6000
NNODES=$SLURM_NNODES
NODE_RANK=$SLURM_PROCID
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
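# WORLD_SIZE = 8 GPUs/node x 64 nodes = 512 ranks in total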

# Data and checkpoint paths
CHECKPOINT_PATH=/fsx/bigcode/experiments/pretraining/6672
TOKENIZER_FILE=/fsx/loubna/data/tokenizer/tokenizer-the-stack-march-sample-v3-no-prefix-spaces/tokenizer.json
WEIGHTS_TRAIN=/fsx/loubna/code/bigcode-data-mix/data/train_data_paths.txt.tmp
WEIGHTS_VALID=/fsx/loubna/code/bigcode-data-mix/data/valid_data_paths.txt.tmp

mkdir -p ${CHECKPOINT_PATH}/tensorboard

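# Model: 40-layer, 6144-hidden, 48-head decoder-only transformer with multi-query
# attention and an 8192-token context, trained in bf16 with FlashAttention and a
# 0.5 fill-in-the-middle (FIM) rate.
# Parallelism: tensor-parallel 4 x pipeline-parallel 4, plus sequence parallelism
# and a distributed (sharded) optimizer.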
GPT_ARGS=" \
    --tensor-model-parallel-size 4 \
    --pipeline-model-parallel-size 4 \
    --sequence-parallel \
    --num-layers 40 \
    --hidden-size 6144 \
    --num-attention-heads 48 \
    --attention-head-type multiquery \
    --init-method-std 0.01275 \
    --seq-length 8192 \
    --max-position-embeddings 8192 \
    --attention-dropout 0.1 \
    --hidden-dropout 0.1 \
    --micro-batch-size 1 \
    --global-batch-size 512 \
    --lr 0.0003 \
    --min-lr 0.00003 \
    --train-iters 250000 \
    --lr-decay-iters 250000 \
    --lr-decay-style cosine \
    --lr-warmup-iters 2000 \
    --weight-decay .1 \
    --adam-beta2 .95 \
    --clip-grad 1.0 \
    --bf16 \
    --use-flash-attn \
    --fim-rate 0.5 \
    --log-interval 10 \
    --save-interval 2500 \
    --eval-interval 2500 \
    --eval-iters 2 \
    --use-distributed-optimizer \
    --valid-num-workers 0 \
"

TENSORBOARD_ARGS="--tensorboard-dir ${CHECKPOINT_PATH}/tensorboard"

CMD=" \
    /fsx/loubna/code/Megatron-LM/pretrain_gpt.py \
    $GPT_ARGS \
    --tokenizer-type TokenizerFromFile \
    --tokenizer-file $TOKENIZER_FILE \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH \
    --train-weighted-split-paths-path $WEIGHTS_TRAIN \
    --valid-weighted-split-paths-path $WEIGHTS_VALID \
    --structured-logs \
    --structured-logs-dir ${CHECKPOINT_PATH}/logs \
    $TENSORBOARD_ARGS \
    --wandb-entity-name loubnabnl \
    --wandb-project-name bigcode-pretraining \
"

export LAUNCHER="python -u -m torch.distributed.run \
    --nproc_per_node $GPUS_PER_NODE \
    --nnodes $NNODES \
    --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
    --rdzv_backend c10d \
    --max_restarts 0 \
    --tee 3 \
"

echo $CMD

# hide duplicated errors using this hack - will be properly fixed in pt-1.12
# export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json

# force crashing on NCCL issues like a hanging broadcast
export NCCL_ASYNC_ERROR_HANDLING=1
# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=COLL
# export NCCL_SOCKET_NTHREADS=1
# export NCCL_NSOCKS_PERTHREAD=1
# export CUDA_LAUNCH_BLOCKING=1

# AWS specific
export NCCL_PROTO=simple
export RDMAV_FORK_SAFE=1
export FI_EFA_FORK_SAFE=1
export FI_EFA_USE_DEVICE_RDMA=1
export FI_PROVIDER=efa
export FI_LOG_LEVEL=1
export NCCL_IB_DISABLE=1
export NCCL_SOCKET_IFNAME=ens
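# The exports above route inter-node NCCL traffic over AWS EFA via libfabric
# (FI_PROVIDER=efa); the InfiniBand transport is disabled and the bootstrap
# sockets are pinned to the ens* network interfaces.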

export CUDA_HOME=/usr/local/cuda-11.6

# srun error handling:
# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
SRUN_ARGS=" \
    --wait=60 \
    --kill-on-bad-exit=1 \
"

# py-spy top -s -i -n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD
clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID --role \$SLURMD_NODENAME: $CMD" 2>&1 | tee $LOG_PATH

rm -rf $CHECKPOINT_PATH

echo "END TIME: $(date)"