1 file changed: +3 -1 lines changed

@@ -4,7 +4,7 @@
 #SBATCH --constraint=a100
 #SBATCH --reservation=hug
 #SBATCH --qos=qos_gpu-gc             # up to 100h
-#SBATCH --nodes=48
+#SBATCH --nodes=24
 #SBATCH --ntasks-per-node=1          # crucial - only 1 task per dist per node!
 #SBATCH --cpus-per-task=64           # number of cores per tasks
 #SBATCH --hint=nomultithread         # we get physical cores not logical
@@ -115,6 +115,7 @@ GPT_ARGS=" \
     --tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \
     --init-method-std 0.0048 \
     --embed-layernorm \
+    --layernorm-tp-auto-sync \
     --bf16 \
     --seed 42 \
     --position-embedding-type alibi \
@@ -178,6 +179,7 @@ export LAUNCHER="python -u -m torch.distributed.run \
     --tee 3 \
     "

+# --universal-checkpoint \
 export CMD=" \
     `pwd`/pretrain_gpt.py \
     --tensor-model-parallel-size $TP_SIZE \
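
Note (not part of the diff): halving --nodes from 48 to 24 also halves the world size the launcher computes, so the data-parallel degree (world size divided by TP_SIZE * PP_SIZE) must still come out as a whole number. A minimal sketch of how these scripts typically derive that from SLURM follows; GPUS_PER_NODE=8 and the other variable names below are assumptions, not values taken from this file:

#!/bin/bash
# Sketch only: how SBATCH --nodes usually reaches torch.distributed.run.
GPUS_PER_NODE=8                                   # assumed; the real script sets its own
NNODES=$SLURM_NNODES                              # 24 after this change, 48 before
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
MASTER_PORT=6000

WORLD_SIZE=$((NNODES * GPUS_PER_NODE))            # 192 GPUs at 24 nodes

LAUNCHER="python -u -m torch.distributed.run \
    --nproc_per_node $GPUS_PER_NODE \
    --nnodes $NNODES \
    --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
    --rdzv_backend c10d \
    "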
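
The commented-out `# --universal-checkpoint \` line sits just above `export CMD=`, so as written it is only a reminder: to take effect, the flag has to be moved inside the CMD string, which is the usual step when resuming from a checkpoint converted to DeepSpeed's universal format (typically needed after changing the parallel layout, such as the node-count change above). A hedged sketch of wiring that up; the RESUME_FROM_UNIVERSAL gate, PP_SIZE, and the way CMD is composed from $GPT_ARGS are assumptions about the surrounding script, not shown in this diff:

#!/bin/bash
# Sketch only: gate the flag behind an env var so one script serves both cases.
UNIVERSAL_CKPT_ARGS=""
if [[ "${RESUME_FROM_UNIVERSAL:-0}" == "1" ]]; then
    # Only valid once the existing checkpoint has been converted to the
    # universal format with DeepSpeed's conversion tooling.
    UNIVERSAL_CKPT_ARGS="--universal-checkpoint"
fi

export CMD=" \
    $(pwd)/pretrain_gpt.py \
    --tensor-model-parallel-size $TP_SIZE \
    --pipeline-model-parallel-size $PP_SIZE \
    $GPT_ARGS \
    $UNIVERSAL_CKPT_ARGS \
    "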