
Commit 95f8a8b

switch to 24n, add layernorm-tp-auto-sync switch
1 parent 1698fa0 commit 95f8a8b

File tree

1 file changed: +3 -1 lines changed

train/tr11-176B-ml/tr11-176B-ml.slurm

Lines changed: 3 additions & 1 deletion
@@ -4,7 +4,7 @@
 #SBATCH --constraint=a100
 #SBATCH --reservation=hug
 #SBATCH --qos=qos_gpu-gc # up to 100h
-#SBATCH --nodes=48
+#SBATCH --nodes=24
 #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
 #SBATCH --cpus-per-task=64 # number of cores per tasks
 #SBATCH --hint=nomultithread # we get physical cores not logical
@@ -115,6 +115,7 @@ GPT_ARGS=" \
 --tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \
 --init-method-std 0.0048 \
 --embed-layernorm \
+--layernorm-tp-auto-sync \
 --bf16 \
 --seed 42 \
 --position-embedding-type alibi \
@@ -178,6 +179,7 @@ export LAUNCHER="python -u -m torch.distributed.run \
 --tee 3 \
 "

+# --universal-checkpoint \
 export CMD=" \
 `pwd`/pretrain_gpt.py \
 --tensor-model-parallel-size $TP_SIZE \
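The --nodes change halves the reservation from 48 to 24 nodes, matching the "24n" in the commit message. The other addition, --layernorm-tp-auto-sync, is meant to keep the replicated layer-norm parameters identical across tensor-parallel ranks so they cannot drift apart. A minimal sketch of that idea only, assuming a PyTorch model and an already-initialized tensor-parallel process group (the names sync_layernorm_params and tp_group are hypothetical here, not the actual Megatron-DeepSpeed API):

# Conceptual sketch, not the Megatron-DeepSpeed implementation: average the
# replicated LayerNorm parameters over the tensor-parallel group so every
# rank ends up with identical copies.
import torch
import torch.distributed as dist

def sync_layernorm_params(model, tp_group):
    """Average every LayerNorm weight/bias across the tensor-parallel group."""
    tp_world_size = dist.get_world_size(group=tp_group)
    with torch.no_grad():
        for module in model.modules():
            if isinstance(module, torch.nn.LayerNorm):
                for param in module.parameters():
                    # Each TP rank holds a replica of these parameters;
                    # summing then dividing restores identical copies.
                    dist.all_reduce(param.data, op=dist.ReduceOp.SUM, group=tp_group)
                    param.data.div_(tp_world_size)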
