@@ -21,32 +21,32 @@ GPT_ARGS="\
2121 --tensor-model-parallel-size 1 \
2222 --pipeline-model-parallel-size 1 \
2323 --recompute-activations \
24- --num-layers 24 \
25- --hidden-size 2048 \
26- --num-attention-heads 16 \
27- --attention-head-type multiquery \
28- --init-method-std 0.022 \
24+ --num-layers 24 \
25+ --hidden-size 2048 \
26+ --num-attention-heads 16 \
27+ --attention-head-type multiquery \
28+ --init-method-std 0.022 \
2929 --seq-length 2048 \
3030 --max-position-embeddings 2048 \
31- --attention-dropout 0.1 \
32- --hidden-dropout 0.1 \
31+ --attention-dropout 0.1 \
32+ --hidden-dropout 0.1 \
3333 --micro-batch-size 2 \
3434 --global-batch-size 192 \
35- --lr 0.0002 \
36- --train-iters 3000 \
37- --lr-decay-iters 600000 \
38- --lr-decay-style cosine \
39- --lr-warmup-fraction 0.02 \
40- --weight-decay .1 \
41- --adam-beta2 .95 \
42- --clip-grad 1.0 \
43- --fp16 \
35+ --lr 0.0002 \
36+ --train-iters 300000 \
37+ --lr-decay-iters 600000 \
38+ --lr-decay-style cosine \
39+ --lr-warmup-fraction 0.02 \
40+ --weight-decay .1 \
41+ --adam-beta2 .95 \
42+ --clip-grad 1.0 \
43+ --fp16 \
4444 --log-interval 10 \
4545 --save-interval 4000 \
4646 --eval-interval 200 \
4747 --eval-iters 10 \
48- --initial-loss-scale 65536 \
49- --fim-rate 0.5 \
48+ --initial-loss-scale 65536 \
49+ --fim-rate 0.5 \
5050"
5151
5252TENSORBOARD_ARGS=" --tensorboard-dir ${CHECKPOINT_PATH}/tensorboard"
@@ -59,4 +59,4 @@ torchrun $DISTRIBUTED_ARGS \
5959 --save $CHECKPOINT_PATH \
6060 --load $CHECKPOINT_PATH \
6161 --data-path $DATA_PATH \
62- $TENSORBOARD_ARGS
62+ $TENSORBOARD_ARGS
0 commit comments