From 94adbdd6d7cdf591e692091bcbec59e7745c40aa Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Mon, 4 Jul 2022 11:36:14 +0200 Subject: [PATCH 01/13] Add t0 scripts --- train/t0/t0_test.slurm | 193 +++++++++++++++++++++++++++++++++ train/t0/tr11f-6B3-ml-t0.slurm | 193 +++++++++++++++++++++++++++++++++ 2 files changed, 386 insertions(+) create mode 100644 train/t0/t0_test.slurm create mode 100644 train/t0/tr11f-6B3-ml-t0.slurm diff --git a/train/t0/t0_test.slurm b/train/t0/t0_test.slurm new file mode 100644 index 00000000..1822667b --- /dev/null +++ b/train/t0/t0_test.slurm @@ -0,0 +1,193 @@ +#!/bin/bash +#SBATCH --job-name=tr11e-350M-ml-t0 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=40 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:1 # number of gpus +#SBATCH -C v100-32g +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=%x-%j.out # output file name +#SBATCH --account=six@v100 + +set -x -e + +source $six_ALL_CCFRWORK/start-muennighofflmeval +echo "START TIME: $(date)" + +variant=main + +DATA_OUTPUT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr11e-350M-ml +CHECKPOINT_PATH=$DATA_OUTPUT_PATH/checkpoints/$variant +REPO_PATH=$DATA_OUTPUT_PATH/tr11e-350M-ml-logs +TENSORBOARD_PATH=$REPO_PATH/tensorboard-test/$variant +LOGS_PATH=$REPO_PATH/logs-test/$variant +mkdir -p $LOGS_PATH + +MEGATRON_DEEPSPEED_REPO=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/megdsmtf/thomas/Megatron-DeepSpeed +cd $MEGATRON_DEEPSPEED_REPO + +BIGSCIENCE_REPO=$six_ALL_CCFRWORK/code/bigscience +TOKENIZER_NAME_OR_PATH=bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles + +# defining the right environment variables +export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models +export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets +export 
HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules +export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +# testing for potential faulty nodes +# srun --jobid $SLURM_JOBID bash -c 'python -c "import torch, socket; print(socket.gethostname(), torch.cuda.is_available())"' + +# so processes know who to talk to +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6001 + +GPUS_PER_NODE=1 +NNODES=1 + +PP_SIZE=1 +TP_SIZE=1 + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=4 + +NLAYERS=2 +NHIDDEN=1024 +NHEADS=16 +SEQ_LEN=256 + +SAVE_INTERVAL=250 + +TRAIN_SAMPLES=10 # TODO +LR_DECAY_SAMPLES=10 # TODO +LR_WARMUP_SAMPLES=1 # TODO + + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 3.0e-4 \ + --min-lr 1e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $LR_DECAY_SAMPLES \ + --lr-warmup-samples $LR_WARMUP_SAMPLES \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " +# for 20h 1190, for 100h 5990 +# --exit-duration-in-mins 1190 \ +EXIT_OPTS=" \ + --exit-duration-in-mins 5990 \ + " + +GPT_ARGS=" \ + --pp-partition-method 'type:transformer|embedding' \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \ + --init-method-std 0.0048 \ + --embed-layernorm \ + --fp16 \ + --seed 42 \ + --position-embedding-type alibi \ + --abort-on-unmet-fused-kernel-constraints \ + --pad-vocab-size-to 250880 \ + $OPTIMIZER_ARGS \ + $EXIT_OPTS \ + " + +OUTPUT_ARGS=" \ + --log-interval 1 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 1000 \ + --eval-iters 1 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + 
--log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 # important: bf16 must use z0! it implements its own zero stage 1 equivalent + +config_json="./ds_config.$SLURM_JOBID.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat <<EOT > $config_json +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${ZERO_STAGE} \ + --deepspeed-activation-checkpointing \ + " + +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + " + +DATA_PATH="/gpfswork/rech/six/commun/bigscience-training/p3t0/p3_t0_train" + + +export CMD=" \ + `pwd`/finetune_t0_non_causal_decoder.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --data-path $DATA_PATH \ + --split 100,0,0 \ + --dataloader-type single \ + --data-impl mmap \ + --distributed-backend nccl \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +# do not remove or the training will hang and nodes will be lost w/o this workaround +export CUDA_LAUNCH_BLOCKING=1 + +# hide duplicated errors using this hack - will be properly fixed in pt-1.12 +export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json + +clear; srun --jobid $SLURM_JOBID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID $CMD" 2>&1 | tee -a $LOGS_PATH/main_log.txt + +echo "END TIME: $(date)" diff --git 
a/train/t0/tr11f-6B3-ml-t0.slurm b/train/t0/tr11f-6B3-ml-t0.slurm new file mode 100644 index 00000000..b5cb3031 --- /dev/null +++ b/train/t0/tr11f-6B3-ml-t0.slurm @@ -0,0 +1,193 @@ +#!/bin/bash +#SBATCH --job-name=tr11f-6B3-ml-t0 +#SBATCH --partition=gpu_p5 +#SBATCH --constraint=a100 +#SBATCH --reservation=hug +#SBATCH --qos=qos_gpu-gc # up to 100h +#SBATCH --nodes=16 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=64 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --time 100:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=%x-%j.out # output file name +#SBATCH --account=six@a100 + +set -x -e + +source $six_ALL_CCFRWORK/start-tr11f-6B3-ml-t0 # TODO +echo "START TIME: $(date)" + +variant=main + +DATA_OUTPUT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr11f-6B3-ml-t0 +CHECKPOINT_PATH=$DATA_OUTPUT_PATH/checkpoints/$variant +REPO_PATH=$DATA_OUTPUT_PATH/tr11f-6B3-ml-t0-logs +TENSORBOARD_PATH=$REPO_PATH/tensorboard/$variant +LOGS_PATH=$REPO_PATH/logs/$variant +mkdir -p $LOGS_PATH + +MEGATRON_DEEPSPEED_REPO=$six_ALL_CCFRWORK/code/tr11f-6B3-ml-t0/Megatron-DeepSpeed # TODO +cd $MEGATRON_DEEPSPEED_REPO + +BIGSCIENCE_REPO=$six_ALL_CCFRWORK/code/bigscience +TOKENIZER_NAME_OR_PATH=bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles + +# defining the right environment variables +export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models +export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets +export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules +export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +# testing for potential faulty nodes +# srun --jobid $SLURM_JOBID bash -c 'python -c "import torch, socket; print(socket.gethostname(), torch.cuda.is_available())"' + +# so processes know who to talk to 
+MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6001 + +GPUS_PER_NODE=8 +NNODES=$SLURM_NNODES + +PP_SIZE=1 +TP_SIZE=1 + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=512 + +NLAYERS=30 +NHIDDEN=4096 +NHEADS=32 +SEQ_LEN=2048 + +SAVE_INTERVAL=500 + +TRAIN_SAMPLES=6_400_000 # 13e9 / 2048 +LR_WARMUP_SAMPLES=640_000 # 10% + + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 3.0e-4 \ + --lr-decay-style constant \ + --lr-warmup-samples $LR_WARMUP_SAMPLES \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " +# for 20h 1190, for 100h 5990 +# --exit-duration-in-mins 1190 \ +EXIT_OPTS=" \ + --exit-duration-in-mins 5990 \ + " + +GPT_ARGS=" \ + --pp-partition-method 'type:transformer|embedding' \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \ + --init-method-std 0.0048 \ + --embed-layernorm \ + --fp16 \ + --seed 42 \ + --position-embedding-type alibi \ + --abort-on-unmet-fused-kernel-constraints \ + --pad-vocab-size-to 250880 \ + $OPTIMIZER_ARGS \ + $EXIT_OPTS \ + " + +OUTPUT_ARGS=" \ + --log-interval 1 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 1000 \ + --eval-iters 1 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=1 # important: bf16 must use z0! 
it implements its own zero stage 1 equivalent + +config_json="./ds_config.$SLURM_JOBID.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat <<EOT > $config_json +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${ZERO_STAGE} \ + --deepspeed-activation-checkpointing \ + " + +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + " + +DATA_PATH="/gpfswork/rech/six/commun/bigscience-training/p3t0/p3_t0_train" + + +export CMD=" \ + `pwd`/finetune_t0_non_causal_decoder.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --data-path $DATA_PATH \ + --split 95,5,0 \ + --dataloader-type single \ + --data-impl mmap \ + --distributed-backend nccl \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +# do not remove or the training will hang and nodes will be lost w/o this workaround +export CUDA_LAUNCH_BLOCKING=1 + +# hide duplicated errors using this hack - will be properly fixed in pt-1.12 +export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json + +clear; srun --jobid $SLURM_JOBID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID $CMD" 2>&1 | tee -a $LOGS_PATH/main_log.txt + +echo "END TIME: $(date)" From 3b8cc314014cb98b2141f109f515f40b957a2bb9 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Mon, 4 Jul 2022 11:45:19 +0200 Subject: [PATCH 02/13] Add T0 specific args --- 
train/t0/tr11f-6B3-ml-t0.slurm | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/train/t0/tr11f-6B3-ml-t0.slurm b/train/t0/tr11f-6B3-ml-t0.slurm index b5cb3031..6ed37af7 100644 --- a/train/t0/tr11f-6B3-ml-t0.slurm +++ b/train/t0/tr11f-6B3-ml-t0.slurm @@ -54,6 +54,10 @@ NNODES=$SLURM_NNODES PP_SIZE=1 TP_SIZE=1 +# T0 paper: +# ...truncate input and target sequences to 1024 and 256 tokens... +# ...use a batch size of 1024 sequences ... 2^20 total input tokens per batch... +# We use 2048 total tokens and 512 batch size = 2**20 MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=512 @@ -65,15 +69,16 @@ SEQ_LEN=2048 SAVE_INTERVAL=500 TRAIN_SAMPLES=6_400_000 # 13e9 / 2048 -LR_WARMUP_SAMPLES=640_000 # 10% - +LR_WARMUP_SAMPLES=640_000 # 10% - TODO: T0 paper says nothing about warmup +# T0 paper: +# "...we use a learning rate of 1e-3..." OPTIMIZER_ARGS=" \ --optimizer adam \ --adam-beta1 0.9 \ --adam-beta2 0.95 \ --adam-eps 1e-8 \ - --lr 3.0e-4 \ + --lr 1e-3 \ --lr-decay-style constant \ --lr-warmup-samples $LR_WARMUP_SAMPLES \ --clip-grad 1.0 \ @@ -172,6 +177,8 @@ export CMD=" \ --pipeline-model-parallel-size $PP_SIZE \ $GPT_ARGS \ $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ --data-path $DATA_PATH \ --split 95,5,0 \ --dataloader-type single \ From 058e60b3ddc0f063cebe50445678956b1cf952d2 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Mon, 4 Jul 2022 13:02:49 +0200 Subject: [PATCH 03/13] Remove warmup --- train/t0/tr11f-6B3-ml-t0.slurm | 2 -- 1 file changed, 2 deletions(-) diff --git a/train/t0/tr11f-6B3-ml-t0.slurm b/train/t0/tr11f-6B3-ml-t0.slurm index 6ed37af7..0aaa3f74 100644 --- a/train/t0/tr11f-6B3-ml-t0.slurm +++ b/train/t0/tr11f-6B3-ml-t0.slurm @@ -69,7 +69,6 @@ SEQ_LEN=2048 SAVE_INTERVAL=500 TRAIN_SAMPLES=6_400_000 # 13e9 / 2048 -LR_WARMUP_SAMPLES=640_000 # 10% - TODO: T0 paper says nothing about warmup # T0 paper: # "...we use a learning rate of 1e-3..." 
@@ -80,7 +79,6 @@ OPTIMIZER_ARGS=" \ --adam-eps 1e-8 \ --lr 1e-3 \ --lr-decay-style constant \ - --lr-warmup-samples $LR_WARMUP_SAMPLES \ --clip-grad 1.0 \ --weight-decay 1e-1 \ " From d93e16914f7e6b7a476745c07a6e91d022e612d1 Mon Sep 17 00:00:00 2001 From: Niklas Muennighoff Date: Mon, 4 Jul 2022 15:33:29 +0200 Subject: [PATCH 04/13] Update train/t0/tr11f-6B3-ml-t0.slurm Co-authored-by: Thomas Wang <24695242+thomasw21@users.noreply.github.com> --- train/t0/tr11f-6B3-ml-t0.slurm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train/t0/tr11f-6B3-ml-t0.slurm b/train/t0/tr11f-6B3-ml-t0.slurm index 0aaa3f74..f879cf3f 100644 --- a/train/t0/tr11f-6B3-ml-t0.slurm +++ b/train/t0/tr11f-6B3-ml-t0.slurm @@ -31,7 +31,7 @@ MEGATRON_DEEPSPEED_REPO=$six_ALL_CCFRWORK/code/tr11f-6B3-ml-t0/Megatron-DeepSpee cd $MEGATRON_DEEPSPEED_REPO BIGSCIENCE_REPO=$six_ALL_CCFRWORK/code/bigscience -TOKENIZER_NAME_OR_PATH=bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles +TOKENIZER_NAME_OR_PATH=bigscience/tokenizer # defining the right environment variables export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models From 78791e28284e071a70d8a34e771ab3d3c84c4123 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Mon, 4 Jul 2022 16:25:18 +0200 Subject: [PATCH 05/13] t0 -> tr13 --- train/README.md | 2 ++ train/{t0 => tr13-t0}/t0_test.slurm | 0 .../tr13f-6B3-ml-t0.slurm} | 21 ++++++++++--------- 3 files changed, 13 insertions(+), 10 deletions(-) rename train/{t0 => tr13-t0}/t0_test.slurm (100%) rename train/{t0/tr11f-6B3-ml-t0.slurm => tr13-t0/tr13f-6B3-ml-t0.slurm} (91%) diff --git a/train/README.md b/train/README.md index 67dc573c..e3a2b6ad 100644 --- a/train/README.md +++ b/train/README.md @@ -36,3 +36,5 @@ Location of the checkpoints of the trained models plus logs and anything else of - tr9b-350M-swiglu: `six_ALL_CCFRSTORE/checkpoints/tr9b-350M-swiglu` - tr9c-1B3-swiglu-pile: 
`six_ALL_CCFRSTORE/checkpoints/tr9b-1B3-swiglu-pile` + +- tr13: Multi-Task Fine-tuning (T0) diff --git a/train/t0/t0_test.slurm b/train/tr13-t0/t0_test.slurm similarity index 100% rename from train/t0/t0_test.slurm rename to train/tr13-t0/t0_test.slurm diff --git a/train/t0/tr11f-6B3-ml-t0.slurm b/train/tr13-t0/tr13f-6B3-ml-t0.slurm similarity index 91% rename from train/t0/tr11f-6B3-ml-t0.slurm rename to train/tr13-t0/tr13f-6B3-ml-t0.slurm index 0aaa3f74..5a9fa425 100644 --- a/train/t0/tr11f-6B3-ml-t0.slurm +++ b/train/tr13-t0/tr13f-6B3-ml-t0.slurm @@ -1,5 +1,5 @@ #!/bin/bash -#SBATCH --job-name=tr11f-6B3-ml-t0 +#SBATCH --job-name=tr13f-6B3-ml-t0 #SBATCH --partition=gpu_p5 #SBATCH --constraint=a100 #SBATCH --reservation=hug @@ -15,22 +15,22 @@ set -x -e -source $six_ALL_CCFRWORK/start-tr11f-6B3-ml-t0 # TODO +source $six_ALL_CCFRWORK/start-tr13f-6B3-ml-t0 # TODO echo "START TIME: $(date)" variant=main -DATA_OUTPUT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr11f-6B3-ml-t0 +DATA_OUTPUT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr13f-6B3-ml-t0 CHECKPOINT_PATH=$DATA_OUTPUT_PATH/checkpoints/$variant -REPO_PATH=$DATA_OUTPUT_PATH/tr11f-6B3-ml-t0-logs +REPO_PATH=$DATA_OUTPUT_PATH/tr13f-6B3-ml-t0-logs TENSORBOARD_PATH=$REPO_PATH/tensorboard/$variant LOGS_PATH=$REPO_PATH/logs/$variant mkdir -p $LOGS_PATH -MEGATRON_DEEPSPEED_REPO=$six_ALL_CCFRWORK/code/tr11f-6B3-ml-t0/Megatron-DeepSpeed # TODO +MEGATRON_DEEPSPEED_REPO=$six_ALL_CCFRWORK/code/tr13f-6B3-ml-t0/Megatron-DeepSpeed # TODO cd $MEGATRON_DEEPSPEED_REPO -BIGSCIENCE_REPO=$six_ALL_CCFRWORK/code/bigscience +BIGSCIENCE_REPO=$six_ALL_CCFRWORK/code//tr11f-6B3-ml-t0/bigscience TOKENIZER_NAME_OR_PATH=bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles # defining the right environment variables @@ -66,9 +66,9 @@ NHIDDEN=4096 NHEADS=32 SEQ_LEN=2048 -SAVE_INTERVAL=500 +SAVE_INTERVAL=1000 -TRAIN_SAMPLES=6_400_000 # 13e9 / 2048 +TRAIN_SAMPLES=6_348_800 # 13e9 / 2048 # T0 
paper: # "...we use a learning rate of 1e-3..." @@ -79,6 +79,7 @@ OPTIMIZER_ARGS=" \ --adam-eps 1e-8 \ --lr 1e-3 \ --lr-decay-style constant \ + --lr-warmup-samples 0 \ --clip-grad 1.0 \ --weight-decay 1e-1 \ " @@ -115,7 +116,7 @@ OUTPUT_ARGS=" \ --log-interval 1 \ --save-interval $SAVE_INTERVAL \ --eval-interval 1000 \ - --eval-iters 1 \ + --eval-iters 10 \ --tensorboard-dir $TENSORBOARD_PATH \ --tensorboard-queue-size 5 \ --log-timers-to-tensorboard \ @@ -123,7 +124,7 @@ OUTPUT_ARGS=" \ --log-validation-ppl-to-tensorboard \ " -ZERO_STAGE=1 # important: bf16 must use z0! it implements its own zero stage 1 equivalent +ZERO_STAGE=1 config_json="./ds_config.$SLURM_JOBID.json" From 2fc43b013a32c8364ca56929661da53c289c1128 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Tue, 5 Jul 2022 08:53:51 +0200 Subject: [PATCH 06/13] Use weighted-split-path --- train/tr13-t0/t0_test.slurm | 14 +++++++++----- train/tr13-t0/tr13f-6B3-ml-t0.slurm | 11 +++++------ 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/train/tr13-t0/t0_test.slurm b/train/tr13-t0/t0_test.slurm index 1822667b..f93b3ded 100644 --- a/train/tr13-t0/t0_test.slurm +++ b/train/tr13-t0/t0_test.slurm @@ -24,10 +24,12 @@ TENSORBOARD_PATH=$REPO_PATH/tensorboard-test/$variant LOGS_PATH=$REPO_PATH/logs-test/$variant mkdir -p $LOGS_PATH -MEGATRON_DEEPSPEED_REPO=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/megdsmtf/thomas/Megatron-DeepSpeed +MEGATRON_DEEPSPEED_REPO=/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/megdsmtf/thomas2/Megatron-DeepSpeed cd $MEGATRON_DEEPSPEED_REPO BIGSCIENCE_REPO=$six_ALL_CCFRWORK/code/bigscience +TRAIN_DATA_PATH=$MEGATRON_DEEPSPEED_REPO/p3_train.txt +VALID_DATA_PATH=$MEGATRON_DEEPSPEED_REPO/p3_validation.txt TOKENIZER_NAME_OR_PATH=bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles # defining the right environment variables @@ -163,8 +165,10 @@ export LAUNCHER="python -u -m 
torch.distributed.run \ --tee 3 \ " -DATA_PATH="/gpfswork/rech/six/commun/bigscience-training/p3t0/p3_t0_train" - +# Data loading option 1: +#DATA_PATH="/gpfswork/rech/six/commun/bigscience-training/p3t0/p3_t0_train" +# --data-path $DATA_PATH \ +# --split 100,0,0 \ export CMD=" \ `pwd`/finetune_t0_non_causal_decoder.py \ @@ -172,8 +176,8 @@ export CMD=" \ --pipeline-model-parallel-size $PP_SIZE \ $GPT_ARGS \ $OUTPUT_ARGS \ - --data-path $DATA_PATH \ - --split 100,0,0 \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ --dataloader-type single \ --data-impl mmap \ --distributed-backend nccl \ diff --git a/train/tr13-t0/tr13f-6B3-ml-t0.slurm b/train/tr13-t0/tr13f-6B3-ml-t0.slurm index 668b76d1..893ca7d4 100644 --- a/train/tr13-t0/tr13f-6B3-ml-t0.slurm +++ b/train/tr13-t0/tr13f-6B3-ml-t0.slurm @@ -30,7 +30,9 @@ mkdir -p $LOGS_PATH MEGATRON_DEEPSPEED_REPO=$six_ALL_CCFRWORK/code/tr13f-6B3-ml-t0/Megatron-DeepSpeed # TODO cd $MEGATRON_DEEPSPEED_REPO -BIGSCIENCE_REPO=$six_ALL_CCFRWORK/code/tr11f-6B3-ml-t0/bigscience # TODO +BIGSCIENCE_REPO=$six_ALL_CCFRWORK/code/tr13f-6B3-ml-t0/bigscience # TODO +TRAIN_DATA_PATH=$MEGATRON_DEEPSPEED_REPO/data/train-splits-6B3.txt +VALID_DATA_PATH=$MEGATRON_DEEPSPEED_REPO/data/valid-splits-6B3.txt TOKENIZER_NAME_OR_PATH=bigscience/tokenizer # defining the right environment variables @@ -167,9 +169,6 @@ export LAUNCHER="python -u -m torch.distributed.run \ --tee 3 \ " -DATA_PATH="/gpfswork/rech/six/commun/bigscience-training/p3t0/p3_t0_train" - - export CMD=" \ `pwd`/finetune_t0_non_causal_decoder.py \ --tensor-model-parallel-size $TP_SIZE \ @@ -178,8 +177,8 @@ export CMD=" \ $OUTPUT_ARGS \ --save $CHECKPOINT_PATH \ --load $CHECKPOINT_PATH \ - --data-path $DATA_PATH \ - --split 95,5,0 \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ --dataloader-type single \ --data-impl mmap \ --distributed-backend nccl \ From 
41c1ef45d95130731a17449e415592a1d1e17af0 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Sat, 9 Jul 2022 09:10:19 +0200 Subject: [PATCH 07/13] Add 350M script & adjust HPs --- train/tr13-t0/tr13f-6B3-ml-t0.slurm | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/train/tr13-t0/tr13f-6B3-ml-t0.slurm b/train/tr13-t0/tr13f-6B3-ml-t0.slurm index 893ca7d4..a3d078ba 100644 --- a/train/tr13-t0/tr13f-6B3-ml-t0.slurm +++ b/train/tr13-t0/tr13f-6B3-ml-t0.slurm @@ -4,7 +4,7 @@ #SBATCH --constraint=a100 #SBATCH --reservation=hug #SBATCH --qos=qos_gpu-gc # up to 100h -#SBATCH --nodes=16 +#SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! #SBATCH --cpus-per-task=64 # number of cores per tasks #SBATCH --hint=nomultithread # we get physical cores not logical @@ -15,7 +15,7 @@ set -x -e -source $six_ALL_CCFRWORK/start-tr13f-6B3-ml-t0 # TODO +source $six_ALL_CCFRWORK/start-tr13f-6B3-ml-t0 echo "START TIME: $(date)" variant=main @@ -27,12 +27,12 @@ TENSORBOARD_PATH=$REPO_PATH/tensorboard/$variant LOGS_PATH=$REPO_PATH/logs/$variant mkdir -p $LOGS_PATH -MEGATRON_DEEPSPEED_REPO=$six_ALL_CCFRWORK/code/tr13f-6B3-ml-t0/Megatron-DeepSpeed # TODO +MEGATRON_DEEPSPEED_REPO=$six_ALL_CCFRWORK/code/tr13f-6B3-ml-t0/Megatron-DeepSpeed cd $MEGATRON_DEEPSPEED_REPO -BIGSCIENCE_REPO=$six_ALL_CCFRWORK/code/tr13f-6B3-ml-t0/bigscience # TODO -TRAIN_DATA_PATH=$MEGATRON_DEEPSPEED_REPO/data/train-splits-6B3.txt -VALID_DATA_PATH=$MEGATRON_DEEPSPEED_REPO/data/valid-splits-6B3.txt +BIGSCIENCE_REPO=$six_ALL_CCFRWORK/code/tr13f-6B3-ml-t0/bigscience +TRAIN_DATA_PATH=$MEGATRON_DEEPSPEED_REPO/data/p3_train.txt +VALID_DATA_PATH=$MEGATRON_DEEPSPEED_REPO/data/p3_validation.txt TOKENIZER_NAME_OR_PATH=bigscience/tokenizer # defining the right environment variables @@ -70,7 +70,7 @@ SEQ_LEN=2048 SAVE_INTERVAL=1000 -TRAIN_SAMPLES=6_348_800 # 13e9 / 2048 +TRAIN_SAMPLES=6_348_800 # T0 paper: # "...we use a learning rate of 1e-3..." 
@@ -79,11 +79,12 @@ OPTIMIZER_ARGS=" \ --adam-beta1 0.9 \ --adam-beta2 0.95 \ --adam-eps 1e-8 \ - --lr 1e-3 \ + --lr 1e-5 \ --lr-decay-style constant \ --lr-warmup-samples 0 \ --clip-grad 1.0 \ - --weight-decay 1e-1 \ + --weight-decay 0 \ + --no-load-optim \ " # for 20h 1190, for 100h 5990 # --exit-duration-in-mins 1190 \ @@ -175,6 +176,7 @@ export CMD=" \ --pipeline-model-parallel-size $PP_SIZE \ $GPT_ARGS \ $OUTPUT_ARGS \ + --finetune \ --save $CHECKPOINT_PATH \ --load $CHECKPOINT_PATH \ --train-weighted-split-paths-path $TRAIN_DATA_PATH \ From 877158ba51e9a78bfbc0171d071cae5550b38a66 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Sat, 9 Jul 2022 09:13:05 +0200 Subject: [PATCH 08/13] Adjust tr13f-350M --- train/tr13-t0/tr13f-350M-ml-t0.slurm | 200 +++++++++++++++++++++++++++ 1 file changed, 200 insertions(+) create mode 100644 train/tr13-t0/tr13f-350M-ml-t0.slurm diff --git a/train/tr13-t0/tr13f-350M-ml-t0.slurm b/train/tr13-t0/tr13f-350M-ml-t0.slurm new file mode 100644 index 00000000..5dca4e8f --- /dev/null +++ b/train/tr13-t0/tr13f-350M-ml-t0.slurm @@ -0,0 +1,200 @@ +#!/bin/bash +#SBATCH --job-name=tr13e-350M-ml-t0 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
+#SBATCH --cpus-per-task=10 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:1 # number of gpus +#SBATCH -C v100-32g +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=%x-%j.out # output file name +#SBATCH --account=six@v100 + +# THIS SCRIPT IS FOR TESTING PURPOSES +# T0 FINE-TUNING IS NOT BENEFICIAL FOR SMALL MODELS + +set -x -e + +source $six_ALL_CCFRWORK/start-tr13f-6B3-ml-t0 +echo "START TIME: $(date)" + +variant=main + +DATA_OUTPUT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr13f-350M-ml-t0 +CHECKPOINT_PATH=$DATA_OUTPUT_PATH/checkpoints/$variant +REPO_PATH=$DATA_OUTPUT_PATH/tr13f-350M-ml-t0-logs +TENSORBOARD_PATH=$REPO_PATH/tensorboard/$variant +LOGS_PATH=$REPO_PATH/logs/$variant +mkdir -p $LOGS_PATH + +MEGATRON_DEEPSPEED_REPO=$six_ALL_CCFRWORK/code/tr13f-6B3-ml-t0/Megatron-DeepSpeed +cd $MEGATRON_DEEPSPEED_REPO + +BIGSCIENCE_REPO=$six_ALL_CCFRWORK/code/tr13f-6B3-ml-t0/bigscience +TRAIN_DATA_PATH=$MEGATRON_DEEPSPEED_REPO/data/p3_train.txt +VALID_DATA_PATH=$MEGATRON_DEEPSPEED_REPO/data/p3_validation.txt +TOKENIZER_NAME_OR_PATH=bigscience/tokenizer + +# defining the right environment variables +export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models +export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets +export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules +export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +# testing for potential faulty nodes +# srun --jobid $SLURM_JOBID bash -c 'python -c "import torch, socket; print(socket.gethostname(), torch.cuda.is_available())"' + +# so processes know who to talk to +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6001 + +GPUS_PER_NODE=1 +NNODES=$SLURM_NNODES + +PP_SIZE=1 +TP_SIZE=1 + +# T0 paper: +# ...truncate input and target sequences to 1024 and 256 tokens... +# ...use a batch size of 1024 sequences ... 2^20 total input tokens per batch... 
+# We use 2048 total tokens and 512 batch size = 2**20 +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=512 + +NLAYERS=24 +NHIDDEN=1024 +NHEADS=16 +SEQ_LEN=2048 + +SAVE_INTERVAL=1000 + +TRAIN_SAMPLES=6_348_800 + +# T0 paper: +# "...we use a learning rate of 1e-3..." +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 1e-5 \ + --lr-decay-style constant \ + --lr-warmup-samples 0 \ + --clip-grad 1.0 \ + --weight-decay 0 \ + --no-load-optim \ + " +# for 20h 1190, for 100h 5990 +# --exit-duration-in-mins 1190 \ +EXIT_OPTS=" \ + --exit-duration-in-mins 5990 \ + " + +GPT_ARGS=" \ + --pp-partition-method 'type:transformer|embedding' \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \ + --init-method-std 0.0048 \ + --embed-layernorm \ + --fp16 \ + --seed 42 \ + --position-embedding-type alibi \ + --abort-on-unmet-fused-kernel-constraints \ + --pad-vocab-size-to 250880 \ + $OPTIMIZER_ARGS \ + $EXIT_OPTS \ + " + +OUTPUT_ARGS=" \ + --log-interval 1 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 1000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=1 + +config_json="./ds_config.$SLURM_JOBID.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat <<EOT > $config_json +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + 
"loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${ZERO_STAGE} \ + --deepspeed-activation-checkpointing \ + " + +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + " + +export CMD=" \ + `pwd`/finetune_t0_non_causal_decoder.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --finetune \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --dataloader-type single \ + --data-impl mmap \ + --distributed-backend nccl \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +# do not remove or the training will hang and nodes will be lost w/o this workaround +export CUDA_LAUNCH_BLOCKING=1 + +# hide duplicated errors using this hack - will be properly fixed in pt-1.12 +export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json + +clear; srun --jobid $SLURM_JOBID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID $CMD" 2>&1 | tee -a $LOGS_PATH/main_log.txt + +echo "END TIME: $(date)" From 5674c69b44e10cad32109bc63ebcfff9a2efad27 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Sun, 10 Jul 2022 13:04:41 +0200 Subject: [PATCH 09/13] Add weight decay based on FLAN --- train/tr13-t0/tr13f-6B3-ml-t0.slurm | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/train/tr13-t0/tr13f-6B3-ml-t0.slurm b/train/tr13-t0/tr13f-6B3-ml-t0.slurm index a3d078ba..17dd4327 100644 --- a/train/tr13-t0/tr13f-6B3-ml-t0.slurm +++ b/train/tr13-t0/tr13f-6B3-ml-t0.slurm @@ -4,7 +4,7 @@ #SBATCH --constraint=a100 #SBATCH --reservation=hug 
#SBATCH --qos=qos_gpu-gc # up to 100h -#SBATCH --nodes=1 +#SBATCH --nodes=32 #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! #SBATCH --cpus-per-task=64 # number of cores per tasks #SBATCH --hint=nomultithread # we get physical cores not logical @@ -31,8 +31,8 @@ MEGATRON_DEEPSPEED_REPO=$six_ALL_CCFRWORK/code/tr13f-6B3-ml-t0/Megatron-DeepSpee cd $MEGATRON_DEEPSPEED_REPO BIGSCIENCE_REPO=$six_ALL_CCFRWORK/code/tr13f-6B3-ml-t0/bigscience -TRAIN_DATA_PATH=$MEGATRON_DEEPSPEED_REPO/data/p3_train.txt -VALID_DATA_PATH=$MEGATRON_DEEPSPEED_REPO/data/p3_validation.txt +TRAIN_DATA_PATH=$MEGATRON_DEEPSPEED_REPO/data/p31_train.txt +VALID_DATA_PATH=$MEGATRON_DEEPSPEED_REPO/data/p31_validation.txt TOKENIZER_NAME_OR_PATH=bigscience/tokenizer # defining the right environment variables @@ -74,6 +74,10 @@ TRAIN_SAMPLES=6_348_800 # T0 paper: # "...we use a learning rate of 1e-3..." +# However, they use Adafactor, which adapts the LR +# For Adam we likely want a lower one +# FLAN: +# "...decay of 1e-4.."" OPTIMIZER_ARGS=" \ --optimizer adam \ --adam-beta1 0.9 \ @@ -83,7 +87,7 @@ OPTIMIZER_ARGS=" \ --lr-decay-style constant \ --lr-warmup-samples 0 \ --clip-grad 1.0 \ - --weight-decay 0 \ + --weight-decay 1e-4 \ --no-load-optim \ " # for 20h 1190, for 100h 5990 From 582ee32abe7cc8b9ca788a78d55f2a0972910c81 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Mon, 11 Jul 2022 11:09:50 +0200 Subject: [PATCH 10/13] Remove finetune & add checkpoint-activations --- train/tr13-t0/tr13f-350M-ml-t0.slurm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train/tr13-t0/tr13f-350M-ml-t0.slurm b/train/tr13-t0/tr13f-350M-ml-t0.slurm index 5dca4e8f..eb96a5ba 100644 --- a/train/tr13-t0/tr13f-350M-ml-t0.slurm +++ b/train/tr13-t0/tr13f-350M-ml-t0.slurm @@ -109,6 +109,7 @@ GPT_ARGS=" \ --fp16 \ --seed 42 \ --position-embedding-type alibi \ + --checkpoint-activations \ --abort-on-unmet-fused-kernel-constraints \ --pad-vocab-size-to 250880 \ $OPTIMIZER_ARGS \ @@ -176,7 
+177,6 @@ export CMD=" \ --pipeline-model-parallel-size $PP_SIZE \ $GPT_ARGS \ $OUTPUT_ARGS \ - --finetune \ --save $CHECKPOINT_PATH \ --load $CHECKPOINT_PATH \ --train-weighted-split-paths-path $TRAIN_DATA_PATH \ From d5b16226c7a0dbeb82d06d24f914bdeac3828f1f Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Mon, 11 Jul 2022 16:39:06 +0200 Subject: [PATCH 11/13] Remove finetune --- train/tr13-t0/tr13f-6B3-ml-t0.slurm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train/tr13-t0/tr13f-6B3-ml-t0.slurm b/train/tr13-t0/tr13f-6B3-ml-t0.slurm index 17dd4327..69797648 100644 --- a/train/tr13-t0/tr13f-6B3-ml-t0.slurm +++ b/train/tr13-t0/tr13f-6B3-ml-t0.slurm @@ -113,6 +113,7 @@ GPT_ARGS=" \ --fp16 \ --seed 42 \ --position-embedding-type alibi \ + --checkpoint-activations \ --abort-on-unmet-fused-kernel-constraints \ --pad-vocab-size-to 250880 \ $OPTIMIZER_ARGS \ @@ -180,7 +181,6 @@ export CMD=" \ --pipeline-model-parallel-size $PP_SIZE \ $GPT_ARGS \ $OUTPUT_ARGS \ - --finetune \ --save $CHECKPOINT_PATH \ --load $CHECKPOINT_PATH \ --train-weighted-split-paths-path $TRAIN_DATA_PATH \ From 56729d3bfdd5324652044d2bfd0f41b2565fb70f Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Thu, 14 Jul 2022 12:30:53 +0200 Subject: [PATCH 12/13] Change BS for more throughput & increase LR --- train/tr13-t0/tr13f-6B3-ml-t0.slurm | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/train/tr13-t0/tr13f-6B3-ml-t0.slurm b/train/tr13-t0/tr13f-6B3-ml-t0.slurm index 69797648..67ea6722 100644 --- a/train/tr13-t0/tr13f-6B3-ml-t0.slurm +++ b/train/tr13-t0/tr13f-6B3-ml-t0.slurm @@ -4,7 +4,7 @@ #SBATCH --constraint=a100 #SBATCH --reservation=hug #SBATCH --qos=qos_gpu-gc # up to 100h -#SBATCH --nodes=32 +#SBATCH --nodes=8 #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
#SBATCH --cpus-per-task=64 # number of cores per tasks #SBATCH --hint=nomultithread # we get physical cores not logical @@ -59,9 +59,10 @@ TP_SIZE=1 # T0 paper: # ...truncate input and target sequences to 1024 and 256 tokens... # ...use a batch size of 1024 sequences ... 2^20 total input tokens per batch... -# We use 2048 total tokens and 512 batch size = 2**20 +# We use 2048 total tokens and 512 batch size = 2**20 (1M) +# We changed to 2048 to get more throughput, so 4.2M tokens / batch MICRO_BATCH_SIZE=1 -GLOBAL_BATCH_SIZE=512 +GLOBAL_BATCH_SIZE=2048 NLAYERS=30 NHIDDEN=4096 @@ -78,12 +79,15 @@ TRAIN_SAMPLES=6_348_800 # For Adam we likely want a lower one # FLAN: # "...decay of 1e-4.."" +# We use 2e-5 due to overflows in fp16 with higher rates +# Remove after the first step in case of restarting: +# --no-load-optim \ OPTIMIZER_ARGS=" \ --optimizer adam \ --adam-beta1 0.9 \ --adam-beta2 0.95 \ --adam-eps 1e-8 \ - --lr 1e-5 \ + --lr 2e-5 \ --lr-decay-style constant \ --lr-warmup-samples 0 \ --clip-grad 1.0 \ From 318dfc79484c996c3518d25bb210aea9fec8dd4e Mon Sep 17 00:00:00 2001 From: Niklas Muennighoff Date: Sat, 16 Jul 2022 18:10:56 +0200 Subject: [PATCH 13/13] Update train/tr13-t0/t0_test.slurm Co-authored-by: Thomas Wang <24695242+thomasw21@users.noreply.github.com> --- train/tr13-t0/t0_test.slurm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train/tr13-t0/t0_test.slurm b/train/tr13-t0/t0_test.slurm index f93b3ded..25e7bd8c 100644 --- a/train/tr13-t0/t0_test.slurm +++ b/train/tr13-t0/t0_test.slurm @@ -17,7 +17,7 @@ echo "START TIME: $(date)" variant=main -DATA_OUTPUT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr11e-350M-ml +DATA_OUTPUT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr13-test-ml CHECKPOINT_PATH=$DATA_OUTPUT_PATH/checkpoints/$variant REPO_PATH=$DATA_OUTPUT_PATH/tr11e-350M-ml-logs TENSORBOARD_PATH=$REPO_PATH/tensorboard-test/$variant