Closed
150 commits
d2c35fc
added train script but with prefix manually declared
May 7, 2022
f977b85
made new dataset
May 9, 2022
fcfbf17
minor adjustments
May 9, 2022
870dfd8
added capabilities for padding and prefix lm index
lintangsutawika May 9, 2022
791bbd0
added finetune script
lintangsutawika May 9, 2022
0f44b92
removed script
lintangsutawika May 9, 2022
2ff0815
added adjustments and new dataset
May 9, 2022
f0a79f6
try mlm dataset
May 9, 2022
eb416c7
minor changes
May 9, 2022
c0bc21b
minor addition of import packages
May 9, 2022
82e824c
minor error fix
May 9, 2022
7bb17ec
minor error fix
May 9, 2022
9929766
samples follow how gpt dataset is loaded
May 9, 2022
861c41f
added masked_lm_prob
May 9, 2022
fe95115
fixed tokenizer abstractions for HF tokenizer
May 9, 2022
8ea5943
added mask id
May 9, 2022
aa0d146
added mask id
May 9, 2022
215e8cc
added mask id
May 9, 2022
b6eef43
added mask id
May 9, 2022
bfc73a5
added fix
May 9, 2022
1890f87
added bos and eos token id
May 9, 2022
01392a9
no need for sentinal token
May 9, 2022
923decb
add aux functions
May 9, 2022
4611d67
add aux functions
May 9, 2022
4356de3
add aux functions
May 9, 2022
f31c686
add pad_id
May 9, 2022
a3951e8
changed lm predictions to t5
May 18, 2022
97b9a92
changed lm predictions to t5
May 18, 2022
fe73a73
changed lm predictions to t5
May 18, 2022
6a9cb75
changed lm predictions to t5
May 18, 2022
469848f
changed lm predictions to t5
May 18, 2022
e68283f
tokenizer add mask, cls, sep tokens
May 18, 2022
476ae94
commit latest changes
May 21, 2022
72ff575
commit latest changes
May 21, 2022
3647291
added sentinal tokens
May 21, 2022
fcdc987
added sentinal tokens
May 21, 2022
d6fbe78
added sentinal tokens
May 21, 2022
c44daba
added additional_special_tokens
May 21, 2022
a2725d8
added additional_special_tokens
May 21, 2022
0e94245
check t5_input and output
May 21, 2022
b599ab6
check decoder in and decoder out
May 21, 2022
626b0ae
made into input and output tokens
May 22, 2022
6008937
made into input and output tokens
May 22, 2022
c1524db
made into input and output tokens
May 22, 2022
c59c061
made into input and output tokens
May 22, 2022
e677e16
made into input and output tokens
May 22, 2022
9ffaeb9
made into input and output tokens
May 22, 2022
d0a6a2f
made into input and output tokens
May 22, 2022
47fd987
made into input and output tokens
May 23, 2022
4f377e8
made into input and output tokens
May 23, 2022
5c0bf76
added eos
May 23, 2022
7c63e4b
added eos
May 23, 2022
871124c
test text_token
May 24, 2022
55a593d
test text_token
May 24, 2022
adb59ca
test text_token
May 24, 2022
d71afb4
test text_token
May 24, 2022
7b99bb7
test text_token
May 24, 2022
922b09d
assigned array
May 24, 2022
469a02d
assigned array
May 24, 2022
15cb6a0
assigned array
May 24, 2022
5b0bc17
hardcoded sequence length
May 24, 2022
0671c79
check again
May 28, 2022
6db5c9b
show sentinal tokens
lintangsutawika May 28, 2022
8a58007
show sentinal tokens
lintangsutawika May 28, 2022
8b0bbc2
show sentinal tokens
lintangsutawika May 28, 2022
3d1b256
show sentinal tokens
lintangsutawika May 28, 2022
ce00fd9
add more special tokens
lintangsutawika May 28, 2022
3bcc50c
changed how mlm data is loaded
lintangsutawika May 28, 2022
76960f7
changed how mlm data is loaded
lintangsutawika May 28, 2022
229d661
changed how mlm data is loaded
lintangsutawika May 28, 2022
55e3df7
changed how mlm data is loaded
lintangsutawika May 28, 2022
05dea6d
changed how mlm data is loaded
lintangsutawika May 28, 2022
661c8bb
added new script
lintangsutawika May 28, 2022
97d3810
added new script
lintangsutawika May 28, 2022
71388ee
added new script
lintangsutawika May 28, 2022
b0f04d5
try t5 dataset
lintangsutawika May 28, 2022
cd43a54
try t5 dataset
lintangsutawika May 28, 2022
e0dc666
try t5 dataset
lintangsutawika May 28, 2022
866cee1
try t5 dataset
lintangsutawika May 28, 2022
0b56a7d
try t5 dataset
lintangsutawika May 28, 2022
5bb512b
try t5 dataset
lintangsutawika May 28, 2022
31d844f
try t5 dataset
lintangsutawika May 28, 2022
1d21963
try t5 dataset
lintangsutawika May 28, 2022
1429645
try t5 dataset
lintangsutawika May 28, 2022
f5341f8
try t5 dataset
lintangsutawika May 28, 2022
b05b175
try t5 dataset
lintangsutawika May 28, 2022
59a6e32
try t5 dataset
lintangsutawika May 28, 2022
ab76d49
developing
lintangsutawika May 28, 2022
0d8dfac
developing
lintangsutawika May 28, 2022
e629224
developing
lintangsutawika May 28, 2022
efcf50f
developing
lintangsutawika May 28, 2022
e5eb615
developing
lintangsutawika May 28, 2022
2eee807
developing
lintangsutawika May 28, 2022
5840a11
developing
lintangsutawika May 28, 2022
6d38f73
test to see output of get_ltor_masks_and_position_ids
lintangsutawika May 29, 2022
430fa6f
test to see output of get_ltor_masks_and_position_ids
lintangsutawika May 29, 2022
444314f
add new script
May 29, 2022
26c837d
add new script
May 29, 2022
feb023c
add new script
May 29, 2022
f30b9b1
changed settings
May 30, 2022
0a9203a
changed settings
May 30, 2022
672a866
tidy up
May 31, 2022
3780e61
changed tokenizer and position embedding
May 31, 2022
2130c31
modifying mlm to reflect original implementation
Jun 2, 2022
26afe43
minor fix
Jun 2, 2022
c1b9816
minor fix
Jun 2, 2022
453822f
minor fix
Jun 2, 2022
a62266a
minor fix
Jun 2, 2022
02dda79
minor fix
Jun 2, 2022
80331cb
minor fix
Jun 2, 2022
350227d
minor fix
Jun 2, 2022
d0eecd4
minor fix
Jun 2, 2022
243cebe
minor fix
Jun 2, 2022
da22e0b
minor fix
Jun 2, 2022
083dce7
minor fix
Jun 2, 2022
541e9d6
minor fix
Jun 2, 2022
86bfc8a
minor fix
Jun 2, 2022
e21a448
minor fix
Jun 2, 2022
f47d678
minor fix
Jun 2, 2022
415b8bc
minor fix
Jun 2, 2022
79bd6f8
minor fix
Jun 2, 2022
ba19fdf
minor fix
Jun 2, 2022
d200f4d
minor fix
Jun 2, 2022
102a461
minor fix
Jun 2, 2022
e530440
minor fix
Jun 2, 2022
2568039
minor fix
Jun 2, 2022
e6b4120
minor fix
Jun 2, 2022
fd7fe97
minor fix
Jun 2, 2022
861fc7b
minor fix
Jun 2, 2022
21c1984
minor fix
Jun 2, 2022
14e8d0f
minor fix
Jun 2, 2022
920343f
minor fix
Jun 2, 2022
a68873d
minor fix
Jun 2, 2022
5d43986
minor fix
Jun 2, 2022
79e8c1a
set correct seq len
Jun 2, 2022
786d252
refined sampling method
Jun 8, 2022
9110520
refined sampling method
Jun 8, 2022
7db34b9
refined sampling method
Jun 8, 2022
d946515
refined sampling method
Jun 8, 2022
bb4e656
refined sampling method
Jun 8, 2022
2e7161d
refined sampling method
Jun 8, 2022
00473e4
first commit, adding non causal mlm dataset
Jun 8, 2022
5992776
fixed mlm dataset
Jun 8, 2022
83f5dee
fixed mlm dataset
Jun 8, 2022
3235c2d
fixed mlm dataset
Jun 8, 2022
5449978
fixed mlm dataset
Jun 8, 2022
95c9851
fixed mlm dataset
Jun 8, 2022
9ff6172
Merge branch 'bigscience-workshop:main' into mt0
Jun 12, 2022
451318f
minor changes
Jun 14, 2022
edfaa19
Merge branch 'mt0' of https://github.com/lintangsutawika/Megatron-Dee…
Jun 14, 2022
157 changes: 157 additions & 0 deletions 4B8-en-CD-FLM.sh
@@ -0,0 +1,157 @@
#!/bin/bash

EXPERIMENT_NAME=4B8-en-CD-FLM
REPO_PATH=experiments/$EXPERIMENT_NAME
CHECKPOINT_PATH=$REPO_PATH/checkpoints
TENSORBOARD_PATH=$REPO_PATH/tensorboard
CODECARBON_PATH=$REPO_PATH/codecarbon
LOGS_PATH=$REPO_PATH/logs

DATA_PATH=data/meg-gpt2-oscar-en-10k_text_document

# XXX: edit me
GPUS_PER_NODE=8
NNODES=1
PP_SIZE=2 # NLAYERS must be a multiple of PP_SIZE here
TP_SIZE=1 # always fixed to the size of a single node
DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # will get derived automatically by trainer

MICRO_BATCH_SIZE=32
GLOBAL_BATCH_SIZE=2048
TRAIN_ITER=131_072
SEQ_LEN=626


NLAYERS=24
NHIDDEN=4096
NHEADS=64
FFN_HIDDEN_SIZE=10240
MAX_POSITION_EMBEDDING=1280

SAVE_INTERVAL=1500

OPTIMIZER_ARGS=" \
--optimizer adam \
--adam-beta1 0.9 \
--adam-beta2 0.999 \
--adam-eps 1e-8 \
--lr 2e-4 \
--min-lr 1e-5 \
--lr-decay-style cosine \
--clip-grad 1.0 \
--weight-decay 1e-1 \
"

EXIT_OPTS=" \
--exit-duration-in-mins 1190 \
"

GPT_ARGS=" \
--num-layers $NLAYERS \
--hidden-size $NHIDDEN \
--num-attention-heads $NHEADS \
--ffn-hidden-size $FFN_HIDDEN_SIZE \
--max-position-embeddings $SEQ_LEN \
--position-embedding-type alibi \
--seq-length $SEQ_LEN \
--micro-batch-size $MICRO_BATCH_SIZE \
--global-batch-size $GLOBAL_BATCH_SIZE \
--train-iters $TRAIN_ITER \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path bigscience/tokenizer \
--loss-scale 12 \
--clip-grad 1.0 \
--fp16 \
--checkpoint-activations \
$OPTIMIZER_ARGS \
$EXIT_OPTS \
"

OUTPUT_ARGS=" \
--log-interval 1 \
--save-interval $SAVE_INTERVAL \
--eval-interval $TRAIN_ITER \
--eval-iters 1 \
--tensorboard-dir $TENSORBOARD_PATH \
--tensorboard-queue-size 5 \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
"

ZERO_STAGE=1

config_json="./ds_config.json"

# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size()
cat <<EOT > $config_json
{
"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
"train_batch_size": $GLOBAL_BATCH_SIZE,
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
EOT


DEEPSPEED_ARGS=" \
--deepspeed \
--deepspeed_config ${config_json} \
--zero-stage ${ZERO_STAGE} \
--deepspeed-activation-checkpointing \
"

# export LAUNCHER="python -u -m torch.distributed.launch \
# --nproc_per_node $GPUS_PER_NODE \
# "
# # --nnodes $NNODES \
# # --master_addr $MASTER_ADDR \
# # --master_port $MASTER_PORT \

export CMD=" \
`pwd`/pretrain_gpt.py \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
$GPT_ARGS \
$OUTPUT_ARGS \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
$DEEPSPEED_ARGS \
"


# # clear old checkpoint as it'd mismatch while we sort things out
# rm -rf $SAVE_CHECKPOINT_PATH


echo $CMD

# We create the folder where the logs and codecarbon will be stored.
mkdir -p $REPO_PATH
mkdir -p $LOGS_PATH
# to debug - add echo (it exits and prints what it would have launched)

# python -u -m torch.distributed.launch \
# --nproc_per_node $GPUS_PER_NODE \
# $CMD

deepspeed --num_gpus $GPUS_PER_NODE \
$CMD

# srun '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $LOGS_PATH/main_log.txt
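
For illustration only (not code from this PR): the batch-size bookkeeping that the script's comments refer to — the `DP_SIZE` derivation and the note that DeepSpeed figures out gradient accumulation steps (GAS) via `set_train_batch_size()` — works out as in the small sketch below, using this script's own values.

```python
# Sketch only: the batch-size relationship the script comments rely on.
# DeepSpeed derives gradient accumulation steps (GAS) from these same quantities
# when given the global and per-GPU micro batch sizes.
GPUS_PER_NODE, NNODES = 8, 1
PP_SIZE, TP_SIZE = 2, 1
MICRO_BATCH_SIZE, GLOBAL_BATCH_SIZE = 32, 2048

DP_SIZE = (NNODES * GPUS_PER_NODE) // (PP_SIZE * TP_SIZE)     # 4 data-parallel replicas
GAS = GLOBAL_BATCH_SIZE // (MICRO_BATCH_SIZE * DP_SIZE)       # 16 accumulation steps
assert MICRO_BATCH_SIZE * DP_SIZE * GAS == GLOBAL_BATCH_SIZE  # 32 * 4 * 16 == 2048
```
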
156 changes: 156 additions & 0 deletions 4B8-en-ND-MLM.sh
@@ -0,0 +1,156 @@
#!/bin/bash

EXPERIMENT_NAME=4B8-en-ND-MLM
REPO_PATH=experiments/$EXPERIMENT_NAME
CHECKPOINT_PATH=$REPO_PATH/checkpoints
TENSORBOARD_PATH=$REPO_PATH/tensorboard
CODECARBON_PATH=$REPO_PATH/codecarbon
LOGS_PATH=$REPO_PATH/logs

DATA_PATH=data/meg-gpt2-oscar-en-10k_text_document
TOKENIZER_PATH=bigscience-tokenizer-padded

# XXX: edit me
GPUS_PER_NODE=8
NNODES=1
PP_SIZE=2 # NLAYERS must be a multiple of PP_SIZE here
TP_SIZE=1 # always fixed to the size of a single node
DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # will get derived automatically by trainer

MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=512
TRAIN_ITER=48_562
INPUT_LEN=1675
TARGET_LEN=373
Comment on lines +23 to +24
Member
I'm thinking we need a way to compute these values from a given SEQ_LEN. Typically, given a noise_density, mean_noise_span_length, and sequence_length, we should be able to compute an input and target length, no? The reason is that what we really care about is that SEQ_LEN is 2048 (for performance); the rest we don't really care about, as they are implementation details.

Contributor Author
Yes, I agree, but I'm not sure where to put this function.
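
For illustration only (not code from this PR): a minimal sketch of the length arithmetic being discussed, following the span-corruption bookkeeping of T5's random_spans_helper (one sentinel per noise span plus an EOS on both the encoder input and the decoder target). The helper name split_sequence_length and the default parameter values are assumptions made for the example.

```python
# Sketch only, not code from this PR: derive (INPUT_LEN, TARGET_LEN) from a total
# sequence budget, mirroring T5's span-corruption length arithmetic.
def split_sequence_length(seq_len, noise_density=0.15, mean_noise_span_length=3.0):
    """Return (input_len, target_len) such that input_len + target_len <= seq_len."""

    def lengths(tokens_length):
        num_noise = int(round(tokens_length * noise_density))
        num_spans = max(1, int(round(num_noise / mean_noise_span_length)))
        num_nonnoise = tokens_length - num_noise
        # encoder input: non-noise tokens + one sentinel per span + EOS
        # decoder target: noise tokens + one sentinel per span + EOS
        return num_nonnoise + num_spans + 1, num_noise + num_spans + 1

    tokens_length = seq_len
    while sum(lengths(tokens_length)) > seq_len:
        tokens_length -= 1
    return lengths(tokens_length)

print(split_sequence_length(2048))  # (1675, 373)
```

With noise_density=0.15 and mean_noise_span_length=3.0, a 2048-token budget splits into 1675 input and 373 target tokens, which matches INPUT_LEN=1675 and TARGET_LEN=373 above.
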

SEQ_LEN=$((INPUT_LEN+TARGET_LEN))

NLAYERS=24
NHIDDEN=4096
NHEADS=64
FFN_HIDDEN_SIZE=10240


SAVE_INTERVAL=1500

OPTIMIZER_ARGS=" \
--optimizer adam \
--adam-beta1 0.9 \
--adam-beta2 0.999 \
--adam-eps 1e-8 \
--lr 2e-4 \
--min-lr 1e-5 \
--lr-decay-style cosine \
--clip-grad 1.0 \
--weight-decay 1e-1 \
"

EXIT_OPTS=" \
--exit-duration-in-mins 1190 \
"

GPT_ARGS=" \
--num-layers $NLAYERS \
--hidden-size $NHIDDEN \
--num-attention-heads $NHEADS \
--ffn-hidden-size $FFN_HIDDEN_SIZE \
--max-position-embeddings $SEQ_LEN \
--position-embedding-type alibi \
--seq-length $SEQ_LEN \
--input-length $INPUT_LEN \
--micro-batch-size $MICRO_BATCH_SIZE \
--global-batch-size $GLOBAL_BATCH_SIZE \
--train-iters $TRAIN_ITER \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path $TOKENIZER_PATH \
--loss-scale 12 \
--clip-grad 1.0 \
--fp16 \
--checkpoint-activations \
$OPTIMIZER_ARGS \
$EXIT_OPTS \
"

OUTPUT_ARGS=" \
--log-interval 1 \
--save-interval $SAVE_INTERVAL \
--eval-interval $TRAIN_ITER \
--eval-iters 1 \
--tensorboard-dir $TENSORBOARD_PATH \
--tensorboard-queue-size 5 \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
"

ZERO_STAGE=1

config_json="./ds_config.json"

# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size()
cat <<EOT > $config_json
{
"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
"train_batch_size": $GLOBAL_BATCH_SIZE,
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
EOT


DEEPSPEED_ARGS=" \
--deepspeed \
--deepspeed_config ${config_json} \
--zero-stage ${ZERO_STAGE} \
--deepspeed-activation-checkpointing \
"

# export LAUNCHER="python -u -m torch.distributed.launch \
# --nproc_per_node $GPUS_PER_NODE \
# "
# # --nnodes $NNODES \
# # --master_addr $MASTER_ADDR \
# # --master_port $MASTER_PORT \

export CMD=" \
`pwd`/train_ND_MLM_gpt.py \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
$GPT_ARGS \
$OUTPUT_ARGS \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
$DEEPSPEED_ARGS \
"


# # clear old checkpoint as it'd mismatch while we sort things out
# rm -rf $SAVE_CHECKPOINT_PATH


echo $CMD

# We create the folder where the logs and codecarbon will be stored.
mkdir -p $REPO_PATH
mkdir -p $LOGS_PATH
# to debug - add echo (it exits and prints what it would have launched)

deepspeed --num_gpus $GPUS_PER_NODE \
$CMD

# srun '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $LOGS_PATH/main_log.txt