
Commit be8d859 (parent b3a3190)

Use NullTokenizer for mock data; TFLOPs calculation to be fixed later

Signed-off-by: tailaim <tailaim@nvidia.com>

File tree: 2 files changed, +11 -4 lines

examples/run_hybrid_cp.sh (3 additions, 4 deletions)

@@ -115,12 +115,12 @@ fi
 if [[ $USE_MOCK_DATA -eq 1 ]]; then
     # EXTRA_ARGS+=" --mock-data --sft-mock-dataset-config-json '{\"mode\":\"file\",\"path\":\"path/to/file\"}'"
     if [[ $BATCH -eq 0 ]]; then
-        EXTRA_ARGS+=" --mock-data --sft-mock-dataset-config-json {\"mode\":\"distribution\",\"type\":\"lognormal\",\"min_seq_len\":1024,\"max_seq_len\":16384,\"mean_seq_len\":8192,\"lognormal_sigma\":1.1} "
+        EXTRA_ARGS+=" --mock-data --sft-mock-dataset-config-json {\"mode\":\"distribution\",\"type\":\"lognormal\",\"min_seq_len\":1024,\"max_seq_len\":16384,\"mean_seq_len\":8192,\"lognormal_sigma\":1.1} --tokenizer-type NullTokenizer --vocab-size 131072 "
     else
-        EXTRA_ARGS+=" --mock-data --sft-mock-dataset-config-json '{\"mode\":\"distribution\",\"type\":\"lognormal\",\"min_seq_len\":1024,\"max_seq_len\":16384,\"mean_seq_len\":8192,\"lognormal_sigma\":1.1}' "
+        EXTRA_ARGS+=" --mock-data --sft-mock-dataset-config-json '{\"mode\":\"distribution\",\"type\":\"lognormal\",\"min_seq_len\":1024,\"max_seq_len\":16384,\"mean_seq_len\":8192,\"lognormal_sigma\":1.1}' --tokenizer-type NullTokenizer --vocab-size 131072 "
     fi
 else
-    EXTRA_ARGS+=" --data-path ${DATA_TRAIN} "
+    EXTRA_ARGS+=" --data-path ${DATA_TRAIN} --tokenizer-model ${TOKENIZER} "
 fi
 
 if [[ $USE_FSDP -eq 1 ]]; then
@@ -143,7 +143,6 @@ OPTIONS=" \
     --use-distributed-optimizer \
     --disable-bias-linear \
     --sft-tokenizer-prompt-format nemotron-h-aligned \
-    --tokenizer-model ${TOKENIZER} \
     --transformer-impl transformer_engine \
     --normalization RMSNorm \
     --norm-epsilon 1e-06 \
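
The distribution-mode JSON above controls how the mock SFT dataset draws per-sample sequence lengths. As a rough illustration only, the sketch below shows one plausible reading of those keys; the assumption that mean_seq_len sets the centre of the log-normal (mu = log(mean_seq_len)) and that samples are clipped to the min/max bounds is mine, not something this commit specifies, and Megatron's actual sampler may parameterize it differently.

import json

import numpy as np

# The JSON passed via --sft-mock-dataset-config-json in the script above.
cfg = json.loads(
    '{"mode": "distribution", "type": "lognormal", "min_seq_len": 1024, '
    '"max_seq_len": 16384, "mean_seq_len": 8192, "lognormal_sigma": 1.1}'
)

# Illustration-only assumptions: mu = log(mean_seq_len), samples clipped
# to [min_seq_len, max_seq_len].
rng = np.random.default_rng(seed=0)
raw = rng.lognormal(mean=np.log(cfg["mean_seq_len"]),
                    sigma=cfg["lognormal_sigma"],
                    size=8)
seq_lens = np.clip(raw, cfg["min_seq_len"], cfg["max_seq_len"]).astype(int)
print(seq_lens)  # eight mock sample lengths between 1024 and 16384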

megatron/training/tokenizer/tokenizer.py (8 additions, 0 deletions)

@@ -871,6 +871,14 @@ def eos(self):
     def additional_special_tokens_ids(self):
         return None
 
+    @property
+    def force_eod(self):
+        """To force an EOD at the end of every data sample in SFT."""
+        return True
+
+    @property
+    def pad(self):
+        return self._eod_id - 1
+
 
 class _NullMultimodalTokenizer(MegatronLegacyTokenizer):
     def __init__(self, vocab_size, image_token=None, image_token_id=None):
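
For context, the two properties added above belong to the null tokenizer selected via --tokenizer-type NullTokenizer in the script change. The standalone sketch below is not the real _NullTokenizer; the EOD-id convention and the tokenize() behaviour are assumptions used only to show how force_eod and pad would behave.

# Simplified, self-contained stand-in for a NullTokenizer-like class
# (NOT the real megatron _NullTokenizer).
class NullTokenizerSketch:
    def __init__(self, vocab_size: int):
        # Assumption: the id right after the user vocab is reserved for EOD.
        self._eod_id = vocab_size
        self._vocab_size = vocab_size + 1

    def tokenize(self, text: str):
        # Null tokenization: the input is treated as space-separated integer ids.
        return [int(tok) for tok in text.split()]

    @property
    def vocab_size(self):
        return self._vocab_size

    @property
    def eod(self):
        return self._eod_id

    @property
    def force_eod(self):
        """Ask the SFT data pipeline to append an EOD token to every sample."""
        return True

    @property
    def pad(self):
        # Mirrors the committed change: pad id sits one below the EOD id.
        return self._eod_id - 1


# With --vocab-size 131072 as in run_hybrid_cp.sh:
tok = NullTokenizerSketch(131072)
print(tok.eod, tok.pad, tok.force_eod)  # 131072 131071 True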
