
Commit be8d859 (parent b3a3190)

Use NullTokenizer for mock data; TFLOPs calculation to be fixed later

Signed-off-by: tailaim <tailaim@nvidia.com>

File tree: 2 files changed, +11 -4 lines

examples/run_hybrid_cp.sh (3 additions, 4 deletions)

@@ -115,12 +115,12 @@ fi
 if [[ $USE_MOCK_DATA -eq 1 ]]; then
     # EXTRA_ARGS+=" --mock-data --sft-mock-dataset-config-json '{\"mode\":\"file\",\"path\":\"path/to/file\"}'"
     if [[ $BATCH -eq 0 ]]; then
-        EXTRA_ARGS+=" --mock-data --sft-mock-dataset-config-json {\"mode\":\"distribution\",\"type\":\"lognormal\",\"min_seq_len\":1024,\"max_seq_len\":16384,\"mean_seq_len\":8192,\"lognormal_sigma\":1.1} "
+        EXTRA_ARGS+=" --mock-data --sft-mock-dataset-config-json {\"mode\":\"distribution\",\"type\":\"lognormal\",\"min_seq_len\":1024,\"max_seq_len\":16384,\"mean_seq_len\":8192,\"lognormal_sigma\":1.1} --tokenizer-type NullTokenizer --vocab-size 131072 "
     else
-        EXTRA_ARGS+=" --mock-data --sft-mock-dataset-config-json '{\"mode\":\"distribution\",\"type\":\"lognormal\",\"min_seq_len\":1024,\"max_seq_len\":16384,\"mean_seq_len\":8192,\"lognormal_sigma\":1.1}' "
+        EXTRA_ARGS+=" --mock-data --sft-mock-dataset-config-json '{\"mode\":\"distribution\",\"type\":\"lognormal\",\"min_seq_len\":1024,\"max_seq_len\":16384,\"mean_seq_len\":8192,\"lognormal_sigma\":1.1}' --tokenizer-type NullTokenizer --vocab-size 131072 "
     fi
 else
-    EXTRA_ARGS+=" --data-path ${DATA_TRAIN} "
+    EXTRA_ARGS+=" --data-path ${DATA_TRAIN} --tokenizer-model ${TOKENIZER} "
 fi
 
 if [[ $USE_FSDP -eq 1 ]]; then
@@ -143,7 +143,6 @@ OPTIONS=" \
     --use-distributed-optimizer \
     --disable-bias-linear \
     --sft-tokenizer-prompt-format nemotron-h-aligned \
-    --tokenizer-model ${TOKENIZER} \
     --transformer-impl transformer_engine \
     --normalization RMSNorm \
     --norm-epsilon 1e-06 \
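
The distribution-mode JSON above controls how the mock SFT dataset draws per-sample sequence lengths. As a rough illustration only, the sketch below shows one plausible reading of those keys; the assumption that mean_seq_len sets the centre of the log-normal (mu = log(mean_seq_len)) and that samples are clipped to the min/max bounds is mine, not something this commit specifies, and Megatron's actual sampler may parameterize it differently.

import json

import numpy as np

# The JSON passed via --sft-mock-dataset-config-json in the script above.
cfg = json.loads(
    '{"mode": "distribution", "type": "lognormal", "min_seq_len": 1024, '
    '"max_seq_len": 16384, "mean_seq_len": 8192, "lognormal_sigma": 1.1}'
)

# Illustration-only assumptions: mu = log(mean_seq_len), samples clipped
# to [min_seq_len, max_seq_len].
rng = np.random.default_rng(seed=0)
raw = rng.lognormal(mean=np.log(cfg["mean_seq_len"]),
                    sigma=cfg["lognormal_sigma"],
                    size=8)
seq_lens = np.clip(raw, cfg["min_seq_len"], cfg["max_seq_len"]).astype(int)
print(seq_lens)  # eight mock sample lengths between 1024 and 16384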

megatron/training/tokenizer/tokenizer.py (8 additions, 0 deletions)

@@ -871,6 +871,14 @@ def eos(self):
     def additional_special_tokens_ids(self):
         return None
 
+    @property
+    def force_eod(self):
+        """To force an EOD at the end of every data sample in SFT."""
+        return True
+
+    @property
+    def pad(self):
+        return self._eod_id - 1
+
 
 class _NullMultimodalTokenizer(MegatronLegacyTokenizer):
     def __init__(self, vocab_size, image_token=None, image_token_id=None):
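
For context, the two properties added above belong to the null tokenizer selected via --tokenizer-type NullTokenizer in the script change. The standalone sketch below is not the real _NullTokenizer; the EOD-id convention and the tokenize() behaviour are assumptions used only to show how force_eod and pad would behave.

# Simplified, self-contained stand-in for a NullTokenizer-like class
# (NOT the real megatron _NullTokenizer).
class NullTokenizerSketch:
    def __init__(self, vocab_size: int):
        # Assumption: the id right after the user vocab is reserved for EOD.
        self._eod_id = vocab_size
        self._vocab_size = vocab_size + 1

    def tokenize(self, text: str):
        # Null tokenization: the input is treated as space-separated integer ids.
        return [int(tok) for tok in text.split()]

    @property
    def vocab_size(self):
        return self._vocab_size

    @property
    def eod(self):
        return self._eod_id

    @property
    def force_eod(self):
        """Ask the SFT data pipeline to append an EOD token to every sample."""
        return True

    @property
    def pad(self):
        # Mirrors the committed change: pad id sits one below the EOD id.
        return self._eod_id - 1


# With --vocab-size 131072 as in run_hybrid_cp.sh:
tok = NullTokenizerSketch(131072)
print(tok.eod, tok.pad, tok.force_eod)  # 131072 131071 True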
