Fix preprocess_data_many_cores to pass dtype instead of vocab_size to make_builder

thomasw21 · thomasw21 · commit 2c43a0fed29c · 2022-02-25T15:29:11.000+01:00
diff --git a/tools/preprocess_data_many_cores.py b/tools/preprocess_data_many_cores.py
@@ -49,7 +49,7 @@
 
 from megatron.tokenizer import build_tokenizer
 from megatron.data import indexed_dataset
-from megatron.data.indexed_dataset import index_file_path, data_file_path
+from megatron.data.indexed_dataset import index_file_path, data_file_path, best_fitting_dtype
 
 
 # https://stackoverflow.com/questions/33139531/preserve-empty-lines-with-nltks-punkt-tokenizer
@@ -328,9 +328,10 @@ def main():
         output_filename = f"{args.output_prefix}_{key}_{level}"
         output_bin_files[key] = data_file_path(output_filename)
         output_idx_files[key] = index_file_path(output_filename)
+        best_dtype = best_fitting_dtype(args.vocab_size) if args.dataset_impl == "mmap" else None
         builders[key] = indexed_dataset.make_builder(output_bin_files[key],
                                                      impl=args.dataset_impl,
-                                                     vocab_size=tokenizer.vocab_size)
+                                                     dtype=best_dtype)
 
     for key in args.json_keys:
         for process_id in process_ids: