Skip to content

Commit 2c43a0f

Browse files
committed
Fix preprocess_data_many_cores to pass dtype (from best_fitting_dtype) to make_builder instead of vocab_size
1 parent ae6277f commit 2c43a0f

File tree

1 file changed

+3
-2
lines changed

1 file changed

+3
-2
lines changed

tools/preprocess_data_many_cores.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@
49 49

50 50
from megatron.tokenizer import build_tokenizer
51 51
from megatron.data import indexed_dataset
52-
from megatron.data.indexed_dataset import index_file_path, data_file_path
52+
from megatron.data.indexed_dataset import index_file_path, data_file_path, best_fitting_dtype
53 53

54 54

55 55
# https://stackoverflow.com/questions/33139531/preserve-empty-lines-with-nltks-punkt-tokenizer
@@ -328,9 +328,10 @@ def main():
328 328
output_filename = f"{args.output_prefix}_{key}_{level}"
329 329
output_bin_files[key] = data_file_path(output_filename)
330 330
output_idx_files[key] = index_file_path(output_filename)
331+
best_dtype = best_fitting_dtype(args.vocab_size) if args.dataset_impl == "mmap" else None
331 332
builders[key] = indexed_dataset.make_builder(output_bin_files[key],
332 333
impl=args.dataset_impl,
333-
vocab_size=tokenizer.vocab_size)
334+
dtype=best_dtype)
334 335

335 336
for key in args.json_keys:
336 337
for process_id in process_ids:

0 commit comments

Comments
 (0)