File tree: 1 file changed, +3 -2 lines changed.
Columns: original file line number | diff line number | diff line change
4949
5050from megatron .tokenizer import build_tokenizer
5151from megatron .data import indexed_dataset
52- from megatron .data .indexed_dataset import index_file_path , data_file_path
52+ from megatron .data .indexed_dataset import index_file_path , data_file_path , best_fitting_dtype
5353
5454
5555# https://stackoverflow.com/questions/33139531/preserve-empty-lines-with-nltks-punkt-tokenizer
@@ -328,9 +328,10 @@ def main():
328328 output_filename = f"{ args .output_prefix } _{ key } _{ level } "
329329 output_bin_files [key ] = data_file_path (output_filename )
330330 output_idx_files [key ] = index_file_path (output_filename )
331+ best_dtype = best_fitting_dtype (args .vocab_size ) if args .dataset_impl == "mmap" else None
331332 builders [key ] = indexed_dataset .make_builder (output_bin_files [key ],
332333 impl = args .dataset_impl ,
333- vocab_size = tokenizer . vocab_size )
334+ dtype = best_dtype )
334335
335336 for key in args .json_keys :
336337 for process_id in process_ids :
You can’t perform that action at this time.
0 commit comments