Skip to content

Commit b289eb8

Browse files
committed
Fix preprocess_data_many_cores to use dtype
1 parent 2c43a0f commit b289eb8

File tree

1 file changed

+2
-1
lines changed

1 file changed

+2
-1
lines changed

tools/preprocess_data_many_cores.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -117,9 +117,10 @@ def process_samples(simple_queue, process_id, args, level, writer: Connection):
117 117
output_filename = get_output_filename(args.output_prefix, key, level, process_id)
118 118
output_bin_files[key] = data_file_path(output_filename)
119 119
output_idx_files[key] = index_file_path(output_filename)
120 +
best_dtype = best_fitting_dtype(args.vocab_size) if args.dataset_impl == "mmap" else None
120 121
builders[key] = indexed_dataset.make_builder(output_bin_files[key],
121 122
impl=args.dataset_impl,
122 -
vocab_size=encoder.tokenizer.vocab_size)
123 +
dtype=best_dtype)
123 124

124 125
json_lines = simple_queue.get()
125 126
while json_lines is not None:

0 commit comments

Comments
 (0)