@@ -731,7 +731,7 @@ int main(int argc, char** argv)
     const std::string tokenizer_path = get_option(parser, "tokenizer", "enwiki_tokenizer.vocab");
     // Default number of prompt tokens = input sequence length
     const bool force_tokenize = parser.option("force-tokenize");
-    const long num_tokens = 50000;
+    const long num_tokens = 5000;
 
     // Calculate max bytes to process
     size_t max_bytes = 0, max_tokens = 0;
@@ -803,7 +803,7 @@ int main(int argc, char** argv)
     // 2) Train a new tokenizer if needed
     if (!file_exists(tokenizer_path)) {
         cout << "Training new BPE tokenizer with vocabulary size " << num_tokens << "...\n";
-        tokenizer.train(enwiki_text, num_tokens, true);
+        tokenizer.train(enwiki_text, num_tokens, 1e6, true);
         serialize(tokenizer_path) << tokenizer;
         cout << "Tokenizer saved to " << tokenizer_path << endl;
     }
@@ -874,7 +874,7 @@ int main(int argc, char** argv)
     // 2) Train a new tokenizer if needed
     if (!file_exists(tokenizer_path)) {
         cout << "Training new BPE tokenizer with vocabulary size " << num_tokens << "...\n";
-        tokenizer.train(enwiki_text, num_tokens, true);
+        tokenizer.train(enwiki_text, num_tokens, 1e6, true);
         serialize(tokenizer_path) << tokenizer;
         cout << "Tokenizer saved to " << tokenizer_path << endl;
     }