Commit 2db56f5

Example updated.
1 parent f6c8526 commit 2db56f5


examples/slm_advanced_train_ex.cpp

Lines changed: 3 additions & 3 deletions
@@ -731,7 +731,7 @@ int main(int argc, char** argv)
     const std::string tokenizer_path = get_option(parser, "tokenizer", "enwiki_tokenizer.vocab");
     // Default number of prompt tokens = input sequence length
     const bool force_tokenize = parser.option("force-tokenize");
-    const long num_tokens = 50000;
+    const long num_tokens = 5000;
 
     // Calculate max bytes to process
     size_t max_bytes = 0, max_tokens = 0;
@@ -803,7 +803,7 @@ int main(int argc, char** argv)
     // 2) Train a new tokenizer if needed
     if (!file_exists(tokenizer_path)) {
         cout << "Training new BPE tokenizer with vocabulary size " << num_tokens << "...\n";
-        tokenizer.train(enwiki_text, num_tokens, true);
+        tokenizer.train(enwiki_text, num_tokens, 1e6, true);
         serialize(tokenizer_path) << tokenizer;
         cout << "Tokenizer saved to " << tokenizer_path << endl;
     }
@@ -874,7 +874,7 @@ int main(int argc, char** argv)
     // 2) Train a new tokenizer if needed
     if (!file_exists(tokenizer_path)) {
         cout << "Training new BPE tokenizer with vocabulary size " << num_tokens << "...\n";
-        tokenizer.train(enwiki_text, num_tokens, true);
+        tokenizer.train(enwiki_text, num_tokens, 1e6, true);
         serialize(tokenizer_path) << tokenizer;
         cout << "Tokenizer saved to " << tokenizer_path << endl;
     }
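For context, a minimal sketch of the post-change training call in isolation, assuming the bpe_tokenizer class this example uses. The include path, the in-memory corpus, and the meaning of the new third argument are assumptions (the commit does not document them; 1e6 plausibly caps how much training text is consumed):

    #include <iostream>
    #include <string>
    #include <dlib/serialize.h>
    #include <dlib/tokenizer.h>   // assumed header for bpe_tokenizer; exact path may differ

    int main()
    {
        using namespace dlib;

        // Hypothetical in-memory corpus; the real example loads enwiki text from disk.
        const std::string corpus = "the quick brown fox jumps over the lazy dog";

        const long num_tokens = 5000;  // default vocabulary size after this commit

        bpe_tokenizer tokenizer;
        // Post-change call: the third argument (1e6) is new in this commit.
        // Assumption: it limits the amount of training data considered; unverified.
        tokenizer.train(corpus, num_tokens, 1e6, true);

        // Persist the trained tokenizer, mirroring the example's serialize call.
        serialize("enwiki_tokenizer.vocab") << tokenizer;
        std::cout << "Tokenizer saved.\n";
        return 0;
    }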
