@@ -102,6 +102,26 @@ void HuggingFaceTokenizerExample() {
102102 TestTokenizer (std::move (tok), false , true );
103103}
104104
105+ void HuggingFaceBPETokenizerExample () {
106+ std::cout << " Tokenizer: Huggingface BPE" << std::endl;
107+
108+ auto start = std::chrono::high_resolution_clock::now ();
109+
110+ // Read blob from file.
111+ auto vocab_blob = LoadBytesFromFile (" dist/vocab.json" );
112+ auto merges_blob = LoadBytesFromFile (" dist/merges.txt" );
113+ // Note: all the current factory APIs takes in-memory blob as input.
114+ // This gives some flexibility on how these blobs can be read.
115+ auto tok = Tokenizer::FromBlobByteLevelBPE (vocab_blob, merges_blob);
116+
117+ auto end = std::chrono::high_resolution_clock::now ();
118+ auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count ();
119+
120+ std::cout << " Load time: " << duration << " ms" << std::endl;
121+
122+ TestTokenizer (std::move (tok), false , true );
123+ }
124+
105125// RWKV world tokenizer
106126// - dist/tokenizer_model
107127void RWKVWorldTokenizerExample () {
@@ -123,5 +143,6 @@ void RWKVWorldTokenizerExample() {
123143int main (int argc, char * argv[]) {
124144 SentencePieceTokenizerExample ();
125145 HuggingFaceTokenizerExample ();
146+ HuggingFaceBPETokenizerExample ();
126147 RWKVWorldTokenizerExample ();
127148}
0 commit comments