Skip to content

Commit 5f68f07

Browse files
authored
Avoid loading added_tokens when this variable is empty (#77)
* [BugFix] do not add `added_tokens` when it's empty
1 parent aae1209 commit 5f68f07

File tree

3 files changed

+38
-9
lines changed

3 files changed

+38
-9
lines changed

example/build_and_run.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,12 @@ if [ ! -f "tokenizer_model" ]; then
2020
wget https://github.com/BBuf/run-rwkv-world-4-in-mlc-llm/releases/download/v1.0.0/tokenizer_model.zip
2121
unzip tokenizer_model.zip
2222
fi
23+
if [ ! -f "vocab.json" ]; then
24+
wget https://huggingface.co/Qwen/Qwen2.5-3B-Instruct/resolve/main/vocab.json
25+
fi
26+
if [ ! -f "merges.txt" ]; then
27+
wget https://huggingface.co/Qwen/Qwen2.5-3B-Instruct/resolve/main/merges.txt
28+
fi
2329
cd ..
2430

2531
# run

example/example.cc

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,26 @@ void HuggingFaceTokenizerExample() {
102102
TestTokenizer(std::move(tok), false, true);
103103
}
104104

105+
void HuggingFaceBPETokenizerExample() {
106+
std::cout << "Tokenizer: Huggingface BPE" << std::endl;
107+
108+
auto start = std::chrono::high_resolution_clock::now();
109+
110+
// Read blob from file.
111+
auto vocab_blob = LoadBytesFromFile("dist/vocab.json");
112+
auto merges_blob = LoadBytesFromFile("dist/merges.txt");
113+
// Note: all the current factory APIs takes in-memory blob as input.
114+
// This gives some flexibility on how these blobs can be read.
115+
auto tok = Tokenizer::FromBlobByteLevelBPE(vocab_blob, merges_blob);
116+
117+
auto end = std::chrono::high_resolution_clock::now();
118+
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
119+
120+
std::cout << "Load time: " << duration << " ms" << std::endl;
121+
122+
TestTokenizer(std::move(tok), false, true);
123+
}
124+
105125
// RWKV world tokenizer
106126
// - dist/tokenizer_model
107127
void RWKVWorldTokenizerExample() {
@@ -123,5 +143,6 @@ void RWKVWorldTokenizerExample() {
123143
int main(int argc, char* argv[]) {
124144
SentencePieceTokenizerExample();
125145
HuggingFaceTokenizerExample();
146+
HuggingFaceBPETokenizerExample();
126147
RWKVWorldTokenizerExample();
127148
}

rust/src/lib.rs

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ impl TokenizerWrapper {
3535
added_tokens: &str,
3636
) -> TokenizerWrapper {
3737
let vocab_json: Value = serde_json::from_str(vocab).unwrap();
38-
let added_tokens_json: Value = serde_json::from_str(added_tokens).unwrap();
3938
let mut vocab = HashMap::new();
4039
match vocab_json {
4140
Value::Object(m) => {
@@ -48,16 +47,19 @@ impl TokenizerWrapper {
4847
}
4948
_ => panic!("Invalid vocab.json file."),
5049
};
51-
match added_tokens_json {
52-
Value::Object(m) => {
53-
for (token, id) in m {
54-
if let Value::Number(id) = id {
55-
let id = id.as_u64().unwrap() as u32;
56-
vocab.insert(token, id);
50+
if !added_tokens.is_empty() {
51+
let added_tokens_json: Value = serde_json::from_str(added_tokens).unwrap();
52+
match added_tokens_json {
53+
Value::Object(m) => {
54+
for (token, id) in m {
55+
if let Value::Number(id) = id {
56+
let id = id.as_u64().unwrap() as u32;
57+
vocab.insert(token, id);
58+
}
5759
}
5860
}
59-
}
60-
_ => panic!("Invalid added_tokens.json file."),
61+
_ => panic!("Invalid added_tokens.json file."),
62+
};
6163
}
6264

6365
let merges = merges

0 commit comments

Comments
 (0)