Skip to content

Commit 91ddf73

Browse files
committed
fix(gguf): 兼容 fm9g8b 和 gpt2 词表
Signed-off-by: YdrMaster <[email protected]>
1 parent 09af79d commit 91ddf73

File tree

2 files changed: 5 additions and 4 deletions.

2 files changed

+5
-4
lines changed

gguf/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@ minijinja = { version = "2.7", default-features = false, features = [
1212
"serde",
1313
] }
1414
serde = { version = "1.0", features = ["derive"] }
15-
tokeneer = { git = "https://github.com/InfiniTensor/tokeneer", rev = "c0da02c" }
15+
tokeneer = { git = "https://github.com/InfiniTensor/tokeneer", rev = "5fc0af8" }
1616
memmap2 = "0.9"
1717

1818
[dev-dependencies]

gguf/src/tokenizer.rs

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,8 @@ impl GGufModel<'_> {
1717
pub fn tokenizer(&self) -> Tokenizer {
1818
match self.tokenizer_ggml_model().unwrap() {
1919
"llama" => Tokenizer::bpe_from_gguf(self),
20-
"fm9g8b" | "gpt2" => Tokenizer::lpe_from_gguf(self),
20+
"gpt2" => Tokenizer::lpe_from_gguf(self, true),
21+
"fm9g8b" => Tokenizer::lpe_from_gguf(self, false),
2122
model => panic!("Unsupported tokenizer model: {model}"),
2223
}
2324
}
@@ -95,7 +96,7 @@ impl Tokenizer {
9596
}
9697
}
9798

98-
fn lpe_from_gguf(gguf: &GGufModel) -> Self {
99+
fn lpe_from_gguf(gguf: &GGufModel, map_utf8: bool) -> Self {
99100
let tokens = gguf.tokenizer_ggml_tokens().unwrap();
100101

101102
let token_type = gguf.tokenizer_ggml_token_type().unwrap();
@@ -126,7 +127,7 @@ impl Tokenizer {
126127
bos
127128
});
128129

129-
let tokeneer = Tokeneer::new(Lpe::new(vocabs, token_type, unk));
130+
let tokeneer = Tokeneer::new(Lpe::new(vocabs, token_type, unk, map_utf8));
130131
let (en_replace, de_replace) = detective.build_map();
131132
Self {
132133
tokenize: Box::new(tokeneer),

0 commit comments

Comments
 (0)