
Commit 09af79d

fix(gguf): update tokeneer to support non-ASCII characters in the gpt vocabulary
Signed-off-by: YdrMaster <[email protected]>
1 parent cf78ed5 · commit 09af79d

2 files changed: 28 additions, 22 deletions

gguf/Cargo.toml

Lines changed: 2 additions & 1 deletion
@@ -9,9 +9,10 @@ ggus.workspace = true
 minijinja = { version = "2.7", default-features = false, features = [
     "loader",
     "builtins",
+    "serde",
 ] }
 serde = { version = "1.0", features = ["derive"] }
-tokeneer = "0.0"
+tokeneer = { git = "https://github.com/InfiniTensor/tokeneer", rev = "c0da02c" }
 memmap2 = "0.9"

 [dev-dependencies]

gguf/src/tokenizer.rs

Lines changed: 26 additions & 21 deletions
@@ -5,7 +5,7 @@ use std::{
     collections::HashMap,
     str::{from_utf8, from_utf8_unchecked},
 };
-use tokeneer::{utok, Bpe, Lpe, Method, Tokeneer};
+use tokeneer::{utok, Bpe, Lpe, TokenType, Tokeneer};

 pub struct Tokenizer {
     tokenize: Box<dyn Tokenize>,
@@ -62,31 +62,31 @@ impl Tokenizer {

     fn bpe_from_gguf(gguf: &GGufModel) -> Self {
         let tokens = gguf.tokenizer_ggml_tokens().unwrap();
+
         let scores = gguf.tokenizer_ggml_scores().unwrap();
-        let token_type = gguf.tokenizer_ggml_token_type().unwrap();
         assert_eq!(tokens.len(), scores.len());
+        let scores = scores.map(|score| score.unwrap());
+
+        let token_type = gguf.tokenizer_ggml_token_type().unwrap();
         assert_eq!(tokens.len(), token_type.len());
+        let token_type = token_type.map(|ty| match unsafe { std::mem::transmute(ty.unwrap()) } {
+            GGmlTokenType::Normal => TokenType::Normal,
+            GGmlTokenType::Unknown => TokenType::Unknown,
+            GGmlTokenType::Control => TokenType::Control,
+            GGmlTokenType::User => TokenType::UserDefined,
+            GGmlTokenType::Unused => TokenType::Normal,
+            GGmlTokenType::Byte => TokenType::Byte,
+        });

         let mut detective = SpaceDetective::new();
         let vocabs = tokens.map(|piece| {
             let piece = piece.unwrap();
             detective.record(piece);
             piece
         });
-        let scores = scores.map(|score| score.unwrap());
-        let is_byte = token_type.map(|ty| GGmlTokenType::Byte as i32 == ty.unwrap());

         let unk = gguf.tokenizer_ggml_unknown_token_id().unwrap();
-        let bos = gguf.tokenizer_ggml_bos_token_id().unwrap();
-        let eos = gguf.tokenizer_ggml_eos_token_id().unwrap();
-
-        let bpe = Bpe::new(vocabs, scores, is_byte, unk);
-        let bos_piece = from_utf8(bpe.decode(bos)).unwrap().to_string();
-        let eos_piece = from_utf8(bpe.decode(eos)).unwrap().to_string();
-
-        let mut tokeneer = Tokeneer::new(bpe);
-        tokeneer.extend_special([(bos_piece, vec![bos]), (eos_piece, vec![eos])]);
-
+        let tokeneer = Tokeneer::new(Bpe::new(vocabs, scores, token_type, unk));
         let (en_replace, de_replace) = detective.build_map();
         Self {
             tokenize: Box::new(tokeneer),
@@ -98,6 +98,17 @@ impl Tokenizer {
     fn lpe_from_gguf(gguf: &GGufModel) -> Self {
         let tokens = gguf.tokenizer_ggml_tokens().unwrap();

+        let token_type = gguf.tokenizer_ggml_token_type().unwrap();
+        assert_eq!(tokens.len(), token_type.len());
+        let token_type = token_type.map(|ty| match unsafe { std::mem::transmute(ty.unwrap()) } {
+            GGmlTokenType::Normal => TokenType::Normal,
+            GGmlTokenType::Unknown => TokenType::Unknown,
+            GGmlTokenType::Control => TokenType::Control,
+            GGmlTokenType::User => TokenType::UserDefined,
+            GGmlTokenType::Unused => TokenType::Normal,
+            GGmlTokenType::Byte => TokenType::Byte,
+        });
+
         let mut detective = SpaceDetective::new();
         let vocabs = tokens.map(|piece| {
             let piece = piece.unwrap();
@@ -115,13 +126,7 @@ impl Tokenizer {
             bos
         });

-        let bpe = Lpe::new(vocabs, unk);
-        let bos_piece = from_utf8(bpe.decode(bos)).unwrap().to_string();
-        let eos_piece = from_utf8(bpe.decode(eos)).unwrap().to_string();
-
-        let mut tokeneer = Tokeneer::new(bpe);
-        tokeneer.extend_special([(bos_piece, vec![bos]), (eos_piece, vec![eos])]);
-
+        let tokeneer = Tokeneer::new(Lpe::new(vocabs, token_type, unk));
         let (en_replace, de_replace) = detective.build_map();
         Self {
             tokenize: Box::new(tokeneer),
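
Note: the GGmlTokenType → TokenType match above is duplicated verbatim between bpe_from_gguf and lpe_from_gguf. A possible follow-up (not part of this commit) is to hoist it into a shared helper; the sketch below is only an assumption about how that could look, reusing the GGmlTokenType and TokenType enums the diff already references, with a hypothetical helper name.

// Hypothetical helper, not in this commit: maps a GGUF token-type tag to
// tokeneer's TokenType so the conversion is written once instead of twice.
// `Unused` deliberately falls back to `Normal`, matching both inlined matches.
fn to_token_type(ty: GGmlTokenType) -> TokenType {
    match ty {
        GGmlTokenType::Normal | GGmlTokenType::Unused => TokenType::Normal,
        GGmlTokenType::Unknown => TokenType::Unknown,
        GGmlTokenType::Control => TokenType::Control,
        GGmlTokenType::User => TokenType::UserDefined,
        GGmlTokenType::Byte => TokenType::Byte,
    }
}

// Each call site would then shrink to:
// let token_type = token_type.map(|ty| to_token_type(unsafe { std::mem::transmute(ty.unwrap()) }));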
