
Commit 6f0b911

fix(gguf): try to keep compatibility with the llama3.2 gpt2 vocabulary
Signed-off-by: YdrMaster <[email protected]>
1 parent e40376b commit 6f0b911

File tree

1 file changed: +21 -10 lines changed

gguf/src/tokenizer.rs

Lines changed: 21 additions & 10 deletions
@@ -26,7 +26,7 @@ impl GGufModel<'_> {
 
 impl Tokenizer {
     pub fn encode(&self, text: &str) -> Vec<utok> {
-        let space = self.en_replace[&' '];
+        let space = self.en_replace.get(&' ').unwrap_or(&' ');
         let mut chars = text.chars();
         let mut text = match chars.next() {
             Some(c) => {
@@ -110,12 +110,24 @@ impl Tokenizer {
             GGmlTokenType::Byte => TokenType::Byte,
         });
 
-        let mut detective = SpaceDetective::new();
-        let vocabs = tokens.map(|piece| {
-            let piece = piece.unwrap();
-            detective.record(piece);
-            piece.as_bytes()
-        });
+        let buffer = tokens
+            .map(|piece| {
+                let piece = piece.unwrap();
+                if map_utf8 {
+                    piece
+                        .chars()
+                        .map(|c| match c {
+                            'Ġ' => ' ',
+                            'Ċ' => '\n',
+                            _ => c,
+                        })
+                        .collect::<String>()
+                } else {
+                    piece.to_string()
+                }
+            })
+            .collect::<Vec<_>>();
+        let vocabs = buffer.iter().map(|s| s.as_bytes());
 
         let bos = gguf.tokenizer_ggml_bos_token_id().unwrap();
         let eos = gguf.tokenizer_ggml_eos_token_id().unwrap();
@@ -128,11 +140,10 @@ impl Tokenizer {
         });
 
         let tokeneer = Tokeneer::new(Lpe::new(vocabs, token_type, unk, map_utf8));
-        let (en_replace, de_replace) = detective.build_map();
         Self {
            tokenize: Box::new(tokeneer),
-            en_replace,
-            de_replace,
+            en_replace: HashMap::new(),
+            de_replace: HashMap::new(),
        }
    }
}
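For context (not part of the commit): GPT-2-style byte-level vocabularies, such as the one llama3.2 ships, store a leading space as 'Ġ' and a newline as 'Ċ' inside each vocab piece. The sketch below mirrors the mapping the new map_utf8 branch applies; the helper name normalize_gpt2_piece is hypothetical and only used for illustration.

// Standalone sketch (assumption, not part of this commit) of the character
// remapping applied to each vocab piece when map_utf8 is set.
fn normalize_gpt2_piece(piece: &str) -> String {
    piece
        .chars()
        .map(|c| match c {
            'Ġ' => ' ',  // GPT-2 byte-level marker for a leading space
            'Ċ' => '\n', // GPT-2 byte-level marker for a newline
            _ => c,
        })
        .collect()
}

fn main() {
    assert_eq!(normalize_gpt2_piece("Ġworld"), " world");
    assert_eq!(normalize_gpt2_piece("ĊĊ"), "\n\n");
}

With a gpt2 vocabulary there is no space-marker pair for the old SpaceDetective pass to build, so the commit drops that pass, leaves en_replace and de_replace empty, and makes encode fall back to a plain ' ' when the map has no entry for space.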
