Skip to content

Commit 8729b3d

Browse files
committed
fix(gguf): 修复以 char 为单位替换字符与 byte level 不兼容的问题
Signed-off-by: YdrMaster <ydrml@hotmail.com>
1 parent ccf165a commit 8729b3d

File tree

1 file changed

+14
-10
lines changed

1 file changed

+14
-10
lines changed

gguf/src/tokenizer.rs

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -45,14 +45,18 @@ impl Tokenizer {
4545

4646
pub fn decode(&self, token: utok) -> Cow<str> {
4747
let piece = self.tokenize.decode(token);
48-
let ans = piece
49-
.chars()
50-
.map(|c| *self.de_replace.get(&c).unwrap_or(&c))
51-
.collect::<String>();
52-
if ans == piece {
53-
piece.into()
48+
if let Ok(piece) = from_utf8(piece) {
49+
let ans = piece
50+
.chars()
51+
.map(|c| *self.de_replace.get(&c).unwrap_or(&c))
52+
.collect::<String>();
53+
if ans == piece {
54+
piece.into()
55+
} else {
56+
ans.into()
57+
}
5458
} else {
55-
ans.into()
59+
unsafe { from_utf8_unchecked(piece) }.into()
5660
}
5761
}
5862

@@ -133,7 +137,7 @@ trait Tokenize {
133137
/// Encode a text into a sequence of tokens.
134138
fn encode(&self, text: &str) -> Vec<utok>;
135139
/// Decode a token into its raw bytes, which may not be valid UTF-8 on their own.
136-
fn decode(&self, token: utok) -> &str;
140+
fn decode(&self, token: utok) -> &[u8];
137141
}
138142

139143
impl<M: tokeneer::Method> Tokenize for Tokeneer<M> {
@@ -142,8 +146,8 @@ impl<M: tokeneer::Method> Tokenize for Tokeneer<M> {
142146
self.encode(text)
143147
}
144148
#[inline]
145-
fn decode(&self, token: utok) -> &str {
146-
unsafe { from_utf8_unchecked(self.internal().decode(token)) }
149+
fn decode(&self, token: utok) -> &[u8] {
150+
self.internal().decode(token)
147151
}
148152
}
149153

0 commit comments

Comments
 (0)