Skip to content

Commit 86afaee

Browse files
committed
temp: 缝合 crates.io/crates/tokenizers 暂时绕过词表问题
Signed-off-by: YdrMaster <[email protected]>
1 parent cf78ed5 commit 86afaee

File tree

3 files changed

+28
-0
lines changed

3 files changed

+28
-0
lines changed

gguf/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,9 +9,11 @@ ggus.workspace = true
99
minijinja = { version = "2.7", default-features = false, features = [
1010
"loader",
1111
"builtins",
12+
"serde",
1213
] }
1314
serde = { version = "1.0", features = ["derive"] }
1415
tokeneer = "0.0"
16+
tokenizers = { version = "0.21", features = ["http"] }
1517
memmap2 = "0.9"
1618

1719
[dev-dependencies]

gguf/src/tokenizer.rs

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6,15 +6,20 @@ use std::{
66
str::{from_utf8, from_utf8_unchecked},
77
};
88
use tokeneer::{utok, Bpe, Lpe, Method, Tokeneer};
9+
use tokenizers::tokenizer::Tokenizer as Hf;
910

1011
pub struct Tokenizer {
1112
tokenize: Box<dyn Tokenize>,
1213
en_replace: HashMap<char, char>,
1314
de_replace: HashMap<char, char>,
15+
hf: Option<Hf>,
1416
}
1517

1618
impl GGufModel<'_> {
1719
pub fn tokenizer(&self) -> Tokenizer {
20+
if let Ok("deepseek-r1-qwen") = self.get_str("tokenizer.ggml.pre") {
21+
return Tokenizer::deepseek(self);
22+
}
1823
match self.tokenizer_ggml_model().unwrap() {
1924
"llama" => Tokenizer::bpe_from_gguf(self),
2025
"fm9g8b" | "gpt2" => Tokenizer::lpe_from_gguf(self),
@@ -25,6 +30,11 @@ impl GGufModel<'_> {
2530

2631
impl Tokenizer {
2732
pub fn encode(&self, text: &str) -> Vec<utok> {
33+
if let Some(hf) = &self.hf {
34+
let x = hf.encode(text, false).unwrap();
35+
return x.get_ids().to_vec();
36+
}
37+
2838
let space = self.en_replace[&' '];
2939
let mut chars = text.chars();
3040
let mut text = match chars.next() {
@@ -44,6 +54,11 @@ impl Tokenizer {
4454
}
4555

4656
pub fn decode(&self, token: utok) -> Cow<str> {
57+
if let Some(hf) = &self.hf {
58+
let x = hf.decode(&[token], false).unwrap();
59+
return x.into();
60+
}
61+
4762
let piece = self.tokenize.decode(token);
4863
if let Ok(piece) = from_utf8(piece) {
4964
let ans = piece
@@ -92,6 +107,7 @@ impl Tokenizer {
92107
tokenize: Box::new(tokeneer),
93108
en_replace,
94109
de_replace,
110+
hf: None,
95111
}
96112
}
97113

@@ -127,8 +143,16 @@ impl Tokenizer {
127143
tokenize: Box::new(tokeneer),
128144
en_replace,
129145
de_replace,
146+
hf: None,
130147
}
131148
}
149+
150+
fn deepseek(gguf: &GGufModel) -> Self {
151+
let mut ans = Tokenizer::lpe_from_gguf(gguf);
152+
ans.hf =
153+
Some(Hf::from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", None).unwrap());
154+
ans
155+
}
132156
}
133157

134158
/// A trait for tokenization.

models/llama/cuda/src/nccl_parallel.rs

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -172,6 +172,8 @@ fn test_infer() {
172172
);
173173

174174
next.send(pair.idx() as _).unwrap()
175+
} else {
176+
stream.synchronize();
175177
}
176178
}
177179
});

0 commit comments

Comments
 (0)