@@ -6,15 +6,20 @@ use std::{
     str::{from_utf8, from_utf8_unchecked},
 };
 use tokeneer::{utok, Bpe, Lpe, Method, Tokeneer};
+use tokenizers::tokenizer::Tokenizer as Hf;
 
 pub struct Tokenizer {
     tokenize: Box<dyn Tokenize>,
     en_replace: HashMap<char, char>,
     de_replace: HashMap<char, char>,
+    hf: Option<Hf>,
 }
 
 impl GGufModel<'_> {
     pub fn tokenizer(&self) -> Tokenizer {
+        if let Ok("deepseek-r1-qwen") = self.get_str("tokenizer.ggml.pre") {
+            return Tokenizer::deepseek(self);
+        }
         match self.tokenizer_ggml_model().unwrap() {
             "llama" => Tokenizer::bpe_from_gguf(self),
             "fm9g8b" | "gpt2" => Tokenizer::lpe_from_gguf(self),
@@ -25,6 +30,11 @@ impl GGufModel<'_> {
 
 impl Tokenizer {
     pub fn encode(&self, text: &str) -> Vec<utok> {
+        if let Some(hf) = &self.hf {
+            let x = hf.encode(text, false).unwrap();
+            return x.get_ids().to_vec();
+        }
+
         let space = self.en_replace[&' '];
         let mut chars = text.chars();
         let mut text = match chars.next() {
@@ -44,6 +54,11 @@ impl Tokenizer {
     }
 
     pub fn decode(&self, token: utok) -> Cow<str> {
+        if let Some(hf) = &self.hf {
+            let x = hf.decode(&[token], false).unwrap();
+            return x.into();
+        }
+
         let piece = self.tokenize.decode(token);
         if let Ok(piece) = from_utf8(piece) {
             let ans = piece
@@ -92,6 +107,7 @@ impl Tokenizer {
             tokenize: Box::new(tokeneer),
             en_replace,
             de_replace,
+            hf: None,
         }
     }
 
@@ -127,8 +143,16 @@ impl Tokenizer {
             tokenize: Box::new(tokeneer),
             en_replace,
             de_replace,
+            hf: None,
         }
     }
+
+    fn deepseek(gguf: &GGufModel) -> Self {
+        let mut ans = Tokenizer::lpe_from_gguf(gguf);
+        ans.hf =
+            Some(Hf::from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", None).unwrap());
+        ans
+    }
 }
 
 /// A trait for tokenization.
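
A minimal usage sketch of the new path (not part of the diff; `GGufModel::load` and the file name are assumptions for illustration). When the GGUF metadata key `tokenizer.ggml.pre` reads "deepseek-r1-qwen", `encode`/`decode` are delegated to the Hugging Face `tokenizers` crate; note that `Hf::from_pretrained` requires that crate's `http` feature and network access on first use, since it fetches the tokenizer config from the Hub. All other models keep the existing tokeneer-based `Bpe`/`Lpe` paths.

    // Assumed loader name; the crate's actual entry point may differ.
    let gguf = GGufModel::load("DeepSeek-R1-Distill-Qwen-1.5B.gguf");
    // For this model, `tokenizer()` takes the early-return deepseek branch.
    let tokenizer = gguf.tokenizer();
    let ids = tokenizer.encode("Hello, world!");
    // `decode` is per-token and returns Cow<str>; collect the pieces back.
    let text: String = ids.iter().map(|&t| tokenizer.decode(t)).collect();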