@@ -26,7 +26,7 @@ impl GGufModel<'_> {
 
 impl Tokenizer {
     pub fn encode(&self, text: &str) -> Vec<utok> {
-        let space = self.en_replace[&' '];
+        let space = self.en_replace.get(&' ').unwrap_or(&' ');
         let mut chars = text.chars();
         let mut text = match chars.next() {
             Some(c) => {
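Note on this hunk: with the replacement maps now constructed empty (see the last hunk below), the old `self.en_replace[&' ']` indexing would panic at runtime on the missing key. `HashMap::get` returns an `Option` instead, and `unwrap_or(&' ')` falls back to a literal space. A minimal standalone sketch of the two lookup styles, assuming `en_replace` is a `HashMap<char, char>`:

    use std::collections::HashMap;

    fn main() {
        // Hypothetical stand-in for the tokenizer's en_replace field.
        let en_replace: HashMap<char, char> = HashMap::new();
        // Indexing (`en_replace[&' ']`) goes through Index::index, which
        // panics when the key is absent. `get` returns Option<&char>,
        // so an empty map degrades gracefully to the literal space.
        let space = en_replace.get(&' ').unwrap_or(&' ');
        assert_eq!(*space, ' ');
    }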
@@ -110,12 +110,24 @@ impl Tokenizer {
             GGmlTokenType::Byte => TokenType::Byte,
         });
 
-        let mut detective = SpaceDetective::new();
-        let vocabs = tokens.map(|piece| {
-            let piece = piece.unwrap();
-            detective.record(piece);
-            piece.as_bytes()
-        });
+        let buffer = tokens
+            .map(|piece| {
+                let piece = piece.unwrap();
+                if map_utf8 {
+                    piece
+                        .chars()
+                        .map(|c| match c {
+                            'Ġ' => ' ',
+                            'Ċ' => '\n',
+                            _ => c,
+                        })
+                        .collect::<String>()
+                } else {
+                    piece.to_string()
+                }
+            })
+            .collect::<Vec<_>>();
+        let vocabs = buffer.iter().map(|s| s.as_bytes());
 
         let bos = gguf.tokenizer_ggml_bos_token_id().unwrap();
         let eos = gguf.tokenizer_ggml_eos_token_id().unwrap();
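The 'Ġ' and 'Ċ' characters are how GPT-2 style byte-level BPE vocabularies spell the space (0x20) and newline (0x0A) bytes, shifted into printable code points (U+0120 and U+010A). Rather than recording and replacing them per call via `SpaceDetective`, the vocab is rewritten once at load time; the remapped pieces are collected into an owned `Vec<String>` so that `as_bytes()` has stable storage to borrow from when `vocabs` is handed to `Lpe::new`. A self-contained sketch of the remapping closure (the free function `remap_piece` and its inputs are illustrative, not part of the diff):

    // Mirrors the remapping closure from the hunk above, lifted out of
    // its surrounding context for illustration.
    fn remap_piece(piece: &str, map_utf8: bool) -> String {
        if map_utf8 {
            piece
                .chars()
                .map(|c| match c {
                    'Ġ' => ' ',  // U+0120: byte-level BPE spelling of 0x20
                    'Ċ' => '\n', // U+010A: byte-level BPE spelling of 0x0A
                    _ => c,
                })
                .collect()
        } else {
            piece.to_string()
        }
    }

    fn main() {
        assert_eq!(remap_piece("Ġworld", true), " world");
        assert_eq!(remap_piece("Ċ", true), "\n");
        assert_eq!(remap_piece("Ġworld", false), "Ġworld"); // mapping disabled
    }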
@@ -128,11 +140,10 @@ impl Tokenizer {
         });
 
         let tokeneer = Tokeneer::new(Lpe::new(vocabs, token_type, unk, map_utf8));
-        let (en_replace, de_replace) = detective.build_map();
         Self {
             tokenize: Box::new(tokeneer),
-            en_replace,
-            de_replace,
+            en_replace: HashMap::new(),
+            de_replace: HashMap::new(),
         }
     }
 }
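Net effect: the runtime space-replacement machinery (`SpaceDetective` and the populated `en_replace`/`de_replace` maps) is retired in favor of the one-time vocab rewrite, so the constructor now fills both maps with `HashMap::new()`. A hypothetical reduction of the affected fields, assuming both maps are `HashMap<char, char>` (the constructor name below is made up for the sketch):

    use std::collections::HashMap;

    // Hypothetical reduction of the Tokenizer struct to the fields touched
    // here; the real struct also owns the boxed Tokeneer<Lpe> engine.
    struct Tokenizer {
        en_replace: HashMap<char, char>,
        de_replace: HashMap<char, char>,
    }

    impl Tokenizer {
        fn from_gguf_sketch() -> Self {
            // With Ġ/Ċ rewritten at vocab-load time, both maps start empty;
            // encode's `get(..).unwrap_or(&' ')` tolerates that.
            Self {
                en_replace: HashMap::new(),
                de_replace: HashMap::new(),
            }
        }
    }

    fn main() {
        let t = Tokenizer::from_gguf_sketch();
        assert!(t.en_replace.is_empty() && t.de_replace.is_empty());
    }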