@@ -17,7 +17,8 @@ impl GGufModel<'_> {
1717 pub fn tokenizer ( & self ) -> Tokenizer {
1818 match self . tokenizer_ggml_model ( ) . unwrap ( ) {
1919 "llama" => Tokenizer :: bpe_from_gguf ( self ) ,
20- "fm9g8b" | "gpt2" => Tokenizer :: lpe_from_gguf ( self ) ,
20+ "gpt2" => Tokenizer :: lpe_from_gguf ( self , true ) ,
21+ "fm9g8b" => Tokenizer :: lpe_from_gguf ( self , false ) ,
2122 model => panic ! ( "Unsupported tokenizer model: {model}" ) ,
2223 }
2324 }
@@ -95,7 +96,7 @@ impl Tokenizer {
9596 }
9697 }
9798
98- fn lpe_from_gguf ( gguf : & GGufModel ) -> Self {
99+ fn lpe_from_gguf ( gguf : & GGufModel , map_utf8 : bool ) -> Self {
99100 let tokens = gguf. tokenizer_ggml_tokens ( ) . unwrap ( ) ;
100101
101102 let token_type = gguf. tokenizer_ggml_token_type ( ) . unwrap ( ) ;
@@ -126,7 +127,7 @@ impl Tokenizer {
126127 bos
127128 } ) ;
128129
129- let tokeneer = Tokeneer :: new ( Lpe :: new ( vocabs, token_type, unk) ) ;
130+ let tokeneer = Tokeneer :: new ( Lpe :: new ( vocabs, token_type, unk, map_utf8 ) ) ;
130131 let ( en_replace, de_replace) = detective. build_map ( ) ;
131132 Self {
132133 tokenize : Box :: new ( tokeneer) ,
0 commit comments