@@ -5,7 +5,7 @@ use std::{
     collections::HashMap,
     str::{from_utf8, from_utf8_unchecked},
 };
-use tokeneer::{utok, Bpe, Lpe, Method, Tokeneer};
+use tokeneer::{utok, Bpe, Lpe, TokenType, Tokeneer};
 
 pub struct Tokenizer {
     tokenize: Box<dyn Tokenize>,
@@ -62,31 +62,31 @@ impl Tokenizer {
 
     fn bpe_from_gguf(gguf: &GGufModel) -> Self {
         let tokens = gguf.tokenizer_ggml_tokens().unwrap();
+
         let scores = gguf.tokenizer_ggml_scores().unwrap();
-        let token_type = gguf.tokenizer_ggml_token_type().unwrap();
         assert_eq!(tokens.len(), scores.len());
+        let scores = scores.map(|score| score.unwrap());
+
+        let token_type = gguf.tokenizer_ggml_token_type().unwrap();
         assert_eq!(tokens.len(), token_type.len());
+        let token_type = token_type.map(|ty| match unsafe { std::mem::transmute(ty.unwrap()) } {
+            GGmlTokenType::Normal => TokenType::Normal,
+            GGmlTokenType::Unknown => TokenType::Unknown,
+            GGmlTokenType::Control => TokenType::Control,
+            GGmlTokenType::User => TokenType::UserDefined,
+            GGmlTokenType::Unused => TokenType::Normal,
+            GGmlTokenType::Byte => TokenType::Byte,
+        });
 
         let mut detective = SpaceDetective::new();
         let vocabs = tokens.map(|piece| {
             let piece = piece.unwrap();
             detective.record(piece);
             piece
         });
-        let scores = scores.map(|score| score.unwrap());
-        let is_byte = token_type.map(|ty| GGmlTokenType::Byte as i32 == ty.unwrap());
 
         let unk = gguf.tokenizer_ggml_unknown_token_id().unwrap();
-        let bos = gguf.tokenizer_ggml_bos_token_id().unwrap();
-        let eos = gguf.tokenizer_ggml_eos_token_id().unwrap();
-
-        let bpe = Bpe::new(vocabs, scores, is_byte, unk);
-        let bos_piece = from_utf8(bpe.decode(bos)).unwrap().to_string();
-        let eos_piece = from_utf8(bpe.decode(eos)).unwrap().to_string();
-
-        let mut tokeneer = Tokeneer::new(bpe);
-        tokeneer.extend_special([(bos_piece, vec![bos]), (eos_piece, vec![eos])]);
-
+        let tokeneer = Tokeneer::new(Bpe::new(vocabs, scores, token_type, unk));
         let (en_replace, de_replace) = detective.build_map();
         Self {
             tokenize: Box::new(tokeneer),
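
For reference, a transmute-free way to write the same per-token conversion. This is a hedged sketch, not the crates' actual API: `TokenType` is re-declared locally to mirror tokeneer's enum, `token_type_from_raw` is a hypothetical name, and the integer codes (Normal = 1 through Byte = 6) are assumed from the common GGUF `tokenizer.ggml.token_type` convention. Matching on the raw i32 explicitly makes an out-of-range code fail loudly instead of producing an invalid enum value, which would be undefined behavior under `transmute`.

// Hedged sketch only: a safe equivalent of the mapping added above.
// The discriminants follow the common GGUF convention and are an
// assumption here; `TokenType` mirrors tokeneer's enum so the example
// stays self-contained.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum TokenType {
    Normal,
    Unknown,
    Control,
    UserDefined,
    Byte,
}

fn token_type_from_raw(ty: i32) -> TokenType {
    match ty {
        1 | 5 => TokenType::Normal, // Normal (1) and Unused (5) both become Normal
        2 => TokenType::Unknown,
        3 => TokenType::Control,
        4 => TokenType::UserDefined,
        6 => TokenType::Byte,
        _ => panic!("unexpected ggml token type: {ty}"), // loud failure, no UB
    }
}

fn main() {
    assert_eq!(token_type_from_raw(5), TokenType::Normal);
    assert_eq!(token_type_from_raw(6), TokenType::Byte);
}
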
@@ -98,6 +98,17 @@ impl Tokenizer {
     fn lpe_from_gguf(gguf: &GGufModel) -> Self {
         let tokens = gguf.tokenizer_ggml_tokens().unwrap();
 
+        let token_type = gguf.tokenizer_ggml_token_type().unwrap();
+        assert_eq!(tokens.len(), token_type.len());
+        let token_type = token_type.map(|ty| match unsafe { std::mem::transmute(ty.unwrap()) } {
+            GGmlTokenType::Normal => TokenType::Normal,
+            GGmlTokenType::Unknown => TokenType::Unknown,
+            GGmlTokenType::Control => TokenType::Control,
+            GGmlTokenType::User => TokenType::UserDefined,
+            GGmlTokenType::Unused => TokenType::Normal,
+            GGmlTokenType::Byte => TokenType::Byte,
+        });
+
         let mut detective = SpaceDetective::new();
         let vocabs = tokens.map(|piece| {
             let piece = piece.unwrap();
@@ -115,13 +126,7 @@ impl Tokenizer {
             bos
         });
 
-        let bpe = Lpe::new(vocabs, unk);
-        let bos_piece = from_utf8(bpe.decode(bos)).unwrap().to_string();
-        let eos_piece = from_utf8(bpe.decode(eos)).unwrap().to_string();
-
-        let mut tokeneer = Tokeneer::new(bpe);
-        tokeneer.extend_special([(bos_piece, vec![bos]), (eos_piece, vec![eos])]);
-
+        let tokeneer = Tokeneer::new(Lpe::new(vocabs, token_type, unk));
         let (en_replace, de_replace) = detective.build_map();
         Self {
             tokenize: Box::new(tokeneer),
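
The conversion block now appears verbatim in both `bpe_from_gguf` and `lpe_from_gguf`. A hypothetical follow-up (not part of this commit) could hoist it into one helper so the two constructors cannot drift apart; the sketch below reuses `TokenType` and `token_type_from_raw` from the example above.

// Hypothetical helper, reusing the earlier sketch: convert all raw
// per-token type codes in one pass, then hand the result to either
// constructor.
fn token_types(raw: impl IntoIterator<Item = i32>) -> Vec<TokenType> {
    raw.into_iter().map(token_type_from_raw).collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn maps_a_toy_vocabulary() {
        // One control token (3), two normal pieces (1), one byte token (6).
        assert_eq!(
            token_types([3, 1, 1, 6]),
            [TokenType::Control, TokenType::Normal, TokenType::Normal, TokenType::Byte]
        );
    }
}
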