     Iterator,
     List,
     Literal,
+    Optional,
     Sequence,
-    TypedDict,
     Union,
 )
 
 import tiktoken
-from tiktoken.load import load_tiktoken_bpe
 
+from tiktoken.load import load_tiktoken_bpe
 
 logger = getLogger(__name__)
 
 
-Role = Literal["system", "user", "assistant"]
-
+# The tiktoken tokenizer can handle <=400k chars without
+# pyo3_runtime.PanicException.
+TIKTOKEN_MAX_ENCODE_CHARS = 400_000
 
-class Message(TypedDict):
-    role: Role
-    content: str
+# https://github.com/openai/tiktoken/issues/195
+# Here we iterate over subsequences and split if we exceed the limit
+# of max consecutive non-whitespace or whitespace characters.
+MAX_NO_WHITESPACES_CHARS = 25_000
 
 
-Dialog = Sequence[Message]
+_INSTANCE = None
 
 
 class Tokenizer:
     """
-    tokenizing and encoding/decoding text using the Tiktoken tokenizer.
+    Tokenizing and encoding/decoding text using the Tiktoken tokenizer.
     """
 
     special_tokens: Dict[str, int]
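
The two constants hoisted to module scope here drive the chunking inside encode() (their previous home, as a later hunk shows): input is first cut into windows of at most TIKTOKEN_MAX_ENCODE_CHARS, and each window is further split so no slice has more than MAX_NO_WHITESPACES_CHARS consecutive whitespace or non-whitespace characters. A minimal sketch of that two-level split, mirroring the generator in encode() further down:

    # Sketch only; Tokenizer and the two constants come from this module.
    def chunks(s: str):
        for i in range(0, len(s), TIKTOKEN_MAX_ENCODE_CHARS):
            window = s[i : i + TIKTOKEN_MAX_ENCODE_CHARS]
            # Second level: cap runs of (non-)whitespace characters.
            yield from Tokenizer._split_whitespaces_or_nonwhitespaces(
                window, MAX_NO_WHITESPACES_CHARS
            )
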
@@ -49,14 +51,23 @@ class Tokenizer:
 
     pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"  # noqa: E501
 
+    @classmethod
+    def get_instance(cls):
+        global _INSTANCE
+
+        if _INSTANCE is None:
+            _INSTANCE = Tokenizer(
+                os.path.join(os.path.dirname(__file__), "tokenizer.model")
+            )
+        return _INSTANCE
+
     def __init__(self, model_path: str):
         """
         Initializes the Tokenizer with a Tiktoken model.
 
         Args:
             model_path (str): The path to the Tiktoken model file.
         """
-        # reload tokenizer
         assert os.path.isfile(model_path), model_path
 
         mergeable_ranks = load_tiktoken_bpe(model_path)
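
The new get_instance() classmethod lazily constructs a single shared Tokenizer from the tokenizer.model file bundled next to this module, so callers no longer need to thread a model path around. A usage sketch:

    tok = Tokenizer.get_instance()   # first call loads tokenizer.model
    same = Tokenizer.get_instance()  # later calls reuse the cached instance
    assert tok is same
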
@@ -66,16 +77,21 @@ def __init__(self, model_path: str):
6677 "<|end_of_text|>" ,
6778 "<|reserved_special_token_0|>" ,
6879 "<|reserved_special_token_1|>" ,
69- "<|reserved_special_token_2 |>" ,
70- "<|reserved_special_token_3 |>" ,
80+ "<|finetune_right_pad_id |>" ,
81+ "<|step_id |>" ,
7182 "<|start_header_id|>" ,
7283 "<|end_header_id|>" ,
73- "<|reserved_special_token_4 |>" ,
84+ "<|eom_id |>" , # end of message
7485 "<|eot_id|>" , # end of turn
75- ] + [
76- f"<|reserved_special_token_{ i } |>"
77- for i in range (5 , self .num_reserved_special_tokens - 5 )
86+ "<|python_tag|>" ,
87+ "<|image|>" ,
88+ ]
89+ reserved_tokens = [
90+ f"<|reserved_special_token_{ 2 + i } |>"
91+ for i in range (self .num_reserved_special_tokens - len (special_tokens ))
7892 ]
93+ special_tokens = special_tokens + reserved_tokens
94+
7995 self .special_tokens = {
8096 token : num_base_tokens + i for i , token in enumerate (special_tokens )
8197 }
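
Because the explicit names now sit at the front of the list and reserved_tokens pads it out to num_reserved_special_tokens entries, every special token keeps a stable id immediately after the base vocabulary. A worked example, assuming the usual Llama 3 sizes (128,000 BPE ranks and 256 special slots; neither number appears in this hunk):

    # Assumed sizes, for illustration only:
    # num_base_tokens = 128_000 (len(mergeable_ranks)), num_reserved_special_tokens = 256
    # "<|begin_of_text|>" -> 128000
    # "<|end_of_text|>"   -> 128001
    # ...
    # "<|image|>"         -> 128011  (12th and last explicit token)
    # "<|reserved_special_token_2|>" .. "<|reserved_special_token_245|>" -> 128012 .. 128255
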
@@ -85,28 +101,28 @@ def __init__(self, model_path: str):
             mergeable_ranks=mergeable_ranks,
             special_tokens=self.special_tokens,
         )
-        logger.info(f"Reloaded SentencePiece model from {model_path}")
 
+        self.n_words: int = num_base_tokens + len(special_tokens)
         # BOS / EOS token IDs
-        self.n_words: int = self.model.n_vocab
         self.bos_id: int = self.special_tokens["<|begin_of_text|>"]
         self.eos_id: int = self.special_tokens["<|end_of_text|>"]
-        self.pad_id: int = -1
-        self.stop_tokens = {
-            self.special_tokens["<|end_of_text|>"],
+        self.eot_id: int = self.special_tokens["<|eot_id|>"]
+        self.eom_id: int = self.special_tokens["<|eom_id|>"]
+        self.python_tag_id = self.special_tokens["<|python_tag|>"]
+        self.pad_id: int = self.special_tokens["<|finetune_right_pad_id|>"]
+        self.stop_tokens = [
+            self.eos_id,
+            self.special_tokens["<|eom_id|>"],
             self.special_tokens["<|eot_id|>"],
-        }
-        logger.info(
-            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
-        )
+        ]
 
     def encode(
         self,
         s: str,
         *,
         bos: bool,
         eos: bool,
-        allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),  # noqa B006
+        allowed_special: Optional[Union[Literal["all"], AbstractSet[str]]] = None,
         disallowed_special: Union[Literal["all"], Collection[str]] = (),
     ) -> List[int]:
         """
@@ -125,22 +141,15 @@ def encode(
         By default, setting disallowed_special=() encodes a string by ignoring
         special tokens. Specifically:
         - Setting `disallowed_special` to () will cause all text corresponding
           to special tokens to be encoded as natural text (instead of raising
           an error).
         - Setting `allowed_special` to "all" will cause all text corresponding
           to special tokens to be encoded as special tokens.
         """
+        if allowed_special is None:
+            allowed_special = set()
         assert type(s) is str
 
-        # The tiktoken tokenizer can handle <=400k chars without
-        # pyo3_runtime.PanicException (may go beyond 400k)
-        TIKTOKEN_MAX_ENCODE_CHARS = 400_000
-
-        # https://github.com/openai/tiktoken/issues/195
-        # Here we iterate over subsequences and split if we exceed the limit
-        # of max consecutive non-whitespace or whitespace characters.
-        MAX_NO_WHITESPACES_CHARS = 25_000
-
         substrs = (
             substr
             for i in range(0, len(s), TIKTOKEN_MAX_ENCODE_CHARS)
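
How the two switches in the docstring above play out in practice, as a short sketch (the second call ends with the special id itself):

    tok = Tokenizer.get_instance()
    s = "some text <|eot_id|>"
    tok.encode(s, bos=False, eos=False)  # "<|eot_id|>" is tokenized as plain text
    tok.encode(s, bos=False, eos=False, allowed_special="all")  # ends with tok.eot_id
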
@@ -173,16 +182,16 @@ def decode(self, t: Sequence[int]) -> str:
         Returns:
             str: The decoded string.
         """
-        # typecast is safe here, Tiktoken doesn't do anything list-related with the sequence.
+        # Typecast is safe here. Tiktoken doesn't do anything list-related with the sequence.
         return self.model.decode(cast(List[int], t))
 
     @staticmethod
     def _split_whitespaces_or_nonwhitespaces(
         s: str, max_consecutive_slice_len: int
     ) -> Iterator[str]:
         """
-        Split the string `s` so that each substring contains no more than `max_consecutive_slice_len`
-        consecutive whitespaces or consecutive non-whitespaces
+        Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
+        consecutive whitespaces or consecutive non-whitespaces.
         """
         current_slice_len = 0
         current_slice_is_space = s[0].isspace() if len(s) > 0 else False
@@ -201,33 +210,3 @@ def _split_whitespaces_or_nonwhitespaces(
                 slice_start = i
                 current_slice_len = 1
         yield s[slice_start:]
-
-
-class ChatFormat:
-    def __init__(self, tokenizer: Tokenizer):
-        self.tokenizer = tokenizer
-
-    def encode_header(self, message: Message) -> List[int]:
-        tokens = []
-        tokens.append(self.tokenizer.special_tokens["<|start_header_id|>"])
-        tokens.extend(self.tokenizer.encode(message["role"], bos=False, eos=False))
-        tokens.append(self.tokenizer.special_tokens["<|end_header_id|>"])
-        tokens.extend(self.tokenizer.encode("\n\n", bos=False, eos=False))
-        return tokens
-
-    def encode_message(self, message: Message) -> List[int]:
-        tokens = self.encode_header(message)
-        tokens.extend(
-            self.tokenizer.encode(message["content"].strip(), bos=False, eos=False)
-        )
-        tokens.append(self.tokenizer.special_tokens["<|eot_id|>"])
-        return tokens
-
-    def encode_dialog_prompt(self, dialog: Dialog) -> List[int]:
-        tokens = []
-        tokens.append(self.tokenizer.special_tokens["<|begin_of_text|>"])
-        for message in dialog:
-            tokens.extend(self.encode_message(message))
-        # Add the start of an assistant message for the model to complete
-        tokens.extend(self.encode_header({"role": "assistant", "content": ""}))
-        return tokens
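
With ChatFormat and the Message/Dialog types removed, this module is now purely about tokenization. A minimal round-trip sketch of what remains, assuming the bundled tokenizer.model is present:

    tok = Tokenizer.get_instance()
    ids = tok.encode("Hello, world!", bos=True, eos=True)
    assert ids[0] == tok.bos_id and ids[-1] == tok.eos_id
    # Decoding the inner ids recovers the original text.
    assert tok.decode(ids[1:-1]) == "Hello, world!"
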