@@ -153,11 +153,23 @@ def generator_custom_text_edge_cases() -> Iterator[str]:
         'Ⅵ-a',  # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
         '\uFEFF//',  # unicode_ranges_control, 0xFEFF (BOM)
         'Cửa Việt',  # llama-3, ignore_merges = true
-        '<s>a',  # TODO: Phi-3 fail
+        '<s>a',  # Phi-3 fail
+        '<unk><|endoftext|><s>',  # Phi-3 fail
         'a\n a',  # TODO: Bert fail
     ]


+def generator_random_special_tokens(special_tokens: list[str], iterations=100) -> Iterator[str]:
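+    """Brute force random sequences of special tokens mixed with a few ordinary words"""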
+    special_tokens = set(special_tokens)
+    special_tokens.update([" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"])
+    special_tokens = list(sorted(special_tokens))
+    rand = random.Random()
+    for m in range(iterations):
+        rand.seed(m)
+        words = rand.choices(special_tokens, k=500)
+        yield "".join(words)
+
+
 def generator_vocab_words(vocab: list[str]) -> Iterator[str]:
     """Brute force check all vocab words"""
     yield from vocab
@@ -289,14 +301,31 @@ def func_tokenize1(text: str):
     vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True)))
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text())
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases())
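+    # new: long random runs of special tokens, exercising special-token splitting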
+    test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_special_tokens(tokenizer.all_special_tokens, 10_000))
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_vocab_words(vocab))
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_chars(10_000))
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_chars(vocab, 10_000))
-    test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 10_000))
+    test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 5_000))
     # test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_bytes(10_000))  # FAIL

     model.free()


 if __name__ == "__main__":
-    main()
+    # main()
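+    # run the comparison directly against a hardcoded list of local models instead of CLI arguments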
+
+    path_tokenizers = "./models/tokenizers/"
+    path_vocab_format = "./models/ggml-vocab-%s.gguf"
+
+    # import os
+    # tokenizers = os.listdir(path_tokenizers)
+    tokenizers = [
+        "llama-spm",  # SPM
+        "phi-3",  # SPM
+    ]
+
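+    # assumes ./models/ggml-vocab-<name>.gguf and ./models/tokenizers/<name>/ exist for each name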
+    for tokenizer in tokenizers:
+        print("\n" + "=" * 50 + "\n" + tokenizer + "\n")  # noqa
+        vocab_file = path_vocab_format % tokenizer
+        dir_tokenizer = path_tokenizers + "/" + tokenizer
+        main([vocab_file, dir_tokenizer, "--verbose"])