2 files changed, +30 −3 lines changed

@@ -905,9 +905,15 @@ def reduced_vocabulary(
             )
 
         if token_str:
-            # invalid utf-8 sequences are replaced with � (\ufffd), but there
-            # might also be tokens specifically for �, ��, ���, etc.
-            if "\ufffd" in token_str and not re_replacement_seq.match(token):
+            if isinstance(token, bytes):
+                # Handle BPE tokenizers where the tokens are directly stored as bytes
+                # https://github.com/QwenLM/Qwen/blob/main/tokenization_note.md#regular-tokens
+                token_str = "".join(byte_symbol(b) for b in token)
+
+            elif "\ufffd" in token_str and not re_replacement_seq.match(token):
+                # invalid utf-8 sequences are replaced with � (\ufffd), but there
+                # might also be tokens specifically for �, ��, ���, etc.
+
                 if re_llama_byte_token.match(token):
                     # llama-like tokenizers have <0xXX> tokens for all
                     # bytes >= 0x80 and represent all incomplete utf-8
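
For context, the new `isinstance(token, bytes)` branch flattens a raw-bytes token into the FSM's transition-key alphabet via `byte_symbol`. Below is a minimal sketch of that convention, inferred from the assertion in the new test further down rather than taken from the helper's actual source:

```python
# Sketch of the assumed byte_symbol convention (inferred from the test's
# expected value "\x00A1"; not necessarily the library's exact code).
def byte_symbol(byte: int) -> str:
    # Assumption: ASCII bytes stay as plain characters, while bytes >= 0x80
    # are encoded as a NUL prefix plus the byte's uppercase hex digits,
    # e.g. 0xA1 -> "\x00A1", keeping them disjoint from literal text.
    return f"\x00{byte:02X}" if byte >= 0x80 else chr(byte)

# A Qwen-style raw-bytes token is then flattened one byte at a time:
assert "".join(byte_symbol(b) for b in b"\xa1") == "\x00A1"
```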
@@ -714,8 +714,29 @@ def test_reduced_vocabulary_with_rare_tokens(rare_token):
 
     [1]: https://github.com/dottxt-ai/outlines/pull/763
     [2]: https://github.com/dottxt-ai/outlines/pull/948
+    [3]: https://github.com/dottxt-ai/outlines/pull/1153
     """
     tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
     tokenizer = TransformerTokenizer(tokenizer=tokenizer)
     tokenizer.vocabulary[rare_token] = max(tokenizer.vocabulary.values()) + 1
     reduced_vocabulary(tokenizer)
+
+
+def test_reduced_vocabulary_with_byte_tokens():
+    class MockTokenizer:
+        vocabulary = {
+            "string": 1,
+            b"\xa1": 2,  # Qwen-style raw-bytes token
+            "eos": 3,
+        }
+        special_tokens = {"eos"}
+        eos_token_id = 3
+
+        def convert_token_to_string(self, token):
+            return b"\xef\xbf\xbd".decode()
+
+    reduced_vocab = reduced_vocabulary(MockTokenizer())
+
+    # See fsm.regex.get_token_transition_keys()
+    # FSM transition keys represent bytes as <null_prefix><hex_byte>
+    assert reduced_vocab[0][1][0] == "\x00A1"
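
The chained indexing in the final assertion is dense. Assuming, as the test implies, that `reduced_vocabulary` returns a `(vocabulary, empty_token_ids)` pair whose first element is a sequence of `(token_str, token_ids)` tuples, it unpacks as in the sketch below; the import path is an assumption based on the `fsm.regex` reference in the test comment:

```python
from outlines.fsm.regex import reduced_vocabulary  # assumed import path

def check_byte_token_symbol(tokenizer) -> None:
    # Unpack the assumed (vocabulary, empty_token_ids) return value
    vocabulary, _empty_token_ids = reduced_vocabulary(tokenizer)
    # vocabulary[1] is the second entry, i.e. the pair for the b"\xa1" token
    # ("eos" is excluded as a special token)
    token_str, _token_ids = vocabulary[1]
    # the non-ASCII byte 0xA1 becomes NUL prefix + uppercase hex "A1"
    assert token_str == "\x00A1"
```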