File tree Expand file tree Collapse file tree 1 file changed +7
-3
lines changed Expand file tree Collapse file tree 1 file changed +7
-3
lines changed Original file line number Diff line number Diff line change @@ -1881,10 +1881,14 @@ def decode_token(
1881
1881
"""tokenizer decoding for the streaming generation use case. This method can be overrided for tokenizer that doesn't follow this API"""
1882
1882
# The prefix text is necessary only to defeat cleanup algorithms in the decode
1883
1883
# which decide to add a space or not depending on the surrounding ids.
1884
- prefix_text = self .decode (all_input_ids [prefix_offset :read_offset ], skip_special_tokens = False )
1885
- new_text = self .decode (all_input_ids [prefix_offset :], skip_special_tokens = False )
1884
+ prefix_text = self .decode (
1885
+ all_input_ids [prefix_offset :read_offset ], skip_special_tokens = False , clean_up_tokenization_spaces = False
1886
+ )
1887
+ new_text = self .decode (
1888
+ all_input_ids [prefix_offset :], skip_special_tokens = False , clean_up_tokenization_spaces = False
1889
+ )
1886
1890
1887
- if len (new_text ) > len (prefix_text ) and not new_text .endswith ("�" ):
1891
+ if len (new_text ) > len (prefix_text ) and not prefix_text . endswith ( "�" ) and not new_text .endswith ("�" ):
1888
1892
# A replacement character ("�") at the end means it's a potential unfinished byte sequence
1889
1893
# from byte fallback tokenization.
1890
1894
# If it's in the middle, it's probably a real invalid id generated
You can’t perform that action at this time.
0 commit comments