Commit c93bada

[Tokenizer] Fix decode output with space in decode_token (#9010)
* fix
* fix
Parent commit: 90cef20

1 file changed (+7, -3 lines)
paddlenlp/transformers/tokenizer_utils.py

```diff
@@ -1881,10 +1881,14 @@ def decode_token(
         """tokenizer decoding for the streaming generation use case. This method can be overrided for tokenizer that doesn't follow this API"""
         # The prefix text is necessary only to defeat cleanup algorithms in the decode
         # which decide to add a space or not depending on the surrounding ids.
-        prefix_text = self.decode(all_input_ids[prefix_offset:read_offset], skip_special_tokens=False)
-        new_text = self.decode(all_input_ids[prefix_offset:], skip_special_tokens=False)
+        prefix_text = self.decode(
+            all_input_ids[prefix_offset:read_offset], skip_special_tokens=False, clean_up_tokenization_spaces=False
+        )
+        new_text = self.decode(
+            all_input_ids[prefix_offset:], skip_special_tokens=False, clean_up_tokenization_spaces=False
+        )

-        if len(new_text) > len(prefix_text) and not new_text.endswith("�"):
+        if len(new_text) > len(prefix_text) and not prefix_text.endswith("�") and not new_text.endswith("�"):
             # utf-8 char at the end means it's a potential unfinished byte sequence
             # from byte fallback tokenization.
             # If it's in the middle, it's probably a real invalid id generated
```
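
The patch does two things: it passes `clean_up_tokenization_spaces=False` to both `decode` calls so the cleanup pass cannot add or drop spaces in either string before they are compared, and it additionally holds back output while `prefix_text` still ends in the replacement character `�`, i.e. while a byte-fallback sequence is still incomplete. For context, below is a minimal sketch of the streaming use case `decode_token` serves. It assumes the `(new_text, prefix_offset, read_offset)` return convention suggested by the diff and uses a placeholder model name, so treat it as illustrative rather than the exact PaddleNLP API.

```python
# Minimal streaming-decode sketch. Assumptions: decode_token returns
# (new_text, prefix_offset, read_offset), and "bert-base-uncased" is a
# placeholder model choice, not part of the commit.
from paddlenlp.transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Stand-in for token ids arriving one at a time from a generation loop.
generated_ids = tokenizer("Hello, world! How are you?")["input_ids"]

all_input_ids = []
prefix_offset, read_offset = 0, 0
pieces = []
for token_id in generated_ids:
    all_input_ids.append(token_id)
    new_text, prefix_offset, read_offset = tokenizer.decode_token(
        all_input_ids, prefix_offset, read_offset
    )
    pieces.append(new_text)

# With clean_up_tokenization_spaces=False inside decode_token, the streamed
# pieces concatenate to the same string a one-shot decode would produce;
# with cleanup enabled, spacing around punctuation could differ between the
# prefix and full decodes, corrupting the emitted text deltas.
print("".join(pieces))
```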
