We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 2123476, commit d144036 (Copy full SHA for d144036)
paddleformers/transformers/tokenizer_utils.py
@@ -582,7 +582,7 @@ def decode_token(
582
all_input_ids[prefix_offset:], skip_special_tokens=skip_special_tokens, clean_up_tokenization_spaces=False
583
)
584
585
- if len(new_text) > len(prefix_text) and "�" not in prefix_text and "�" not in new_text:
+ if len(new_text) > len(prefix_text) and not new_text.endswith("�") and not new_text[:-1].endswith("�"):
586
# utf-8 char at the end means it's a potential unfinished byte sequence
587
# from byte fallback tokenization.
588
# If it's in the middle, it's probably a real invalid id generated
0 commit comments