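Fix decode_token: check for "�" only in the last two characters of new_text instead of anywhere in prefix_text/new_text (#2698)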

yuanlehome · web-flow · commit d144036ba953 · 2025-09-28T19:23:39.000+08:00
diff --git a/paddleformers/transformers/tokenizer_utils.py b/paddleformers/transformers/tokenizer_utils.py
@@ -582,7 +582,7 @@ def decode_token(
             all_input_ids[prefix_offset:], skip_special_tokens=skip_special_tokens, clean_up_tokenization_spaces=False
         )
 
-        if len(new_text) > len(prefix_text) and "�" not in prefix_text and "�" not in new_text:
+        if len(new_text) > len(prefix_text) and not new_text.endswith("�") and not new_text[:-1].endswith("�"):
             # utf-8 char at the end means it's a potential unfinished byte sequence
             # from byte fallback tokenization.
             # If it's in the middle, it's probably a real invalid id generated

Original file line number	Diff line number	Diff line change
`@@ -582,7 +582,7 @@ def decode_token(`
`582`	`582`	`all_input_ids[prefix_offset:], skip_special_tokens=skip_special_tokens, clean_up_tokenization_spaces=False`
`583`	`583`	`)`
`584`	`584`
`585`		`- if len(new_text) > len(prefix_text) and "�" not in prefix_text and "�" not in new_text:`
	`585`	`+ if len(new_text) > len(prefix_text) and not new_text.endswith("�") and not new_text[:-1].endswith("�"):`
`586`	`586`	`# utf-8 char at the end means it's a potential unfinished byte sequence`
`587`	`587`	`# from byte fallback tokenization.`
`588`	`588`	`# If it's in the middle, it's probably a real invalid id generated`