[BugFix] fix decode_token (#2544)

yuanlehome · liuyuanle · web-flow · commit 109672603eeb · 2025-09-05T11:31:25.000+08:00
Co-authored-by: liuyuanle <liuyuanle@baidu.com>
diff --git a/paddleformers/transformers/legacy/tokenizer_utils_base.py b/paddleformers/transformers/legacy/tokenizer_utils_base.py
@@ -3491,7 +3491,7 @@ def decode_token(
             all_input_ids[prefix_offset:], skip_special_tokens=skip_special_tokens, clean_up_tokenization_spaces=False
         )
 
-        if len(new_text) > len(prefix_text) and not prefix_text.endswith("�") and not new_text.endswith("�"):
+        if len(new_text) > len(prefix_text) and "�" not in prefix_text and "�" not in new_text:
             # utf-8 char at the end means it's a potential unfinished byte sequence
             # from byte fallback tokenization.
             # If it's in the middle, it's probably a real invalid id generated

Original file line number	Diff line number	Diff line change
`@@ -3491,7 +3491,7 @@ def decode_token(`
`3491`	`3491`	`all_input_ids[prefix_offset:], skip_special_tokens=skip_special_tokens, clean_up_tokenization_spaces=False`
`3492`	`3492`	`)`
`3493`	`3493`
`3494`		`- if len(new_text) > len(prefix_text) and not prefix_text.endswith("�") and not new_text.endswith("�"):`
	`3494`	`+ if len(new_text) > len(prefix_text) and "�" not in prefix_text and "�" not in new_text:`
`3495`	`3495`	`# utf-8 char at the end means it's a potential unfinished byte sequence`
`3496`	`3496`	`# from byte fallback tokenization.`
`3497`	`3497`	`# If it's in the middle, it's probably a real invalid id generated`