Commit 21073db

Merge branch 'PaddlePaddle:develop' into develop
2 parents 5b36686 + 0ee3767

File tree

3 files changed: +14 additions, -4 deletions


paddleformers/transformers/legacy/tokenizer_utils_base.py

Lines changed: 0 additions & 2 deletions
@@ -3503,8 +3503,6 @@ def decode_token(
             else:
                 return "", prefix_offset, len(all_input_ids)
         else:
-            if len(all_input_ids[prefix_offset:]) > 3:
-                return new_text, len(all_input_ids), len(all_input_ids)
             return "", prefix_offset, read_offset

     def batch_decode(

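For context, the two deleted lines sat on the "not ready" path of decode_token, the incremental-detokenization helper used for streaming generation. On that path, new_text typically still ends in a U+FFFD replacement character because the pending ids form an unfinished multi-byte sequence (an emoji split across byte-fallback tokens, for example), so force-flushing it whenever more than 3 ids were pending could leak "�" into the stream. Below is a minimal sketch of the contract the fixed code follows; decode_token_sketch is a hypothetical name, and the (all_input_ids, prefix_offset, read_offset) signature is inferred from the test added in this commit, not copied from the library:

def decode_token_sketch(tokenizer, all_input_ids, prefix_offset, read_offset):
    # Decode the previously emitted window and the full pending window.
    prefix_text = tokenizer.decode(all_input_ids[prefix_offset:read_offset])
    new_text = tokenizer.decode(all_input_ids[prefix_offset:])
    if len(new_text) > len(prefix_text) and not new_text.endswith("\ufffd"):
        # Complete text is available: emit only the new suffix and advance.
        return new_text[len(prefix_text):], read_offset, len(all_input_ids)
    # Incomplete byte sequence: emit nothing and keep both offsets, which is
    # what the fixed code does now that the ">3 pending ids" escape hatch is gone.
    return "", prefix_offset, read_offset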
paddleformers/transformers/tokenizer_utils.py

Lines changed: 0 additions & 2 deletions
@@ -499,8 +499,6 @@ def decode_token(
             else:
                 return "", prefix_offset, len(all_input_ids)
         else:
-            if len(all_input_ids[prefix_offset:]) > 3:
-                return new_text, len(all_input_ids), len(all_input_ids)
             return "", prefix_offset, read_offset

tests/transformers/test_hf_tokenizer.py

Lines changed: 14 additions & 0 deletions
@@ -142,3 +142,17 @@ def test_encode_chat_inputs(self):
         }
         encode_dict_text = tokenizer.encode_chat_inputs(dict_query)
         self.assertListEqual(encode_text["conversations"], encode_dict_text)
+
+    def test_tokenizer_decode_token(self) -> None:
+        tokenizer = AutoTokenizer.from_pretrained("PaddleNLP/Qwen2.5-7B", download_hub="aistudio")
+        test_cases = ["1. 百度 2. 腾讯", "hello world! I like eating banana", "🤓😖", "🤓😖testtest"]
+        for test_case in test_cases:
+            input_ids = tokenizer(test_case)["input_ids"]
+            decoded_text = tokenizer.decode(input_ids)
+            stream_decoded_text = ""
+            offset = 0
+            token_offset = 0
+            for i in range(len(input_ids)):
+                token_text, offset, token_offset = tokenizer.decode_token(input_ids[: i + 1], offset, token_offset)
+                stream_decoded_text += token_text
+            self.assertEqual(decoded_text, stream_decoded_text)
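The new test pins down the streaming invariant: concatenating the chunks that decode_token yields over successive prefixes must equal a one-shot tokenizer.decode of the full id sequence, including for the emoji inputs that would previously have tripped the deleted branch. The same loop, restated as a hypothetical standalone helper (stream_decode is not a library function):

def stream_decode(tokenizer, input_ids):
    # Replay the ids one at a time, the way tokens arrive during generation,
    # accumulating whatever decode_token releases at each step.
    text, prefix_offset, read_offset = "", 0, 0
    for i in range(len(input_ids)):
        chunk, prefix_offset, read_offset = tokenizer.decode_token(
            input_ids[: i + 1], prefix_offset, read_offset
        )
        text += chunk
    return text  # expected to equal tokenizer.decode(input_ids)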
