Skip to content

Commit 0ee3767

Browse files
authored
add test_tokenizer_decode_token (#2562)
1 parent b09c70d commit 0ee3767

File tree

1 file changed

+14
-0
lines changed

1 file changed

+14
-0
lines changed

tests/transformers/test_hf_tokenizer.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,3 +142,17 @@ def test_encode_chat_inputs(self):
142142
}
143143
encode_dict_text = tokenizer.encode_chat_inputs(dict_query)
144144
self.assertListEqual(encode_text["conversations"], encode_dict_text)
145+
146+
def test_tokenizer_decode_token(self) -> None:
    """Verify incremental ``decode_token`` matches a one-shot ``decode``.

    For each sample string, the input ids are re-decoded prefix by prefix
    via ``tokenizer.decode_token``, which carries ``offset``/``token_offset``
    cursors between calls. The concatenation of the streamed pieces must
    equal the full ``tokenizer.decode`` output — including multi-byte
    emoji whose UTF-8 bytes span several tokens.
    """
    # NOTE(review): fetches the model from AIStudio at test time — assumes
    # network access or a warm download cache; confirm CI provides one.
    tokenizer = AutoTokenizer.from_pretrained("PaddleNLP/Qwen2.5-7B", download_hub="aistudio")
    test_cases = ["1. 百度 2. 腾讯", "hello world! I like eating banana", "🤓😖", "🤓😖testtest"]
    for sample in test_cases:
        ids = tokenizer(sample)["input_ids"]
        expected = tokenizer.decode(ids)
        # Replay the ids as a growing prefix, carrying the decoder
        # cursors forward exactly as a streaming caller would.
        streamed = ""
        offset, token_offset = 0, 0
        for end in range(1, len(ids) + 1):
            piece, offset, token_offset = tokenizer.decode_token(ids[:end], offset, token_offset)
            streamed += piece
        self.assertEqual(expected, streamed)

0 commit comments

Comments
 (0)