Skip to content

Commit 37ca4b3

Browse files
authored by Anthony Casagrande
fix: preserve special tokens during decode for prompt block separators (#737)
Signed-off-by: Anthony Casagrande <acasagrande@nvidia.com>
1 parent 0ba858c commit 37ca4b3

File tree

2 files changed

+7
-4
lines changed

2 files changed

+7
-4
lines changed

src/aiperf/common/tokenizer.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,10 @@ def __init__(self) -> None:
152152
self._resolved_name: str | None = None
153153
self._call_args = {"add_special_tokens": False}
154154
self._encode_args = {"add_special_tokens": False}
155-
self._decode_args = {"skip_special_tokens": True}
155+
# Prompt generation inserts BOS/EOS tokens as block separators
156+
# (see PromptGenerator._build_token_sequence). Skipping special tokens
157+
# during decode would silently strip those separators.
158+
self._decode_args = {"skip_special_tokens": False}
156159

157160
def _require_init(self) -> None:
158161
"""Raise NotInitializedError if tokenizer is not initialized."""

tests/unit/common/test_tokenizer_kwarg_overrides.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ def test_standard_tokenizer_keeps_defaults(self):
189189
tok = self._make_tokenizer(StandardTokenizerBackend())
190190
assert tok._encode_args == {"add_special_tokens": False}
191191
assert tok._call_args == {"add_special_tokens": False}
192-
assert tok._decode_args == {"skip_special_tokens": True}
192+
assert tok._decode_args == {"skip_special_tokens": False}
193193

194194
def test_kimi_like_overrides_encode_and_call_args(self):
195195
tok = self._make_tokenizer(KimiLikeTokenizerBackend())
@@ -210,14 +210,14 @@ def test_mismatched_call_encode_sets_args_independently(self):
210210
tok = self._make_tokenizer(MismatchedCallEncodeBackend())
211211
assert tok._encode_args == {"allow_special_tokens": False}
212212
assert tok._call_args == {"add_special_tokens": False}
213-
assert tok._decode_args == {"skip_special_tokens": True}
213+
assert tok._decode_args == {"skip_special_tokens": False}
214214

215215
def test_none_tokenizer_is_noop(self):
216216
tok = Tokenizer()
217217
tok._apply_kwarg_overrides()
218218
assert tok._encode_args == {"add_special_tokens": False}
219219
assert tok._call_args == {"add_special_tokens": False}
220-
assert tok._decode_args == {"skip_special_tokens": True}
220+
assert tok._decode_args == {"skip_special_tokens": False}
221221

222222

223223
# -- End-to-end: encode/decode through Tokenizer wrapper --

0 commit comments

Comments (0)