
Commit e46598b

IzzyPutterman, the-david-o, and nv-hwoo authored
Set add special tokens to false (#672)
* Set add special tokens to false
* formatting
* Fix tests
* Ignore special tokens and add test
* Add a wrapper around huggingface tokenizer

---------

Co-authored-by: David Yastremsky <[email protected]>
Co-authored-by: Hyunjae Woo <[email protected]>
1 parent 7df8f67 commit e46598b
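
For context, a minimal sketch of the behavior this commit addresses (illustration only, not part of the diff): the llama tokenizer used as the default prepends the <s> beginning-of-sentence token unless add_special_tokens=False is passed, inflating every token count by one.

    # Sketch: default special-token behavior of the default llama tokenizer.
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")

    ids = tok.encode("This is test.")
    assert ids[0] == 1  # <s> (BOS, id 1) is prepended by default

    ids = tok.encode("This is test.", add_special_tokens=False)
    assert 1 not in ids  # no BOS once special tokens are disabled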

File tree

src/c++/perf_analyzer/genai-perf/genai_perf/tokenizer.py
src/c++/perf_analyzer/genai-perf/tests/test_llm_metrics.py
src/c++/perf_analyzer/genai-perf/tests/test_tokenizer.py

3 files changed: +116 −27 lines

src/c++/perf_analyzer/genai-perf/genai_perf/tokenizer.py

Lines changed: 46 additions & 24 deletions
@@ -14,43 +14,65 @@
 
 import contextlib
 import io
-from typing import Union
+from typing import List
 
 from genai_perf.exceptions import GenAIPerfException
 
 # Silence tokenizer warning on import
 with contextlib.redirect_stdout(io.StringIO()) as stdout, contextlib.redirect_stderr(
     io.StringIO()
 ) as stderr:
-    from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
+    from transformers import AutoTokenizer, BatchEncoding
     from transformers import logging as token_logger
 
     token_logger.set_verbosity_error()
 
-Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
 DEFAULT_TOKENIZER = "hf-internal-testing/llama-tokenizer"
 
 
-def get_tokenizer(
-    tokenizer_model: str,
-) -> Tokenizer:
+class Tokenizer:
     """
-    Download the tokenizer from Huggingface.co
+    A small wrapper class around Huggingface Tokenizer
     """
-    try:
-        # Silence tokenizer warning on first use
-        with contextlib.redirect_stdout(
-            io.StringIO()
-        ) as stdout, contextlib.redirect_stderr(io.StringIO()) as stderr:
-            tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)
-    except Exception as e:
-        raise GenAIPerfException(e)
-
-    # Disable add_bos_token so that llama tokenizer does not add bos token
-    # (aka. beginning-of-sentence) to the beginning of every response
-    # outputs, increasing the token count by 1 for each output response.
-    # Note: The type is being ignored here, because not all tokenizers have
-    # an add_bos_token variable.
-    tokenizer.add_bos_token = False  # type: ignore
-
-    return tokenizer
+
+    def __init__(self, name: str) -> None:
+        """
+        Initialize by downloading the tokenizer from Huggingface.co
+        """
+        try:
+            # Silence tokenizer warning on first use
+            with contextlib.redirect_stdout(
+                io.StringIO()
+            ) as stdout, contextlib.redirect_stderr(io.StringIO()) as stderr:
+                tokenizer = AutoTokenizer.from_pretrained(name)
+        except Exception as e:
+            raise GenAIPerfException(e)
+
+        self._tokenizer = tokenizer
+
+        # default tokenizer parameters for __call__, encode, decode methods
+        self._call_args = {"add_special_tokens": False}
+        self._encode_args = {"add_special_tokens": False}
+        self._decode_args = {"skip_special_tokens": True}
+
+    def __call__(self, text, **kwargs) -> BatchEncoding:
+        self._call_args.update(kwargs)
+        return self._tokenizer(text, **self._call_args)
+
+    def encode(self, text, **kwargs) -> List[int]:
+        self._encode_args.update(kwargs)
+        return self._tokenizer.encode(text, **self._encode_args)
+
+    def decode(self, token_ids, **kwargs) -> str:
+        self._decode_args.update(kwargs)
+        return self._tokenizer.decode(token_ids, **self._decode_args)
+
+    def __repr__(self) -> str:
+        return self._tokenizer.__repr__()
+
+
+def get_tokenizer(tokenizer_model: str) -> Tokenizer:
+    """
+    Return tokenizer for the given model name
+    """
+    return Tokenizer(tokenizer_model)
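
The wrapper centralizes the special-token defaults, so a brief usage sketch (assuming the genai_perf package layout above) looks like:

    from genai_perf.tokenizer import DEFAULT_TOKENIZER, get_tokenizer

    tokenizer = get_tokenizer(DEFAULT_TOKENIZER)

    # add_special_tokens=False is now the default, so <s>/</s> no longer
    # inflate input/output token counts.
    tokens = tokenizer.encode("This is test.")
    text = tokenizer.decode(tokens)  # skip_special_tokens=True by default

    # Per-call keyword arguments are forwarded to the underlying tokenizer.
    tokens = tokenizer.encode("This is test.", add_special_tokens=True)

One design note worth flagging: because __call__, encode, and decode update the stored default dicts with the passed kwargs, an override such as add_special_tokens=True persists for later calls on the same Tokenizer instance.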

src/c++/perf_analyzer/genai-perf/tests/test_llm_metrics.py

Lines changed: 37 additions & 3 deletions
@@ -33,10 +33,8 @@
 
 import numpy as np
 import pytest
-from genai_perf.llm_inputs.llm_inputs import OutputFormat
-from genai_perf.llm_metrics import LLMMetrics, LLMProfileDataParser
+from genai_perf.llm_metrics import LLMMetrics, LLMProfileDataParser, ResponseFormat
 from genai_perf.tokenizer import DEFAULT_TOKENIZER, get_tokenizer
-from transformers import AutoTokenizer
 
 
 def ns_to_sec(ns: int) -> Union[int, float]:
@@ -380,6 +378,42 @@ def test_merged_sse_response(self, mock_read_write: pytest.MonkeyPatch) -> None:
         pd._preprocess_response(res_timestamps, res_outputs)
         assert res_outputs[1]["response"] == expected_response
 
+    def test_no_special_tokens(self, mock_read_write: pytest.MonkeyPatch) -> None:
+        """Test special tokens are not included when counting input/output tokens."""
+        tokenizer = get_tokenizer(DEFAULT_TOKENIZER)
+        pd = LLMProfileDataParser(
+            filename=Path("openai_profile_export.json"),
+            tokenizer=tokenizer,
+        )
+
+        # There are 3 special tokens in the default tokenizer
+        # - <unk>: 0 (unknown)
+        # - <s>: 1 (beginning of sentence)
+        # - </s>: 2 (end of sentence)
+        special_token_ids = list(tokenizer._tokenizer.added_tokens_encoder.values())
+
+        # Check if special tokens are present in request input
+        req_input = {"text_input": "This is test input."}
+        tokens = pd._tokenize_triton_request_input(req_input)
+        assert all([s not in tokens for s in special_token_ids])
+
+        pd._response_format = ResponseFormat.OPENAI_COMPLETIONS
+        req_input = {"payload": '{"prompt":"This is test input."}'}
+        tokens = pd._tokenize_openai_request_input(req_input)
+        assert all([s not in tokens for s in special_token_ids])
+
+        pd._response_format = ResponseFormat.OPENAI_CHAT_COMPLETIONS
+        req_input = {"payload": '{"messages":[{"content":"This is test input."}]}'}
+        tokens = pd._tokenize_openai_request_input(req_input)
+        assert all([s not in tokens for s in special_token_ids])
+
+        # Check if special tokens are present in the responses
+        res_outputs = ["This", "is", "test", "input."]
+        tokens = []
+        for t in pd._run_tokenizer(res_outputs):
+            tokens += t
+        assert all([s not in tokens for s in special_token_ids])
+
     def test_llm_metrics_get_base_name(self) -> None:
         """Test get_base_name method in LLMMetrics class."""
         # initialize with dummy values

src/c++/perf_analyzer/genai-perf/tests/test_tokenizer.py

Lines changed: 33 additions & 0 deletions
@@ -41,3 +41,36 @@ def test_non_default_tokenizer(self):
     def test_bad_tokenizer(self):
         with pytest.raises(GenAIPerfException):
             get_tokenizer("bad_tokenizer")
+
+    def test_default_args(self):
+        tokenizer_model = DEFAULT_TOKENIZER
+        tokenizer = get_tokenizer(tokenizer_model)
+
+        # There are 3 special tokens in the default tokenizer
+        # - <unk>: 0 (unknown)
+        # - <s>: 1 (beginning of sentence)
+        # - </s>: 2 (end of sentence)
+        special_tokens = list(tokenizer._tokenizer.added_tokens_encoder.keys())
+        special_token_ids = list(tokenizer._tokenizer.added_tokens_encoder.values())
+
+        # special tokens are disabled by default
+        text = "This is test."
+        tokens = tokenizer(text)["input_ids"]
+        assert all([s not in tokens for s in special_token_ids])
+
+        tokens = tokenizer.encode(text)
+        assert all([s not in tokens for s in special_token_ids])
+
+        output = tokenizer.decode(tokens)
+        assert all([s not in output for s in special_tokens])
+
+        # check special tokens is enabled
+        text = "This is test."
+        tokens = tokenizer(text, add_special_tokens=True)["input_ids"]
+        assert any([s in tokens for s in special_token_ids])
+
+        tokens = tokenizer.encode(text, add_special_tokens=True)
+        assert any([s in tokens for s in special_token_ids])
+
+        output = tokenizer.decode(tokens, skip_special_tokens=False)
+        assert any([s in output for s in special_tokens])
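
To exercise the new tests locally, an invocation along these lines (hypothetical working directory; assumes pytest is installed) should work:

    pytest src/c++/perf_analyzer/genai-perf/tests/test_tokenizer.py -k test_default_args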
