14 | 14 |
15 | 15 | import contextlib |
16 | 16 | import io |
17 | | -from typing import Union |
| 17 | +from typing import List |
18 | 18 |
19 | 19 | from genai_perf.exceptions import GenAIPerfException |
20 | 20 |
21 | 21 | # Silence tokenizer warning on import |
22 | 22 | with contextlib.redirect_stdout(io.StringIO()) as stdout, contextlib.redirect_stderr( |
23 | 23 | io.StringIO() |
24 | 24 | ) as stderr: |
25 | | - from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast |
| 25 | + from transformers import AutoTokenizer, BatchEncoding |
26 | 26 | from transformers import logging as token_logger |
27 | 27 |
28 | 28 | token_logger.set_verbosity_error() |
29 | 29 |
30 | | -Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast] |
31 | 30 | DEFAULT_TOKENIZER = "hf-internal-testing/llama-tokenizer" |
32 | 31 |
33 | 32 |
34 | | -def get_tokenizer( |
35 | | - tokenizer_model: str, |
36 | | -) -> Tokenizer: |
| 33 | +class Tokenizer: |
37 | 34 | """ |
38 | | - Download the tokenizer from Huggingface.co |
| 35 | +    A small wrapper class around a HuggingFace tokenizer. |
39 | 36 | """ |
40 | | - try: |
41 | | - # Silence tokenizer warning on first use |
42 | | - with contextlib.redirect_stdout( |
43 | | - io.StringIO() |
44 | | - ) as stdout, contextlib.redirect_stderr(io.StringIO()) as stderr: |
45 | | - tokenizer = AutoTokenizer.from_pretrained(tokenizer_model) |
46 | | - except Exception as e: |
47 | | - raise GenAIPerfException(e) |
48 | | - |
49 | | - # Disable add_bos_token so that llama tokenizer does not add bos token |
50 | | - # (aka. beginning-of-sentence) to the beginning of every response |
51 | | - # outputs, increasing the token count by 1 for each output response. |
52 | | - # Note: The type is being ignored here, because not all tokenizers have |
53 | | - # an add_bos_token variable. |
54 | | - tokenizer.add_bos_token = False # type: ignore |
55 | | - |
56 | | - return tokenizer |
| 37 | + |
| 38 | + def __init__(self, name: str) -> None: |
| 39 | + """ |
| 40 | +        Initialize by downloading the tokenizer from huggingface.co. |
| 41 | + """ |
| 42 | + try: |
| 43 | + # Silence tokenizer warning on first use |
| 44 | + with contextlib.redirect_stdout( |
| 45 | + io.StringIO() |
| 46 | + ) as stdout, contextlib.redirect_stderr(io.StringIO()) as stderr: |
| 47 | + tokenizer = AutoTokenizer.from_pretrained(name) |
| 48 | + except Exception as e: |
| 49 | + raise GenAIPerfException(e) |
| 50 | + |
| 51 | + self._tokenizer = tokenizer |
| 52 | + |
| 53 | +        # Defaults for __call__/encode/decode: disable special tokens (e.g. llama's BOS) to keep token counts accurate |
| 54 | + self._call_args = {"add_special_tokens": False} |
| 55 | + self._encode_args = {"add_special_tokens": False} |
| 56 | + self._decode_args = {"skip_special_tokens": True} |
| 57 | + |
| 58 | + def __call__(self, text, **kwargs) -> BatchEncoding: |
| 59 | +        args = {**self._call_args, **kwargs}  # merge per call; do not mutate defaults |
| 60 | +        return self._tokenizer(text, **args) |
| 61 | + |
| 62 | + def encode(self, text, **kwargs) -> List[int]: |
| 63 | +        args = {**self._encode_args, **kwargs} |
| 64 | +        return self._tokenizer.encode(text, **args) |
| 65 | + |
| 66 | + def decode(self, token_ids, **kwargs) -> str: |
| 67 | +        args = {**self._decode_args, **kwargs} |
| 68 | +        return self._tokenizer.decode(token_ids, **args) |
| 69 | + |
| 70 | + def __repr__(self) -> str: |
| 71 | +        return repr(self._tokenizer) |
| 72 | + |
| 73 | + |
| 74 | +def get_tokenizer(tokenizer_model: str) -> Tokenizer: |
| 75 | + """ |
| 76 | +    Return a tokenizer for the given model name. |
| 77 | + """ |
| 78 | + return Tokenizer(tokenizer_model) |
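For reference, here is a minimal usage sketch of the new wrapper. It is illustrative only: the import path genai_perf.tokenizer is an assumption about where this file lives, and it assumes the default tokenizer can be fetched from huggingface.co.

    from genai_perf.tokenizer import DEFAULT_TOKENIZER, get_tokenizer

    # Downloads "hf-internal-testing/llama-tokenizer" on first use
    tokenizer = get_tokenizer(DEFAULT_TOKENIZER)

    # add_special_tokens defaults to False, so no BOS token is prepended
    token_ids = tokenizer.encode("The quick brown fox")

    # skip_special_tokens defaults to True, so plain text round-trips
    text = tokenizer.decode(token_ids)

    # Per-call overrides are still possible; for the llama tokenizer this
    # should yield one extra id (the BOS token) at the front
    with_bos = tokenizer.encode("The quick brown fox", add_special_tokens=True)

Centralizing the no-special-tokens defaults in the wrapper keeps token counts consistent across the codebase, while per-call kwargs are merged with the stored defaults so a one-off override does not stick.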