
Commit 2027a14

Lunwen He authored and facebook-github-bot committed
Fix eval (#5955)
Summary:
Pull Request resolved: #5955

This PR fixes a bunch of issues in the eval pipeline:
- Use the right token for `eot_token_id`.
- Do not add `bos` and `eos` during `tok_encode`, based on this [discussion](https://fburl.com/code/uifmt746).
- Update `executorch/examples/models/llama2/tokenizer/tiktoken.py` to be synced with llama 3.1's official [code](https://github.com/meta-llama/llama-models/blob/main/models/llama3/api/tokenizer.py); this majorly updates the set of special tokens.

Reviewed By: mergennachin

Differential Revision: D62198560

fbshipit-source-id: 856a625ef0d607ab0e910eefe455a6dff1240584
1 parent 80afaf2 commit 2027a14

File tree

examples/models/llama2/evaluate/eager_eval.py
examples/models/llama2/tokenizer/tiktoken.py

2 files changed: +51 −79 lines changed

examples/models/llama2/evaluate/eager_eval.py

Lines changed: 3 additions & 10 deletions

@@ -44,7 +44,7 @@ def __init__(

    @property
    def eot_token_id(self):
-        return self._tokenizer.eos_id
+        return self._tokenizer.eot_id

    @property
    def max_length(self):

@@ -63,17 +63,10 @@ def device(self):
        return self._device

    def tok_encode(self, string: str, **kwargs):  # pyre-ignore
-        tokens = self._tokenizer.encode(string, bos=True, eos=False)
-        encoded = torch.tensor(tokens, dtype=torch.int, device=self.device)
-        # encoded is a pytorch tensor, but some internal logic in the
-        # eval harness expects it to be a list instead
-        # TODO: verify this for multi-batch as well
-        encoded = encoded.tolist()
-        return encoded
+        return self._tokenizer.encode(string, bos=False, eos=False)

    def tok_decode(self, tokens):
-        decoded = self._tokenizer.decode(tokens)
-        return decoded
+        return self._tokenizer.decode(tokens)

    def _model_call(self, inps):
        if self._use_kv_cache:
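To make the behavioral change concrete, here is a short, hypothetical sketch of how an eval-harness wrapper uses these two hooks after the fix. It assumes the tiktoken-based `Tokenizer` from the second file below and a valid `tokenizer.model` path; the prompt and variable names are illustrative, not part of the diff.

# Illustrative only: shows the post-fix contract of the two changed hooks.
tok = Tokenizer("tokenizer.model")  # the tiktoken-based Tokenizer below

# tok_encode no longer adds <|begin_of_text|>; the eval harness manages
# special tokens itself, and encode already returns a plain list, so the
# old torch.Tensor round-trip is unnecessary.
ids = tok.encode("The capital of France is", bos=False, eos=False)
assert isinstance(ids, list)

# eot_token_id now maps to <|eot_id|> (end of turn) rather than
# <|end_of_text|>, so instruct-model generations stop at the turn boundary.
stop_token = tok.eot_id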

examples/models/llama2/tokenizer/tiktoken.py

Lines changed: 48 additions & 69 deletions

@@ -15,32 +15,34 @@
    Iterator,
    List,
    Literal,
+    Optional,
    Sequence,
-    TypedDict,
    Union,
)

import tiktoken
-from tiktoken.load import load_tiktoken_bpe

+from tiktoken.load import load_tiktoken_bpe

logger = getLogger(__name__)


-Role = Literal["system", "user", "assistant"]
-
+# The tiktoken tokenizer can handle <=400k chars without
+# pyo3_runtime.PanicException.
+TIKTOKEN_MAX_ENCODE_CHARS = 400_000

-class Message(TypedDict):
-    role: Role
-    content: str
+# https://github.com/openai/tiktoken/issues/195
+# Here we iterate over subsequences and split if we exceed the limit
+# of max consecutive non-whitespace or whitespace characters.
+MAX_NO_WHITESPACES_CHARS = 25_000


-Dialog = Sequence[Message]
+_INSTANCE = None


class Tokenizer:
    """
-    tokenizing and encoding/decoding text using the Tiktoken tokenizer.
+    Tokenizing and encoding/decoding text using the Tiktoken tokenizer.
    """

    special_tokens: Dict[str, int]

@@ -49,14 +51,23 @@ class Tokenizer:

    pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"  # noqa: E501

+    @classmethod
+    def get_instance(cls):
+        global _INSTANCE
+
+        if _INSTANCE is None:
+            _INSTANCE = Tokenizer(
+                os.path.join(os.path.dirname(__file__), "tokenizer.model")
+            )
+        return _INSTANCE
+
    def __init__(self, model_path: str):
        """
        Initializes the Tokenizer with a Tiktoken model.

        Args:
            model_path (str): The path to the Tiktoken model file.
        """
-        # reload tokenizer
        assert os.path.isfile(model_path), model_path

        mergeable_ranks = load_tiktoken_bpe(model_path)

@@ -66,16 +77,21 @@ def __init__(self, model_path: str):
            "<|end_of_text|>",
            "<|reserved_special_token_0|>",
            "<|reserved_special_token_1|>",
-            "<|reserved_special_token_2|>",
-            "<|reserved_special_token_3|>",
+            "<|finetune_right_pad_id|>",
+            "<|step_id|>",
            "<|start_header_id|>",
            "<|end_header_id|>",
-            "<|reserved_special_token_4|>",
+            "<|eom_id|>",  # end of message
            "<|eot_id|>",  # end of turn
-        ] + [
-            f"<|reserved_special_token_{i}|>"
-            for i in range(5, self.num_reserved_special_tokens - 5)
+            "<|python_tag|>",
+            "<|image|>",
+        ]
+        reserved_tokens = [
+            f"<|reserved_special_token_{2 + i}|>"
+            for i in range(self.num_reserved_special_tokens - len(special_tokens))
        ]
+        special_tokens = special_tokens + reserved_tokens
+
        self.special_tokens = {
            token: num_base_tokens + i for i, token in enumerate(special_tokens)
        }
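A quick sanity check on the reserved-token arithmetic in this hunk. This is a sketch under two assumptions not visible in the diff: that `num_reserved_special_tokens` is 256 (the value in Llama 3.1's official tokenizer), and that the list opens with "<|begin_of_text|>" in the unchanged context just above the hunk, giving 12 named entries in total.

# Hypothetical sanity check of the reserved-token numbering above.
num_reserved_special_tokens = 256  # assumption; not shown in this hunk
named_special_tokens = 12  # assumption; includes "<|begin_of_text|>" from context

reserved_tokens = [
    f"<|reserved_special_token_{2 + i}|>"
    for i in range(num_reserved_special_tokens - named_special_tokens)
]
assert len(reserved_tokens) == 244
assert reserved_tokens[0] == "<|reserved_special_token_2|>"
assert reserved_tokens[-1] == "<|reserved_special_token_245|>"
# 12 named + 244 reserved = 256 special tokens in total, so the numbering
# resumes at 2 (0 and 1 are already named) without colliding or leaving gaps.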
@@ -85,28 +101,28 @@ def __init__(self, model_path: str):
            mergeable_ranks=mergeable_ranks,
            special_tokens=self.special_tokens,
        )
-        logger.info(f"Reloaded SentencePiece model from {model_path}")

+        self.n_words: int = num_base_tokens + len(special_tokens)
        # BOS / EOS token IDs
-        self.n_words: int = self.model.n_vocab
        self.bos_id: int = self.special_tokens["<|begin_of_text|>"]
        self.eos_id: int = self.special_tokens["<|end_of_text|>"]
-        self.pad_id: int = -1
-        self.stop_tokens = {
-            self.special_tokens["<|end_of_text|>"],
+        self.eot_id: int = self.special_tokens["<|eot_id|>"]
+        self.eom_id: int = self.special_tokens["<|eom_id|>"]
+        self.python_tag_id = self.special_tokens["<|python_tag|>"]
+        self.pad_id: int = self.special_tokens["<|finetune_right_pad_id|>"]
+        self.stop_tokens = [
+            self.eos_id,
+            self.special_tokens["<|eom_id|>"],
            self.special_tokens["<|eot_id|>"],
-        }
-        logger.info(
-            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
-        )
+        ]

    def encode(
        self,
        s: str,
        *,
        bos: bool,
        eos: bool,
-        allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),  # noqa B006
+        allowed_special: Optional[Union[Literal["all"], AbstractSet[str]]] = None,
        disallowed_special: Union[Literal["all"], Collection[str]] = (),
    ) -> List[int]:
        """

@@ -125,22 +141,15 @@ def encode(
        By default, setting disallowed_special=() encodes a string by ignoring
        special tokens. Specifically:
        - Setting `disallowed_special` to () will cause all text corresponding
-          to special tokens to be encoded as natural text (instead of raising
+          to special tokens to be encoded as natural text (insteading of raising
          an error).
        - Setting `allowed_special` to "all" will treat all text corresponding
          to special tokens to be encoded as special tokens.
        """
+        if allowed_special is None:
+            allowed_special = set()
        assert type(s) is str

-        # The tiktoken tokenizer can handle <=400k chars without
-        # pyo3_runtime.PanicException (may go beyond 400k)
-        TIKTOKEN_MAX_ENCODE_CHARS = 400_000
-
-        # https://github.com/openai/tiktoken/issues/195
-        # Here we iterate over subsequences and split if we exceed the limit
-        # of max consecutive non-whitespace or whitespace characters.
-        MAX_NO_WHITESPACES_CHARS = 25_000
-
        substrs = (
            substr
            for i in range(0, len(s), TIKTOKEN_MAX_ENCODE_CHARS)
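For context on the `allowed_special` signature change above: it swaps a mutable default argument (which the old `# noqa B006` suppressed a flake8-bugbear warning for) for the usual None sentinel. A minimal sketch of that pattern, with hypothetical names, independent of this file:

from typing import AbstractSet, Optional

def encode(s: str, allowed_special: Optional[AbstractSet[str]] = None) -> list:
    # A def-time default like `set()` is created once and shared across all
    # calls (what bugbear's B006 warns about), which is risky if it is ever
    # mutated. The None sentinel builds a fresh set on each call instead.
    if allowed_special is None:
        allowed_special = set()
    return sorted(allowed_special)  # stand-in body for illustration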
@@ -173,16 +182,16 @@ def decode(self, t: Sequence[int]) -> str:
        Returns:
            str: The decoded string.
        """
-        # typecast is safe here, Tiktoken doesn't do anything list-related with the sequence.
+        # Typecast is safe here. Tiktoken doesn't do anything list-related with the sequence.
        return self.model.decode(cast(List[int], t))

    @staticmethod
    def _split_whitespaces_or_nonwhitespaces(
        s: str, max_consecutive_slice_len: int
    ) -> Iterator[str]:
        """
-        Split the string `s` so that each substring contains no more than `max_consecutive_slice_len`
-        consecutive whitespaces or consecutive non-whitespaces
+        Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
+        consecutive whitespaces or consecutive non-whitespaces.
        """
        current_slice_len = 0
        current_slice_is_space = s[0].isspace() if len(s) > 0 else False

@@ -201,33 +210,3 @@ def _split_whitespaces_or_nonwhitespaces(
                slice_start = i
                current_slice_len = 1
        yield s[slice_start:]
-
-
-class ChatFormat:
-    def __init__(self, tokenizer: Tokenizer):
-        self.tokenizer = tokenizer
-
-    def encode_header(self, message: Message) -> List[int]:
-        tokens = []
-        tokens.append(self.tokenizer.special_tokens["<|start_header_id|>"])
-        tokens.extend(self.tokenizer.encode(message["role"], bos=False, eos=False))
-        tokens.append(self.tokenizer.special_tokens["<|end_header_id|>"])
-        tokens.extend(self.tokenizer.encode("\n\n", bos=False, eos=False))
-        return tokens
-
-    def encode_message(self, message: Message) -> List[int]:
-        tokens = self.encode_header(message)
-        tokens.extend(
-            self.tokenizer.encode(message["content"].strip(), bos=False, eos=False)
-        )
-        tokens.append(self.tokenizer.special_tokens["<|eot_id|>"])
-        return tokens
-
-    def encode_dialog_prompt(self, dialog: Dialog) -> List[int]:
-        tokens = []
-        tokens.append(self.tokenizer.special_tokens["<|begin_of_text|>"])
-        for message in dialog:
-            tokens.extend(self.encode_message(message))
-        # Add the start of an assistant message for the model to complete
-        tokens.extend(self.encode_header({"role": "assistant", "content": ""}))
-        return tokens
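Putting the pieces together, a minimal usage sketch of the updated tokenizer. It assumes the module is importable under the path named in the commit summary and that a valid `tokenizer.model` sits next to the module, as `get_instance` expects; the input string is illustrative.

# Sketch only; import path and tokenizer.model location are assumptions.
from executorch.examples.models.llama2.tokenizer.tiktoken import Tokenizer

tok = Tokenizer.get_instance()  # new process-wide singleton

# Callers now control special tokens explicitly via the bos/eos flags.
ids = tok.encode("Hello, world!", bos=True, eos=False)
assert ids[0] == tok.bos_id

# stop_tokens is now a list covering end-of-text, end-of-message, and
# end-of-turn, which is what a generation loop should check against.
for stop in tok.stop_tokens:
    assert stop in tok.special_tokens.values()

assert tok.decode(ids[1:]) == "Hello, world!"  # strip the BOS id before decoding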
