@@ -17,264 +17,4 @@
 from ..tokenizer_utils import warp_tokenizer
 
 LlamaTokenizer = warp_tokenizer(hf.LlamaTokenizer)
-
-
-# Legacy PretrainedTokenizer, will be deprecated in the future.
-import base64
-import os
-import unicodedata
-from typing import Collection, Dict, List, Set, Tuple, Union
-
-from transformers.tokenization_utils import PreTrainedTokenizer
-
-from ...utils.import_utils import is_tiktoken_available
-from ...utils.log import logger
-from ..legacy.tokenizer_utils_base import AddedToken
-
-VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
-
-PAT_STR = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
-BEGINOFTEXT = "<|begin_of_text|>"
-ENDOFTEXT = "<|end_of_text|>"
-IMSTART = "<|start_header_id|>"
-IMEND = "<|end_header_id|>"
-EOTID = "<|eot_id|>"
-# as the default behavior is changed to allow special tokens in
-# regular texts, the surface forms of special tokens need to be
-# as different as possible to minimize the impact
-EXTRAS = tuple((f"<|reserved_special_token_{i}|>" for i in range(251)))
-SPECIAL_TOKENS = (BEGINOFTEXT, ENDOFTEXT) + EXTRAS[0:4] + (IMSTART, IMEND, EXTRAS[4], EOTID) + EXTRAS[5:]
-
-tiktoken = None
-
-
-def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
-    with open(tiktoken_bpe_file, "rb") as f:
-        contents = f.read()
-    return {
-        base64.b64decode(token): int(rank) for token, rank in (line.split() for line in contents.splitlines() if line)
-    }
-
-
-class Llama3Tokenizer(PreTrainedTokenizer):
-    """QWen tokenizer."""
-
-    model_input_names = ["input_ids", "attention_mask", "position_ids"]
-    resource_files_names = {"vocab_file": "tokenizer.model"}
-
-    def __init__(
-        self,
-        vocab_file,
-        errors="replace",
-        padding_side="left",
-        add_bos_token=True,
-        add_eos_token=False,
-        **kwargs,
-    ):
-        if not is_tiktoken_available():
-            raise ValueError("tiktoken is not installed, please install it use: pip install tiktoken")
-
-        import tiktoken as tk
-
-        tiktoken = tk
-
-        self.errors = errors  # how to handle errors in decoding
-
-        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: dict[bytes, int]
-        self.special_tokens = {
-            token: index for index, token in enumerate(SPECIAL_TOKENS, start=len(self.mergeable_ranks))
-        }
-        enc = tiktoken.Encoding(
-            "Llama3",
-            pat_str=PAT_STR,
-            mergeable_ranks=self.mergeable_ranks,
-            special_tokens=self.special_tokens,
-        )
-        assert (
-            len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
-        ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
-
-        self.decoder = {v: k for k, v in self.mergeable_ranks.items()}  # type: dict[int, bytes|str]
-        self.decoder.update({v: k for k, v in self.special_tokens.items()})
-
-        self.tokenizer = enc  # type: tiktoken.Encoding
-
-        self.add_bos_token = add_bos_token
-        self.add_eos_token = add_eos_token
-
-        self.bod_id = self.special_tokens[BEGINOFTEXT]
-        self.eod_id = self.special_tokens[ENDOFTEXT]
-        self.start_header_id = self.special_tokens[IMSTART]
-        self.end_header_id = self.special_tokens[IMEND]
-        self.eot_id = self.special_tokens[EOTID]
-
-        if "pad_token_id" in kwargs:
-            self.pad_token_id = kwargs["pad_token_id"]
-        if "eos_token_id" in kwargs:
-            self.eos_token_id = kwargs["eos_token_id"]
-
-        self.bos_token = BEGINOFTEXT
-        self.eos_token = ENDOFTEXT
-        self.bos_token_id = self.bod_id
-        self.eos_token_id = self.eod_id
-        if "pad_token" not in kwargs:
-            self.pad_token = self.convert_ids_to_tokens(self.eos_token_id)
-            kwargs["pad_token"] = self.pad_token
-
-        super().__init__(**kwargs)
-
-    def __len__(self) -> int:
-        return self.tokenizer.n_vocab
-
-    def get_vocab(self) -> Dict[bytes, int]:
-        return {**self.mergeable_ranks, **self.special_tokens}
-
-    def convert_tokens_to_ids(self, tokens: Union[bytes, str, List[Union[bytes, str]]]) -> List[int]:
-        ids = []
-        if isinstance(tokens, (str, bytes)):
-            if tokens in self.special_tokens:
-                return self.special_tokens[tokens]
-            else:
-                return self.mergeable_ranks.get(tokens)
-        for token in tokens:
-            if token in self.special_tokens:
-                ids.append(self.special_tokens[token])
-            else:
-                ids.append(self.mergeable_ranks.get(token))
-        return ids
-
-    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
-        if isinstance(ids, int):
-            return self.decoder[ids]
-        tokens = []
-        for index in ids:
-            index = int(index)
-            if skip_special_tokens and index >= len(self.mergeable_ranks):
-                continue
-            if index in self.decoder:
-                tokens.append(self.decoder[index])
-        return tokens
-
-    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
-        if not special_tokens and new_tokens:
-            raise ValueError("Adding regular tokens is not supported")
-        for token in new_tokens:
-            surface_form = token.content if isinstance(token, AddedToken) else token
-            if surface_form not in SPECIAL_TOKENS:
-                logger.info(f"adding a special token '{surface_form}'.")
-                token_id = len(self.mergeable_ranks) + len(self.special_tokens)
-                self.special_tokens[surface_form] = token_id
-                self.decoder[token_id] = surface_form
-
-        import tiktoken as tk
-
-        tiktoken = tk
-        enc = tiktoken.Encoding(
-            "Llama3",
-            pat_str=PAT_STR,
-            mergeable_ranks=self.mergeable_ranks,
-            special_tokens=self.special_tokens,
-        )
-        assert (
-            len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
-        ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
-
-        self.tokenizer = enc  # type: tiktoken.Encoding
-
-        return 0
-
-    def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
-        """
-        Save only the vocabulary of the tokenizer (vocabulary).
-
-        Returns:
-            `Tuple(str)`: Paths to the files saved.
-        """
-        file_path = os.path.join(save_directory, "tokenizer.model")
-        with open(file_path, "w", encoding="utf8") as w:
-            for k, v in self.mergeable_ranks.items():
-                line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
-                w.write(line)
-        return (file_path,)
-
-    def tokenize(
-        self,
-        text: str,
-        allowed_special: Union[Set, str] = "all",
-        disallowed_special: Union[Collection, str] = (),
-        **kwargs,
-    ) -> List[Union[bytes, str]]:
-        """
-        Converts a string in a sequence of tokens.
-
-        Args:
-            text (`str`):
-                The sequence to be encoded.
-            allowed_special (`Literal["all"]` or `set`):
-                The surface forms of the tokens to be encoded as special tokens in regular texts.
-                Default to "all".
-            disallowed_special (`Literal["all"]` or `Collection`):
-                The surface forms of the tokens that should not be in regular texts and trigger errors.
-                Default to an empty tuple.
-
-            kwargs (additional keyword arguments, *optional*):
-                Will be passed to the underlying model specific encode method.
-
-        Returns:
-            `List[bytes|str]`: The list of tokens.
-        """
-        tokens = []
-        text = unicodedata.normalize("NFC", text)
-
-        # this implementation takes a detour: text -> token id -> token surface forms
-        for t in self.tokenizer.encode(text, allowed_special=allowed_special, disallowed_special=disallowed_special):
-            tokens.append(self.decoder[t])
-        return tokens
-
-    def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
-        """
-        Converts a sequence of tokens in a single string.
-        """
-        text = ""
-        temp = b""
-        for t in tokens:
-            if isinstance(t, str):
-                if temp:
-                    text += temp.decode("utf-8", errors=self.errors)
-                    temp = b""
-                text += t
-            elif isinstance(t, bytes):
-                temp += t
-            else:
-                raise TypeError("token should only be of type types or str")
-        if temp:
-            text += temp.decode("utf-8", errors=self.errors)
-        return text
-
-    @property
-    def vocab_size(self):
-        return self.tokenizer.n_vocab
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        bos_token_id = [self.bod_id] if self.add_bos_token else []
-        eos_token_id = [self.eod_id] if self.add_eos_token else []
-
-        output = bos_token_id + token_ids_0 + eos_token_id
-
-        if token_ids_1 is not None:
-            output = output + bos_token_id + token_ids_1 + eos_token_id
-
-        return output
-
-    def _decode(
-        self,
-        token_ids: Union[int, List[int]],
-        skip_special_tokens: bool = False,
-        errors: str = None,
-        **kwargs,
-    ) -> str:
-        if isinstance(token_ids, int):
-            token_ids = [token_ids]
-        if skip_special_tokens:
-            token_ids = [i for i in token_ids if i <= len(self.mergeable_ranks)]
-        return self.tokenizer.decode(token_ids, errors=errors or self.errors)
+Llama3Tokenizer = warp_tokenizer(hf.PreTrainedTokenizerFast)
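
The replacement swaps the hand-rolled tiktoken wrapper for Hugging Face's Rust-backed fast tokenizer. A minimal usage sketch of the new binding, assuming `warp_tokenizer` preserves the standard `from_pretrained`/`__call__` interface of the class it wraps (the checkpoint name below is illustrative, not taken from this diff):

# Hypothetical usage; assumes this module exports Llama3Tokenizer as bound above.
tokenizer = Llama3Tokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")

enc = tokenizer("Hello, world!")  # dict with input_ids and attention_mask
text = tokenizer.decode(enc["input_ids"], skip_special_tokens=True)  # recovers the input text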
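
For reference, the deleted code depended on tiktoken's plaintext vocabulary format: `tokenizer.model` stores one `base64(token_bytes) rank` pair per line, which `_load_tiktoken_bpe` parsed and `save_vocabulary` wrote back out. A self-contained round-trip sketch of that format (the helper names here are illustrative, not part of the repo):

import base64
from typing import Dict

def dump_ranks(ranks: Dict[bytes, int]) -> bytes:
    # Mirrors the deleted save_vocabulary: one "base64(token) rank" pair per line.
    return b"".join(base64.b64encode(tok) + b" " + str(rank).encode("utf8") + b"\n" for tok, rank in ranks.items())

def load_ranks(contents: bytes) -> Dict[bytes, int]:
    # Mirrors the deleted _load_tiktoken_bpe.
    return {
        base64.b64decode(token): int(rank)
        for token, rank in (line.split() for line in contents.splitlines() if line)
    }

ranks = {b"Hello": 0, b" world": 1}
assert load_ranks(dump_ranks(ranks)) == ranks  # the format round-trips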