
Commit f23d540

[feature] testcase-without-paddle (#2494)
1 parent 83abe5f commit f23d540

File tree: 7 files changed, +96 -272 lines


paddleformers/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -19,13 +19,14 @@
 from typing import TYPE_CHECKING
 
 from .utils.lazy_import import _LazyModule
-from .utils.paddle_patch import *
 
 PADDLEFORMERS_STABLE_VERSION = "PADDLEFORMERS_STABLE_VERSION"
 
 with suppress(Exception):
     import paddle
 
+    from .utils.paddle_patch import *
+
     paddle.disable_signal_handler()
 
 # this version is used for develop and test.
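Note: the paddle_patch import now happens inside the existing with suppress(Exception) block, so import paddleformers succeeds even when Paddle is not installed. Below is a minimal sketch of that guard pattern; the HAS_PADDLE flag is illustrative and not part of the package.

    # Sketch of the optional-dependency guard used above (illustrative only).
    from contextlib import suppress

    HAS_PADDLE = False
    with suppress(Exception):
        import paddle  # may be missing in a transformers-only environment

        HAS_PADDLE = True  # reached only if the import (and any patching) succeeded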

paddleformers/transformers/ernie4_5vl/tokenizer.py

Lines changed: 1 addition & 0 deletions
@@ -94,6 +94,7 @@ def __init__(
             **kwargs: Additional keyword arguments
         """
         # Handle possible parameter renaming
+
         if vocab_file is None:
             for key in ["tokenizer_file", "model_file", "spm_file"]:
                 if key in kwargs:

paddleformers/transformers/llama/tokenizer.py

Lines changed: 1 addition & 261 deletions
@@ -17,264 +17,4 @@
 from ..tokenizer_utils import warp_tokenizer
 
 LlamaTokenizer = warp_tokenizer(hf.LlamaTokenizer)
-
-
-# Legacy PretrainedTokenizer, will be deprecated in the future.
-import base64
-import os
-import unicodedata
-from typing import Collection, Dict, List, Set, Tuple, Union
-
-from transformers.tokenization_utils import PreTrainedTokenizer
-
-from ...utils.import_utils import is_tiktoken_available
-from ...utils.log import logger
-from ..legacy.tokenizer_utils_base import AddedToken
-
-VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
-
-PAT_STR = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
-BEGINOFTEXT = "<|begin_of_text|>"
-ENDOFTEXT = "<|end_of_text|>"
-IMSTART = "<|start_header_id|>"
-IMEND = "<|end_header_id|>"
-EOTID = "<|eot_id|>"
-# as the default behavior is changed to allow special tokens in
-# regular texts, the surface forms of special tokens need to be
-# as different as possible to minimize the impact
-EXTRAS = tuple((f"<|reserved_special_token_{i}|>" for i in range(251)))
-SPECIAL_TOKENS = (BEGINOFTEXT, ENDOFTEXT) + EXTRAS[0:4] + (IMSTART, IMEND, EXTRAS[4], EOTID) + EXTRAS[5:]
-
-tiktoken = None
-
-
-def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
-    with open(tiktoken_bpe_file, "rb") as f:
-        contents = f.read()
-    return {
-        base64.b64decode(token): int(rank) for token, rank in (line.split() for line in contents.splitlines() if line)
-    }
-
-
-class Llama3Tokenizer(PreTrainedTokenizer):
-    """QWen tokenizer."""
-
-    model_input_names = ["input_ids", "attention_mask", "position_ids"]
-    resource_files_names = {"vocab_file": "tokenizer.model"}
-
-    def __init__(
-        self,
-        vocab_file,
-        errors="replace",
-        padding_side="left",
-        add_bos_token=True,
-        add_eos_token=False,
-        **kwargs,
-    ):
-        if not is_tiktoken_available():
-            raise ValueError("tiktoken is not installed, please install it use: pip install tiktoken")
-
-        import tiktoken as tk
-
-        tiktoken = tk
-
-        self.errors = errors  # how to handle errors in decoding
-
-        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: dict[bytes, int]
-        self.special_tokens = {
-            token: index for index, token in enumerate(SPECIAL_TOKENS, start=len(self.mergeable_ranks))
-        }
-        enc = tiktoken.Encoding(
-            "Llama3",
-            pat_str=PAT_STR,
-            mergeable_ranks=self.mergeable_ranks,
-            special_tokens=self.special_tokens,
-        )
-        assert (
-            len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
-        ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
-
-        self.decoder = {v: k for k, v in self.mergeable_ranks.items()}  # type: dict[int, bytes|str]
-        self.decoder.update({v: k for k, v in self.special_tokens.items()})
-
-        self.tokenizer = enc  # type: tiktoken.Encoding
-
-        self.add_bos_token = add_bos_token
-        self.add_eos_token = add_eos_token
-
-        self.bod_id = self.special_tokens[BEGINOFTEXT]
-        self.eod_id = self.special_tokens[ENDOFTEXT]
-        self.start_header_id = self.special_tokens[IMSTART]
-        self.end_header_id = self.special_tokens[IMEND]
-        self.eot_id = self.special_tokens[EOTID]
-
-        if "pad_token_id" in kwargs:
-            self.pad_token_id = kwargs["pad_token_id"]
-        if "eos_token_id" in kwargs:
-            self.eos_token_id = kwargs["eos_token_id"]
-
-        self.bos_token = BEGINOFTEXT
-        self.eos_token = ENDOFTEXT
-        self.bos_token_id = self.bod_id
-        self.eos_token_id = self.eod_id
-        if "pad_token" not in kwargs:
-            self.pad_token = self.convert_ids_to_tokens(self.eos_token_id)
-            kwargs["pad_token"] = self.pad_token
-
-        super().__init__(**kwargs)
-
-    def __len__(self) -> int:
-        return self.tokenizer.n_vocab
-
-    def get_vocab(self) -> Dict[bytes, int]:
-        return {**self.mergeable_ranks, **self.special_tokens}
-
-    def convert_tokens_to_ids(self, tokens: Union[bytes, str, List[Union[bytes, str]]]) -> List[int]:
-        ids = []
-        if isinstance(tokens, (str, bytes)):
-            if tokens in self.special_tokens:
-                return self.special_tokens[tokens]
-            else:
-                return self.mergeable_ranks.get(tokens)
-        for token in tokens:
-            if token in self.special_tokens:
-                ids.append(self.special_tokens[token])
-            else:
-                ids.append(self.mergeable_ranks.get(token))
-        return ids
-
-    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
-        if isinstance(ids, int):
-            return self.decoder[ids]
-        tokens = []
-        for index in ids:
-            index = int(index)
-            if skip_special_tokens and index >= len(self.mergeable_ranks):
-                continue
-            if index in self.decoder:
-                tokens.append(self.decoder[index])
-        return tokens
-
-    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
-        if not special_tokens and new_tokens:
-            raise ValueError("Adding regular tokens is not supported")
-        for token in new_tokens:
-            surface_form = token.content if isinstance(token, AddedToken) else token
-            if surface_form not in SPECIAL_TOKENS:
-                logger.info(f"adding a special token '{surface_form}'.")
-                token_id = len(self.mergeable_ranks) + len(self.special_tokens)
-                self.special_tokens[surface_form] = token_id
-                self.decoder[token_id] = surface_form
-
-        import tiktoken as tk
-
-        tiktoken = tk
-        enc = tiktoken.Encoding(
-            "Llama3",
-            pat_str=PAT_STR,
-            mergeable_ranks=self.mergeable_ranks,
-            special_tokens=self.special_tokens,
-        )
-        assert (
-            len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
-        ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
-
-        self.tokenizer = enc  # type: tiktoken.Encoding
-
-        return 0
-
-    def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
-        """
-        Save only the vocabulary of the tokenizer (vocabulary).
-
-        Returns:
-            `Tuple(str)`: Paths to the files saved.
-        """
-        file_path = os.path.join(save_directory, "tokenizer.model")
-        with open(file_path, "w", encoding="utf8") as w:
-            for k, v in self.mergeable_ranks.items():
-                line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
-                w.write(line)
-        return (file_path,)
-
-    def tokenize(
-        self,
-        text: str,
-        allowed_special: Union[Set, str] = "all",
-        disallowed_special: Union[Collection, str] = (),
-        **kwargs,
-    ) -> List[Union[bytes, str]]:
-        """
-        Converts a string in a sequence of tokens.
-
-        Args:
-            text (`str`):
-                The sequence to be encoded.
-            allowed_special (`Literal["all"]` or `set`):
-                The surface forms of the tokens to be encoded as special tokens in regular texts.
-                Default to "all".
-            disallowed_special (`Literal["all"]` or `Collection`):
-                The surface forms of the tokens that should not be in regular texts and trigger errors.
-                Default to an empty tuple.
-
-            kwargs (additional keyword arguments, *optional*):
-                Will be passed to the underlying model specific encode method.
-
-        Returns:
-            `List[bytes|str]`: The list of tokens.
-        """
-        tokens = []
-        text = unicodedata.normalize("NFC", text)
-
-        # this implementation takes a detour: text -> token id -> token surface forms
-        for t in self.tokenizer.encode(text, allowed_special=allowed_special, disallowed_special=disallowed_special):
-            tokens.append(self.decoder[t])
-        return tokens
-
-    def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
-        """
-        Converts a sequence of tokens in a single string.
-        """
-        text = ""
-        temp = b""
-        for t in tokens:
-            if isinstance(t, str):
-                if temp:
-                    text += temp.decode("utf-8", errors=self.errors)
-                    temp = b""
-                text += t
-            elif isinstance(t, bytes):
-                temp += t
-            else:
-                raise TypeError("token should only be of type types or str")
-        if temp:
-            text += temp.decode("utf-8", errors=self.errors)
-        return text
-
-    @property
-    def vocab_size(self):
-        return self.tokenizer.n_vocab
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        bos_token_id = [self.bod_id] if self.add_bos_token else []
-        eos_token_id = [self.eod_id] if self.add_eos_token else []
-
-        output = bos_token_id + token_ids_0 + eos_token_id
-
-        if token_ids_1 is not None:
-            output = output + bos_token_id + token_ids_1 + eos_token_id
-
-        return output
-
-    def _decode(
-        self,
-        token_ids: Union[int, List[int]],
-        skip_special_tokens: bool = False,
-        errors: str = None,
-        **kwargs,
-    ) -> str:
-        if isinstance(token_ids, int):
-            token_ids = [token_ids]
-        if skip_special_tokens:
-            token_ids = [i for i in token_ids if i <= len(self.mergeable_ranks)]
-        return self.tokenizer.decode(token_ids, errors=errors or self.errors)
+Llama3Tokenizer = warp_tokenizer(hf.PreTrainedTokenizerFast)
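With the legacy tiktoken-based class removed, Llama3Tokenizer now resolves to a wrapped Hugging Face fast tokenizer. The sketch below is a hedged approximation of what warp_tokenizer (see tokenizer_utils.py further down) produces; the PaddleTokenizerMixin stub is for illustration only, the real mixin lives in paddleformers.

    # Rough equivalent of `Llama3Tokenizer = warp_tokenizer(hf.PreTrainedTokenizerFast)`; sketch only.
    import transformers as hf


    class PaddleTokenizerMixin:  # stand-in for paddleformers' real mixin
        pass


    Llama3Tokenizer = type("Llama3Tokenizer", (PaddleTokenizerMixin, hf.PreTrainedTokenizerFast), {})
    assert issubclass(Llama3Tokenizer, hf.PreTrainedTokenizerFast)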

paddleformers/transformers/tokenizer_utils.py

Lines changed: 20 additions & 8 deletions
@@ -31,14 +31,24 @@
 )
 from transformers.utils.generic import ExplicitEnum
 
+from ..utils import is_paddle_available
 from ..utils.download import DownloadSource, resolve_file_path
 from ..utils.log import logger
-from .legacy.tokenizer_utils import PretrainedTokenizer
 
-# legacy PretrainedTokenizer, which is different from huggingface PreTrainedTokenizer
+if is_paddle_available():
+    from .legacy.tokenizer_utils import PretrainedTokenizer
+else:
 
+    class _MissingPaddleTokenizer:
+        def __init__(self, *args, **kwargs):
+            raise ImportError(
+                "PretrainedTokenizer requires Paddle, but Paddle is not available. "
+                "Please install Paddle to use this feature."
+            )
 
-PreTrainedTokenizer = PretrainedTokenizer
+    PretrainedTokenizer = _MissingPaddleTokenizer
+
+# legacy PretrainedTokenizer, which is different from huggingface PreTrainedTokenizer
 
 
 class TensorType(ExplicitEnum):
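The guard above keys off is_paddle_available() from paddleformers.utils. The sketch below shows a typical way such a check is written; it is an assumption for illustration, not necessarily the implementation used in the package.

    # Hedged sketch of an availability check in the spirit of is_paddle_available().
    import importlib.util


    def is_paddle_available() -> bool:
        # find_spec does not import paddle, so the probe stays cheap and side-effect free
        return importlib.util.find_spec("paddle") is not None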
@@ -290,13 +300,15 @@ def _encode_chat_inputs_openai_format(
 
     def _extract_non_learnable_parts(self, origin_msg: List[Dict[str, str]], split_s: List[str]):
         """Split the entire chat by specified words. Extract the non-learnable parts."""
+        # TODO:We will upgrade this feature later
         # distinguish and replace the special words in original string to an uncompiled form: Like | -> \|
         regex_pattern = "|".join(map(re.escape, split_s))
         # splited by replaced specified words
        non_learnable_parts = re.split(
             r"(?:%s)" % regex_pattern,
             self.apply_chat_template(conversation=origin_msg, add_generation_prompt=False, tokenize=False),
         )
+
         if non_learnable_parts[-1] == "":
             non_learnable_parts.pop()
         return non_learnable_parts
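For context, _extract_non_learnable_parts splits the fully rendered chat template on the assistant answers, which is why the trailing empty piece is popped. A tiny standalone illustration with made-up strings:

    import re

    rendered = "<user>hi<assistant>hello!<user>bye<assistant>see you!"
    answers = ["hello!", "see you!"]
    pattern = "|".join(map(re.escape, answers))      # escape so regex metacharacters stay literal
    parts = re.split(r"(?:%s)" % pattern, rendered)
    print(parts)  # ['<user>hi<assistant>', '<user>bye<assistant>', ''] and the trailing '' is popped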
@@ -340,12 +352,7 @@ def _encode_chat_inputs(
             )
             ans_roundi = roundi_str[len(roundi_no_ans_str) :]
             ans.append(ans_roundi)
-
         non_learnable_parts = self._extract_non_learnable_parts(origin_msg, ans)
-        # assert len(non_learnable_parts) == len(
-        #     ans
-        # ), f"Get non_learnable_parts len: {len(non_learnable_parts)}, but ans len: {len(ans)}."
-
         conversation_ids = []
         for i in range(len(non_learnable_parts)):
             conversation_ids.append(
@@ -385,3 +392,8 @@ def encode_chat_inputs(
 
 def warp_tokenizer(hf_tokenizer_class: PreTrainedTokenizer):
     return type(hf_tokenizer_class.__name__, (PaddleTokenizerMixin, hf_tokenizer_class), {})
+
+
+class PreTrainedTokenizer(PaddleTokenizerMixin, PretrainedTokenizer):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
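warp_tokenizer relies on the three-argument form of type() to mint a subclass that puts PaddleTokenizerMixin ahead of the wrapped Hugging Face class in the MRO. A toy demonstration of that mechanism (class names here are illustrative):

    # Toy demonstration of the type(name, bases, namespace) call used by warp_tokenizer.
    class Mixin:
        def greet(self):
            return "hello from the mixin"


    class Base:
        def greet(self):
            return "hello from the base"


    Wrapped = type(Base.__name__, (Mixin, Base), {})    # mirrors warp_tokenizer's call shape
    assert Wrapped().greet() == "hello from the mixin"  # mixin methods win in the MRO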

paddleformers/utils/__init__.py

Lines changed: 0 additions & 1 deletion
@@ -17,7 +17,6 @@
 from typing import TYPE_CHECKING
 
 from ..utils.lazy_import import _LazyModule
-from .paddle_patch import *
 
 import_structure = {
     "nested": [

requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -20,4 +20,4 @@ tokenizers<=0.20.3; python_version<="3.8"
 tokenizers>=0.21,<0.22; python_version>"3.8"
 omegaconf
 modelscope
-transformers>=4.55.1
+transformers>=4.55.1

0 commit comments
