Skip to content

Commit 0eeb4f8

Browse files
committed
add auto tokenizer tests
1 parent 579d957 commit 0eeb4f8

File tree

3 files changed

+273
-0
lines changed

3 files changed

+273
-0
lines changed

paddlenlp/transformers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,12 +53,14 @@
5353
# isort: split
5454
from .bert.modeling import *
5555
from .bert.tokenizer import *
56+
from .bert.tokenizer_fast import *
5657
from .bert.configuration import *
5758

5859
# isort: split
5960
from .gpt import *
6061
from .roberta.modeling import *
6162
from .roberta.tokenizer import *
63+
from .roberta.tokenizer_fast import *
6264
from .roberta.configuration import *
6365
from .electra.modeling import *
6466
from .electra.tokenizer import *

paddlenlp/transformers/bert/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,8 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14+
15+
from .configuration import *
16+
from .modeling import *
17+
from .tokenizer import *
18+
from .tokenizer_fast import *
paddlenlp/transformers/roberta/tokenizer_fast.py

Lines changed: 266 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,266 @@
1+
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
2+
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
"""Fast Tokenization classes for RoBERTa."""
16+
17+
import json
18+
from typing import List, Optional, Tuple
19+
20+
from tokenizers import pre_tokenizers, processors
21+
22+
from ...utils.log import logger
23+
from ..tokenizer_utils_base import AddedToken, BatchEncoding
24+
from ..tokenizer_utils_fast import PretrainedTokenizerFast
25+
from .tokenizer import RobertaTokenizer
26+
27+
# Canonical on-disk filenames for the tokenizer's assets: the BPE vocabulary,
# the BPE merge rules, and the fully serialized fast-tokenizer JSON.
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
28+
29+
30+
class RobertaTokenizerFast(PretrainedTokenizerFast):
    """
    Construct a "fast" RoBERTa tokenizer (backed by HuggingFace's *tokenizers* library), derived from the GPT-2
    tokenizer, using byte-level Byte-Pair-Encoding.

    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
    be encoded differently whether it is at the beginning of the sentence (without space) or not:

    ```python
    >>> from paddlenlp.transformers import RobertaTokenizerFast

    >>> tokenizer = RobertaTokenizerFast.from_pretrained("FacebookAI/roberta-base")
    >>> tokenizer("Hello world")["input_ids"]
    [0, 31414, 232, 2]

    >>> tokenizer(" Hello world")["input_ids"]
    [0, 20920, 232, 2]
    ```

    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.

    <Tip>

    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.

    </Tip>

    This tokenizer inherits from [`PretrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Paradigm to follow when decoding bytes to UTF-8. See
            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
        add_prefix_space (`bool`, *optional*, defaults to `False`):
            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
            other word. (RoBERTa tokenizer detect beginning of words by the preceding space).
        trim_offsets (`bool`, *optional*, defaults to `True`):
            Whether the post processing step should trim offsets to avoid including whitespaces.
    """

    # Filenames used by save_pretrained / from_pretrained to locate tokenizer assets.
    vocab_files_names = VOCAB_FILES_NAMES
    # Inputs produced by __call__ that the model consumes (RoBERTa ignores token_type_ids).
    model_input_names = ["input_ids", "attention_mask"]
    # Slow (pure-Python) counterpart used when converting between slow and fast tokenizers.
    slow_tokenizer_class = RobertaTokenizer

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        errors="replace",
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        add_prefix_space=False,
        trim_offsets=True,
        **kwargs,
    ):
        # Wrap the mask token so that it greedily absorbs the space before it
        # (lstrip=True) — required for "<mask>" to behave correctly in fill-mask
        # usage; see the mask_token property below.
        mask_token = (
            AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
            if isinstance(mask_token, str)
            else mask_token
        )
        super().__init__(
            vocab_file,
            merges_file,
            tokenizer_file=tokenizer_file,
            errors=errors,
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            cls_token=cls_token,
            unk_token=unk_token,
            pad_token=pad_token,
            mask_token=mask_token,
            add_prefix_space=add_prefix_space,
            trim_offsets=trim_offsets,
            **kwargs,
        )

        # The backend tokenizer loaded from tokenizer_file may have been serialized
        # with a different `add_prefix_space`; if so, rebuild the pre-tokenizer from
        # its serialized state with the value requested here.
        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
            pre_tok_state["add_prefix_space"] = add_prefix_space
            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)

        self.add_prefix_space = add_prefix_space

        # Likewise, reconcile the serialized post-processor with the requested
        # `add_prefix_space` / `trim_offsets`, rebuilding it only if needed.
        tokenizer_component = "post_processor"
        tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None)
        if tokenizer_component_instance:
            state = json.loads(tokenizer_component_instance.__getstate__())

            # The lists 'sep' and 'cls' must be cast to tuples for the object `post_processor_class`
            if "sep" in state:
                state["sep"] = tuple(state["sep"])
            if "cls" in state:
                state["cls"] = tuple(state["cls"])

            changes_to_apply = False

            if state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
                state["add_prefix_space"] = add_prefix_space
                changes_to_apply = True

            if state.get("trim_offsets", trim_offsets) != trim_offsets:
                state["trim_offsets"] = trim_offsets
                changes_to_apply = True

            if changes_to_apply:
                component_class = getattr(processors, state.pop("type"))
                new_value = component_class(**state)
                setattr(self.backend_tokenizer, tokenizer_component, new_value)

    @property
    def mask_token(self) -> str:
        """
        `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
        having been set.

        Roberta tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
        comprise the space before the *<mask>*.
        """
        if self._mask_token is None:
            if self.verbose:
                logger.error("Using mask_token, but it is not set yet.")
            return None
        return str(self._mask_token)

    @mask_token.setter
    def mask_token(self, value):
        """
        Overriding the default behavior of the mask token to have it eat the space before it.

        This is needed to preserve backward compatibility with all the previously used models based on Roberta.
        """
        # Mask token behave like a normal word, i.e. include the space before it
        # So we set lstrip to True
        value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value
        self._mask_token = value

    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
        # Pre-tokenized input only works when a prefix space is added, since the
        # byte-level BPE distinguishes word-initial tokens by a leading space.
        # NOTE(review): `assert` is stripped under `python -O`; consider raising
        # ValueError for input validation instead.
        is_split_into_words = kwargs.get("is_split_into_words", False)
        assert self.add_prefix_space or not is_split_into_words, (
            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
            "to use it with pretokenized inputs."
        )

        return super()._batch_encode_plus(*args, **kwargs)

    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
        # Same pre-tokenized-input guard as _batch_encode_plus, for single inputs.
        # NOTE(review): `assert` is stripped under `python -O`; consider raising
        # ValueError for input validation instead.
        is_split_into_words = kwargs.get("is_split_into_words", False)

        assert self.add_prefix_space or not is_split_into_words, (
            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
            "to use it with pretokenized inputs."
        )

        return super()._encode_plus(*args, **kwargs)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str, ...]:
        """
        Save the backend BPE model's vocabulary and merges files to `save_directory`.

        Args:
            save_directory (`str`): Directory to write the files into.
            filename_prefix (`str`, *optional*): Prefix prepended to the saved filenames.

        Returns:
            `Tuple[str, ...]`: Paths of the files written by the backend model.
        """
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)

    def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None) -> List[int]:
        """
        Build model inputs from a sequence (or pair) by adding RoBERTa's special tokens:

        - single sequence: `<s> A </s>`
        - pair of sequences: `<s> A </s></s> B </s>`

        Args:
            token_ids_0 (`List[int]`): IDs of the first sequence.
            token_ids_1 (`List[int]`, *optional*): IDs of the second sequence, for pairs.

        Returns:
            `List[int]`: Input IDs with special tokens inserted.
        """
        output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
        if token_ids_1 is None:
            return output

        # Pairs use a double </s> separator between the two sequences.
        return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        # Length mirrors build_inputs_with_special_tokens: cls + A + sep
        # (single) or cls + A + sep + sep + B + sep (pair), all zeros.
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

0 commit comments

Comments
 (0)