Commit a9a6b80

【Hackathon 7th No.43】TokenizerFast for BLOOM (#9407)
* add bloom tokenizer fast
* fix fast
* Update test_tokenizer.py
* fix lint
* reopen ci
* rerun ci
* fix ci
* fix bloom test
* fix bloom coverage test
* fix bloom coverage test
* add copyright for bert tokenizer fast
* add copyright for bloom tokenizer fast
1 parent b1466d7 commit a9a6b80

6 files changed: +158 additions, -6 deletions

paddlenlp/transformers/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -260,6 +260,7 @@
 from .bloom.configuration import *
 from .bloom.modeling import *
 from .bloom.tokenizer import *
+from .bloom.tokenizer_fast import *
 from .clipseg.configuration import *
 from .clipseg.modeling import *
 from .clipseg.processing import *
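With the wildcard import added above, the new fast tokenizer class is exposed at the package root. A minimal import check (a sketch; it only reads class attributes defined in the new module, no checkpoint download):

from paddlenlp.transformers import BloomTokenizerFast

# Class-level attributes defined in bloom/tokenizer_fast.py; no files are fetched.
print(BloomTokenizerFast.resource_files_names)  # -> {'tokenizer_file': 'tokenizer.json'}
print(BloomTokenizerFast.slow_tokenizer_class)  # -> the slow BloomTokenizer class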

paddlenlp/transformers/auto/tokenizer.py

Lines changed: 4 additions & 1 deletion
@@ -54,7 +54,10 @@
             ),
         ),
         ("blenderbot", "BlenderbotTokenizer"),
-        ("bloom", "BloomTokenizer"),
+        (
+            "bloom",
+            ("BloomTokenizer", "BloomTokenizerFast" if is_tokenizers_available() else None),
+        ),
         ("clip", "CLIPTokenizer"),
         ("codegen", "CodeGenTokenizer"),
         ("convbert", "ConvBertTokenizer"),

paddlenlp/transformers/bert/tokenizer_fast.py

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
paddlenlp/transformers/bloom/tokenizer_fast.py

Lines changed: 132 additions & 0 deletions
@@ -0,0 +1,132 @@
# Copyright 2022 The HuggingFace Inc. team.
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pickle
from typing import Optional, Tuple

from ..tokenizer_utils_base import BatchEncoding
from ..tokenizer_utils_fast import PretrainedTokenizerFast
from .tokenizer import BloomTokenizer

VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"}


class BloomTokenizerFast(PretrainedTokenizerFast):
    r"""
    Construct a "fast" Bloom tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
    Byte-Pair-Encoding.

    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
    be encoded differently whether it is at the beginning of the sentence (without space) or not:

    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
    the model was not pretrained this way, it might yield a decrease in performance.

    <Tip>

    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.

    </Tip>

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Paradigm to follow when decoding bytes to UTF-8. See
            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
        unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
            The end of sequence token.
        add_prefix_space (`bool`, *optional*, defaults to `False`):
            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
            other word. (Bloom tokenizer detect beginning of words by the preceding space).
        trim_offsets (`bool`, *optional*, defaults to `True`):
            Whether or not the post-processing step should trim offsets to avoid including whitespaces.
    """

    resource_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    slow_tokenizer_class = BloomTokenizer

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="<pad>",
        add_prefix_space=False,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        super().__init__(
            vocab_file=vocab_file,
            merges_file=merges_file,
            tokenizer_file=tokenizer_file,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_prefix_space=add_prefix_space,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

        pre_tok_state = pickle.dumps(self.backend_tokenizer.pre_tokenizer)
        decoder_state = pickle.dumps(self.backend_tokenizer.decoder)

        if add_prefix_space:
            pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
            decoder_state = decoder_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
        self.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state)
        self.backend_tokenizer.decoder = pickle.loads(decoder_state)

        self.add_prefix_space = add_prefix_space

    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
        is_split_into_words = kwargs.get("is_split_into_words", False)
        if not (self.add_prefix_space or not is_split_into_words):
            raise Exception(
                f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with"
                " pretokenized inputs."
            )

        return super()._batch_encode_plus(*args, **kwargs)

    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
        is_split_into_words = kwargs.get("is_split_into_words", False)

        if not (self.add_prefix_space or not is_split_into_words):
            raise Exception(
                f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with"
                " pretokenized inputs."
            )

        return super()._encode_plus(*args, **kwargs)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)
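A short usage sketch of the new class (the checkpoint name is the one used in the updated test and is assumed to be downloadable). It illustrates the add_prefix_space behaviour and the constraint enforced by the _encode_plus/_batch_encode_plus overrides:

from paddlenlp.transformers import BloomTokenizerFast

tok = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m")
print(tok("The quick brown fox")["input_ids"])  # plain text works with the defaults

# The overrides above reject pre-tokenized input unless the tokenizer was created
# with add_prefix_space=True, so pass it explicitly for that use case; it also
# changes how the leading word is encoded.
tok_ws = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m", add_prefix_space=True)
print(tok_ws("The quick brown fox")["input_ids"])  # leading token now carries a prefix space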

paddlenlp/transformers/convert_slow_tokenizer.py

Lines changed: 1 addition & 4 deletions
@@ -442,10 +442,7 @@ def pre_tokenizer(self, replacement, add_prefix_space):
         return None


-SLOW_TO_FAST_CONVERTERS = {
-    "LlamaTokenizer": LlamaConverter,
-    "BertTokenizer": BertConverter,
-}
+SLOW_TO_FAST_CONVERTERS = {"LlamaTokenizer": LlamaConverter, "BertTokenizer": BertConverter}


 def convert_slow_tokenizer(transformer_tokenizer, from_tiktoken=False) -> Tokenizer:

tests/transformers/bloom/test_tokenizer.py

Lines changed: 19 additions & 1 deletion
@@ -17,7 +17,7 @@
 import os
 import unittest

-from paddlenlp.transformers import BloomTokenizer
+from paddlenlp.transformers import BloomTokenizer, BloomTokenizerFast

 from ..test_tokenizer_common import TokenizerTesterMixin

@@ -30,6 +30,7 @@
 class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

     tokenizer_class = BloomTokenizer
+    rust_tokenizer_class = BloomTokenizerFast
     from_pretrained_kwargs = {"add_prefix_space": True}
     test_decode_token = True
     test_seq2seq = False
@@ -90,8 +91,25 @@ def test_full_tokenizer(self):

         input_tokens = tokens + [tokenizer.unk_token]
         input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
+
         self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

+    # test encode_plus
+    def test_encodings_from_sample_data(self):
+        """
+        Assert that the created tokens are the same than the hard-coded ones
+        """
+        tokenizer = self.rust_tokenizer_class.from_pretrained("bigscience/bloom-560m")
+
+        INPUT_SENTENCES = ["The quick brown fox</s>", "jumps over the lazy dog</s>"]
+        TARGET_TOKENS = [[2175, 23714, 73173, 144252, 2], [77, 132619, 3478, 368, 109586, 35433, 2]]
+
+        computed_tokens = tokenizer.batch_encode(INPUT_SENTENCES)["input_ids"]
+        self.assertListEqual(TARGET_TOKENS, computed_tokens)
+
+        decoded_tokens = tokenizer.batch_decode(computed_tokens)
+        self.assertListEqual(decoded_tokens, INPUT_SENTENCES)
+
     def test_pretokenized_inputs(self, *args, **kwargs):
         pass
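The new test round-trips two sample sentences through batch_encode and batch_decode against hard-coded ids from the bigscience/bloom-560m tokenizer. To run it in isolation, a standard pytest invocation such as python -m pytest tests/transformers/bloom/test_tokenizer.py -k test_encodings_from_sample_data should work (assumed here; it requires network access to fetch the tokenizer files).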
