Commit a9a6b80

【Hackathon 7th No.43】TokenizerFast for BLOOM (#9407)
* add bloom tokenizer fast
* fix fast
* Update test_tokenizer.py
* fix lint
* reopen ci
* rerun ci
* fix ci
* fix bloom test
* fix bloom coverage test
* fix bloom coverage test
* add copyright for bert tokenizer fast
* add copyright for bloom tokenizer fast
1 parent b1466d7 commit a9a6b80

6 files changed: +158 additions, -6 deletions

paddlenlp/transformers/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -260,6 +260,7 @@
 from .bloom.configuration import *
 from .bloom.modeling import *
 from .bloom.tokenizer import *
+from .bloom.tokenizer_fast import *
 from .clipseg.configuration import *
 from .clipseg.modeling import *
 from .clipseg.processing import *
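With the wildcard import added above, the new fast tokenizer class is exposed at the package root. A minimal import check (a sketch; it only reads class attributes defined in the new module, no checkpoint download):

from paddlenlp.transformers import BloomTokenizerFast

# Class-level attributes defined in bloom/tokenizer_fast.py; no files are fetched.
print(BloomTokenizerFast.resource_files_names)  # -> {'tokenizer_file': 'tokenizer.json'}
print(BloomTokenizerFast.slow_tokenizer_class)  # -> the slow BloomTokenizer class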

paddlenlp/transformers/auto/tokenizer.py

Lines changed: 4 additions & 1 deletion
@@ -54,7 +54,10 @@
             ),
         ),
         ("blenderbot", "BlenderbotTokenizer"),
-        ("bloom", "BloomTokenizer"),
+        (
+            "bloom",
+            ("BloomTokenizer", "BloomTokenizerFast" if is_tokenizers_available() else None),
+        ),
         ("clip", "CLIPTokenizer"),
         ("codegen", "CodeGenTokenizer"),
         ("convbert", "ConvBertTokenizer"),

paddlenlp/transformers/bert/tokenizer_fast.py

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
paddlenlp/transformers/bloom/tokenizer_fast.py

Lines changed: 132 additions & 0 deletions
@@ -0,0 +1,132 @@
# Copyright 2022 The HuggingFace Inc. team.
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pickle
from typing import Optional, Tuple

from ..tokenizer_utils_base import BatchEncoding
from ..tokenizer_utils_fast import PretrainedTokenizerFast
from .tokenizer import BloomTokenizer

VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"}


class BloomTokenizerFast(PretrainedTokenizerFast):
    r"""
    Construct a "fast" Bloom tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
    Byte-Pair-Encoding.

    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
    be encoded differently whether it is at the beginning of the sentence (without space) or not:

    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
    the model was not pretrained this way, it might yield a decrease in performance.

    <Tip>

    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.

    </Tip>

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Paradigm to follow when decoding bytes to UTF-8. See
            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
        unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
            The end of sequence token.
        add_prefix_space (`bool`, *optional*, defaults to `False`):
            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
            other word. (Bloom tokenizer detect beginning of words by the preceding space).
        trim_offsets (`bool`, *optional*, defaults to `True`):
            Whether or not the post-processing step should trim offsets to avoid including whitespaces.
    """

    resource_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    slow_tokenizer_class = BloomTokenizer

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="<pad>",
        add_prefix_space=False,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        super().__init__(
            vocab_file=vocab_file,
            merges_file=merges_file,
            tokenizer_file=tokenizer_file,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_prefix_space=add_prefix_space,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

        pre_tok_state = pickle.dumps(self.backend_tokenizer.pre_tokenizer)
        decoder_state = pickle.dumps(self.backend_tokenizer.decoder)

        if add_prefix_space:
            pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
            decoder_state = decoder_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
        self.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state)
        self.backend_tokenizer.decoder = pickle.loads(decoder_state)

        self.add_prefix_space = add_prefix_space

    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
        is_split_into_words = kwargs.get("is_split_into_words", False)
        if not (self.add_prefix_space or not is_split_into_words):
            raise Exception(
                f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with"
                " pretokenized inputs."
            )

        return super()._batch_encode_plus(*args, **kwargs)

    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
        is_split_into_words = kwargs.get("is_split_into_words", False)

        if not (self.add_prefix_space or not is_split_into_words):
            raise Exception(
                f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with"
                " pretokenized inputs."
            )

        return super()._encode_plus(*args, **kwargs)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)
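A short usage sketch of the new class (the checkpoint name is the one used in the updated test and is assumed to be downloadable). It illustrates the add_prefix_space behaviour and the constraint enforced by the _encode_plus/_batch_encode_plus overrides:

from paddlenlp.transformers import BloomTokenizerFast

tok = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m")
print(tok("The quick brown fox")["input_ids"])  # plain text works with the defaults

# The overrides above reject pre-tokenized input unless the tokenizer was created
# with add_prefix_space=True, so pass it explicitly for that use case; it also
# changes how the leading word is encoded.
tok_ws = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m", add_prefix_space=True)
print(tok_ws("The quick brown fox")["input_ids"])  # leading token now carries a prefix space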

paddlenlp/transformers/convert_slow_tokenizer.py

Lines changed: 1 addition & 4 deletions
@@ -442,10 +442,7 @@ def pre_tokenizer(self, replacement, add_prefix_space):
         return None


-SLOW_TO_FAST_CONVERTERS = {
-    "LlamaTokenizer": LlamaConverter,
-    "BertTokenizer": BertConverter,
-}
+SLOW_TO_FAST_CONVERTERS = {"LlamaTokenizer": LlamaConverter, "BertTokenizer": BertConverter}


 def convert_slow_tokenizer(transformer_tokenizer, from_tiktoken=False) -> Tokenizer:

tests/transformers/bloom/test_tokenizer.py

Lines changed: 19 additions & 1 deletion
@@ -17,7 +17,7 @@
 import os
 import unittest

-from paddlenlp.transformers import BloomTokenizer
+from paddlenlp.transformers import BloomTokenizer, BloomTokenizerFast

 from ..test_tokenizer_common import TokenizerTesterMixin

@@ -30,6 +30,7 @@
 class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

     tokenizer_class = BloomTokenizer
+    rust_tokenizer_class = BloomTokenizerFast
     from_pretrained_kwargs = {"add_prefix_space": True}
     test_decode_token = True
     test_seq2seq = False
@@ -90,8 +91,25 @@ def test_full_tokenizer(self):

         input_tokens = tokens + [tokenizer.unk_token]
         input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
+
         self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

+    # test encode_plus
+    def test_encodings_from_sample_data(self):
+        """
+        Assert that the created tokens are the same than the hard-coded ones
+        """
+        tokenizer = self.rust_tokenizer_class.from_pretrained("bigscience/bloom-560m")
+
+        INPUT_SENTENCES = ["The quick brown fox</s>", "jumps over the lazy dog</s>"]
+        TARGET_TOKENS = [[2175, 23714, 73173, 144252, 2], [77, 132619, 3478, 368, 109586, 35433, 2]]
+
+        computed_tokens = tokenizer.batch_encode(INPUT_SENTENCES)["input_ids"]
+        self.assertListEqual(TARGET_TOKENS, computed_tokens)
+
+        decoded_tokens = tokenizer.batch_decode(computed_tokens)
+        self.assertListEqual(decoded_tokens, INPUT_SENTENCES)
+
     def test_pretokenized_inputs(self, *args, **kwargs):
         pass
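The new test round-trips two sample sentences through batch_encode and batch_decode against hard-coded ids from the bigscience/bloom-560m tokenizer. To run it in isolation, a standard pytest invocation such as python -m pytest tests/transformers/bloom/test_tokenizer.py -k test_encodings_from_sample_data should work (assumed here; it requires network access to fetch the tokenizer files).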
