
Commit 7a221cc

【Hackathon 7th No.43】TokenizerFast for Qwen2 (#9532)
* add qwen2 tokenizer fast
1 parent: 2522bf8

File tree: 6 files changed (+190, −3 lines)


paddlenlp/transformers/auto/tokenizer.py

Lines changed: 1 addition & 1 deletion
@@ -124,7 +124,7 @@
         ("ernie_vil", "ErnieViLTokenizer"),
         ("glm", "GLMGPT2Tokenizer"),
         ("qwen", "QWenTokenizer"),
-        ("qwen2", "Qwen2Tokenizer"),
+        ("qwen2", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
         ("yuan", "YuanTokenizer"),
     ]
 )
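
With the (slow, fast) pair registered, AutoTokenizer can hand back Qwen2TokenizerFast whenever the tokenizers backend is installed. A minimal sketch; the use_fast=True flag and the checkpoint name are assumptions, mirroring how other registered (slow, fast) pairs are resolved:

from paddlenlp.transformers import AutoTokenizer

# Assumes `use_fast=True` selects the fast class from the registered pair;
# without the `tokenizers` package this falls back to the slow Qwen2Tokenizer.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B", use_fast=True)
print(type(tokenizer).__name__)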

paddlenlp/transformers/convert_slow_tokenizer.py

Lines changed: 49 additions & 1 deletion
@@ -442,7 +442,55 @@ def pre_tokenizer(self, replacement, add_prefix_space):
         return None
 
 
-SLOW_TO_FAST_CONVERTERS = {"LlamaTokenizer": LlamaConverter, "BertTokenizer": BertConverter}
+class Qwen2Converter(Converter):
+    def converted(self, vocab: Dict[str, int] = None, merges: List[Tuple[str, str]] = None) -> Tokenizer:
+        if not vocab:
+            vocab = self.original_tokenizer.encoder
+        if not merges:
+            merges = list(self.original_tokenizer.bpe_ranks.keys())
+
+        tokenizer = Tokenizer(
+            BPE(
+                vocab=vocab,
+                merges=merges,
+                dropout=None,
+                unk_token=None,
+                continuing_subword_prefix="",
+                end_of_word_suffix="",
+                fuse_unk=False,
+                byte_fallback=False,
+            )
+        )
+
+        tokenizer.normalizer = normalizers.NFC()
+
+        tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
+            [
+                pre_tokenizers.Split(
+                    Regex(
+                        r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
+                    ),
+                    behavior="isolated",
+                    invert=False,
+                ),
+                pre_tokenizers.ByteLevel(
+                    add_prefix_space=getattr(self.original_tokenizer, "add_prefix_space", False),
+                    use_regex=False,
+                ),
+            ]
+        )
+
+        tokenizer.decoder = decoders.ByteLevel()
+        tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
+
+        return tokenizer
+
+
+SLOW_TO_FAST_CONVERTERS = {
+    "LlamaTokenizer": LlamaConverter,
+    "BertTokenizer": BertConverter,
+    "Qwen2Tokenizer": Qwen2Converter,
+}
 
 
 def convert_slow_tokenizer(transformer_tokenizer, from_tiktoken=False) -> Tokenizer:
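
Qwen2Converter builds the tokenizers backend (byte-level BPE with an NFC normalizer and the GPT-2-style split regex) from a slow tokenizer's vocab and merges. A minimal sketch of how it is exercised through convert_slow_tokenizer; the checkpoint name is illustrative:

from paddlenlp.transformers import Qwen2Tokenizer
from paddlenlp.transformers.convert_slow_tokenizer import convert_slow_tokenizer

# Load the existing slow tokenizer, then convert its encoder/bpe_ranks into a
# `tokenizers.Tokenizer` via the new "Qwen2Tokenizer" -> Qwen2Converter entry.
slow = Qwen2Tokenizer.from_pretrained("Qwen/Qwen2-0.5B")
fast_backend = convert_slow_tokenizer(slow)
print(fast_backend.encode("Hello world").ids)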

paddlenlp/transformers/qwen2/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -17,3 +17,4 @@
 from .modeling import *
 from .modeling_pp import *
 from .tokenizer import *
+from .tokenizer_fast import *

paddlenlp/transformers/qwen2/tokenizer_fast.py

Lines changed: 131 additions & 0 deletions
@@ -0,0 +1,131 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for Qwen2."""
+
+from typing import Optional, Tuple
+
+from ..tokenizer_utils import AddedToken
+from ..tokenizer_utils_fast import PretrainedTokenizerFast
+from .tokenizer import Qwen2Tokenizer
+
+VOCAB_FILES_NAMES = {
+    "vocab_file": "vocab.json",
+    "merges_file": "merges.txt",
+    "tokenizer_file": "tokenizer.json",
+}
+
+
+MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768}
+
+
+class Qwen2TokenizerFast(PretrainedTokenizerFast):
+    """
+    Construct a "fast" Qwen2 tokenizer (backed by PaddleNLP's *tokenizers* library). Based on byte-level
+    Byte-Pair-Encoding.
+
+    As with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens, so a word will
+    be encoded differently depending on whether it is at the beginning of the sentence (without space) or not:
+
+    ```python
+    >>> from paddlenlp.transformers import Qwen2TokenizerFast
+
+    >>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer")
+    >>> tokenizer("Hello world")["input_ids"]
+    [9707, 1879]
+
+    >>> tokenizer(" Hello world")["input_ids"]
+    [21927, 1879]
+    ```
+    This is expected.
+
+    This tokenizer inherits from [`PretrainedTokenizerFast`], which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`, *optional*):
+            Path to the vocabulary file.
+        merges_file (`str`, *optional*):
+            Path to the merges file.
+        tokenizer_file (`str`, *optional*):
+            Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
+            contains everything needed to load the tokenizer.
+        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead. Not applicable to this tokenizer.
+        bos_token (`str`, *optional*):
+            The beginning of sequence token. Not applicable for this tokenizer.
+        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The end of sequence token.
+        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    resource_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = Qwen2Tokenizer
+
+    def __init__(
+        self,
+        vocab_file=None,
+        merges_file=None,
+        tokenizer_file=None,
+        unk_token="<|endoftext|>",
+        bos_token=None,
+        eos_token="<|endoftext|>",
+        pad_token="<|endoftext|>",
+        **kwargs,
+    ):
+        # We need to at least pass vocab_file and merges_file to the base class
+        # in case a slow tokenizer needs to be initialized; others can be
+        # configured through files.
+        # Following GPT2TokenizerFast, also adding unk_token, bos_token, and eos_token.
+
+        bos_token = (
+            AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False)
+            if isinstance(bos_token, str)
+            else bos_token
+        )
+        eos_token = (
+            AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False)
+            if isinstance(eos_token, str)
+            else eos_token
+        )
+        unk_token = (
+            AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False)
+            if isinstance(unk_token, str)
+            else unk_token
+        )
+        pad_token = (
+            AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False)
+            if isinstance(pad_token, str)
+            else pad_token
+        )
+
+        super().__init__(
+            vocab_file=vocab_file,
+            merges_file=merges_file,
+            tokenizer_file=tokenizer_file,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            **kwargs,
+        )
+
+    # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast.save_vocabulary
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
+        return tuple(files)
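
A quick end-to-end check of the new class; the checkpoint name and output directory are illustrative, and loading is assumed to fall back to converting the slow tokenizer's files when the checkpoint ships no tokenizer.json:

import os

from paddlenlp.transformers import Qwen2TokenizerFast

tok = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen2-0.5B")
print(tok("Hello world")["input_ids"])

# save_vocabulary delegates to the backend BPE model, writing vocab.json and
# merges.txt into the (pre-existing) target directory.
os.makedirs("./qwen2_fast", exist_ok=True)
print(tok.save_vocabulary("./qwen2_fast"))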

paddlenlp/transformers/tokenizer_utils_base.py

Lines changed: 6 additions & 0 deletions
@@ -1534,6 +1534,12 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
             "chat_template_file": CHAT_TEMPLATE_CONFIG_NAME,
         }
 
+        if hasattr(cls, "vocab_files_names") and len(cls.resource_files_names) == 0:
+            cls.resource_files_names = copy.deepcopy(cls.vocab_files_names)
+            logger.error(
+                "The attribute 'vocab_files_names' is deprecated. Please use 'resource_files_names' instead.",
+                DeprecationWarning,
+            )
         vocab_files_target = {**cls.resource_files_names, **additional_files_names}
         # From HF Hub or AI Studio
         if from_hf_hub or from_aistudio:
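
This shim keeps tokenizers that only define the HF-style vocab_files_names attribute working: the mapping is copied into resource_files_names before resources are resolved, and a deprecation message is logged. A minimal sketch with a hypothetical legacy class:

from paddlenlp.transformers import PretrainedTokenizer

# Hypothetical tokenizer that still uses only the deprecated attribute name.
class LegacyBPETokenizer(PretrainedTokenizer):
    vocab_files_names = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
    resource_files_names = {}

# from_pretrained now copies vocab_files_names into resource_files_names (and
# logs the deprecation) before building vocab_files_target, so vocab.json and
# merges.txt are still looked up for this class.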

tests/transformers/qwen2/test_tokenizer.py

Lines changed: 2 additions & 1 deletion
@@ -18,14 +18,15 @@
 import os
 import unittest
 
-from paddlenlp.transformers import Qwen2Tokenizer
+from paddlenlp.transformers import Qwen2Tokenizer, Qwen2TokenizerFast
 from paddlenlp.transformers.qwen2.tokenizer import VOCAB_FILES_NAMES, bytes_to_unicode
 from tests.transformers.test_tokenizer_common import TokenizerTesterMixin
 
 
 class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     from_pretrained_id = "__internal_testing__/tiny-random-qwen2"
     tokenizer_class = Qwen2Tokenizer
+    rust_tokenizer_class = Qwen2TokenizerFast
     test_slow_tokenizer = True
     space_between_special_tokens = False
     from_pretrained_kwargs = None
