# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright 2023 Masatoshi Suzuki (@singletongue)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unicodedata

from transformers.models.bert_japanese.tokenization_bert_japanese import (
    BertJapaneseTokenizer as BertJapaneseTokenizerBase,
    CharacterTokenizer as CharacterTokenizerBase,
)


class BertJapaneseTokenizer(BertJapaneseTokenizerBase):
    def __init__(
        self,
        vocab_file,
        spm_file=None,
        do_lower_case=False,
        do_word_tokenize=True,
        do_subword_tokenize=True,
        word_tokenizer_type="basic",
        subword_tokenizer_type="wordpiece",
        vocab_has_no_subword_prefix=False,
        never_split=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        mecab_kwargs=None,
        sudachi_kwargs=None,
        jumanpp_kwargs=None,
        **kwargs,
    ):
        super().__init__(
            vocab_file,
            spm_file=spm_file,
            do_lower_case=do_lower_case,
            do_word_tokenize=do_word_tokenize,
            do_subword_tokenize=do_subword_tokenize,
            word_tokenizer_type=word_tokenizer_type,
            subword_tokenizer_type=subword_tokenizer_type,
            never_split=never_split,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            mecab_kwargs=mecab_kwargs,
            sudachi_kwargs=sudachi_kwargs,
            jumanpp_kwargs=jumanpp_kwargs,
            **kwargs,
        )
        self.vocab_has_no_subword_prefix = vocab_has_no_subword_prefix
        # Replace the upstream CharacterTokenizer with the local variant below,
        # which marks non-initial characters with a "##" prefix.
        if do_subword_tokenize and subword_tokenizer_type == "character":
            self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=self.unk_token)

    def _convert_token_to_id(self, token):
        # If the vocabulary stores entries without the "##" prefix, strip the
        # prefix before the lookup so subword tokens still resolve to valid ids.
        if self.vocab_has_no_subword_prefix and token.startswith("##"):
            token = token[len("##"):]
        return self.vocab.get(token, self.vocab.get(self.unk_token))


class CharacterTokenizer(CharacterTokenizerBase):
    def __init__(self, vocab, unk_token, normalize_text=True):
        super().__init__(vocab, unk_token, normalize_text=normalize_text)

    def tokenize(self, text):
        """Splits text into single characters, prefixing non-initial ones with "##"."""
        if self.normalize_text:
            text = unicodedata.normalize("NFKC", text)

        output_tokens = []
        for i, char in enumerate(text):
            if char not in self.vocab:
                output_tokens.append(self.unk_token)
                continue
            # Every character after the first is emitted as a subword continuation.
            if i > 0:
                char = "##" + char
            output_tokens.append(char)
        return output_tokens
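

# Usage sketch (not part of the original module; "vocab.txt" and the sample text
# are illustrative assumptions): with a character-level vocabulary whose entries
# carry no "##" prefix, set vocab_has_no_subword_prefix=True so that
# _convert_token_to_id strips the prefix before looking tokens up.
#
# tokenizer = BertJapaneseTokenizer(
#     "vocab.txt",
#     subword_tokenizer_type="character",
#     vocab_has_no_subword_prefix=True,
# )
# tokens = tokenizer.tokenize("東京タワー")  # non-initial characters carry "##"
# ids = tokenizer.convert_tokens_to_ids(tokens)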