16 | 16 |
17 | 17 | import sentencepiece as spm
18 | 18 | import unicodedata
   | 19 | +from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union
19 | 20 |
   | 21 | +from ..tokenizer_utils_base import TensorType, PaddingStrategy, TruncationStrategy
20 | 22 | from .. import PretrainedTokenizer
21 | 23 |
22 | 24 | __all__ = ['ErnieMTokenizer']
@@ -114,27 +116,50 @@ def __init__(self,
114 | 116 |         self.SP_CHAR_MAPPING[chr(ch)] = chr(ch - 65248)
115 | 117 |
116 | 118 |     def __call__(self,
117 |     | -                 text,
118 |     | -                 text_pair=None,
119 |     | -                 max_seq_len=None,
120 |     | -                 stride=0,
121 |     | -                 is_split_into_words=False,
122 |     | -                 pad_to_max_seq_len=False,
123 |     | -                 truncation_strategy="longest_first",
124 |     | -                 return_position_ids=True,
125 |     | -                 return_token_type_ids=False,
126 |     | -                 return_attention_mask=True,
127 |     | -                 return_length=False,
128 |     | -                 return_overflowing_tokens=False,
129 |     | -                 return_special_tokens_mask=False,
130 |     | -                 max_length=None):
131 |     | -        if max_length is None:
132 |     | -            max_length = max_seq_len
    | 119 | +                 text: Union[str, List[str], List[List[str]]],
    | 120 | +                 text_pair: Optional[Union[str, List[str],
    | 121 | +                                           List[List[str]]]] = None,
    | 122 | +                 max_length: Optional[int] = None,
    | 123 | +                 stride: int = 0,
    | 124 | +                 is_split_into_words: bool = False,
    | 125 | +                 padding: Union[bool, str, PaddingStrategy] = False,
    | 126 | +                 truncation: Union[bool, str, TruncationStrategy] = False,
    | 127 | +                 return_position_ids: bool = True,
    | 128 | +                 return_token_type_ids: bool = False,
    | 129 | +                 return_attention_mask: bool = True,
    | 130 | +                 return_length: bool = False,
    | 131 | +                 return_overflowing_tokens: bool = False,
    | 132 | +                 return_special_tokens_mask: bool = False,
    | 133 | +                 return_dict: bool = True,
    | 134 | +                 return_offsets_mapping: bool = False,
    | 135 | +                 add_special_tokens: bool = True,
    | 136 | +                 pad_to_multiple_of: Optional[int] = None,
    | 137 | +                 return_tensors: Optional[Union[str, TensorType]] = None,
    | 138 | +                 verbose: bool = True,
    | 139 | +                 **kwargs):
133 | 140 |         return super(ErnieMTokenizer, self).__call__(
134 |     | -            text, text_pair, max_length, stride, is_split_into_words,
135 |     | -            pad_to_max_seq_len, truncation_strategy, return_position_ids,
136 |     | -            return_token_type_ids, return_attention_mask, return_length,
137 |     | -            return_overflowing_tokens, return_special_tokens_mask)
    | 141 | +            text=text,
    | 142 | +            text_pair=text_pair,
    | 143 | +            max_length=max_length,
    | 144 | +            stride=stride,
    | 145 | +            is_split_into_words=is_split_into_words,
    | 146 | +            padding=padding,
    | 147 | +            truncation=truncation,
    | 148 | +            return_position_ids=return_position_ids,
    | 149 | +            # Ernie-M model doesn't have token_type embedding.
    | 150 | +            # So set "return_token_type_ids" to False.
    | 151 | +            return_token_type_ids=False,
    | 152 | +            return_attention_mask=return_attention_mask,
    | 153 | +            return_length=return_length,
    | 154 | +            return_overflowing_tokens=return_overflowing_tokens,
    | 155 | +            return_special_tokens_mask=return_special_tokens_mask,
    | 156 | +            return_dict=return_dict,
    | 157 | +            return_offsets_mapping=return_offsets_mapping,
    | 158 | +            add_special_tokens=add_special_tokens,
    | 159 | +            pad_to_multiple_of=pad_to_multiple_of,
    | 160 | +            return_tensors=return_tensors,
    | 161 | +            verbose=verbose,
    | 162 | +            **kwargs)
138 | 163 |
139 | 164 |     def get_offset_mapping(self, text):
140 | 165 |         split_tokens = self._tokenize(text)
@@ -208,7 +233,7 @@ def _tokenize(self, text, sample=False):
208 | 233 |         new_pieces.append(piece[lst_i:])
209 | 234 |         return new_pieces
210 | 235 |
211 |     | -    def tokenize(self, text):
    | 236 | +    def tokenize(self, text, **kwargs):
212 | 237 |         r"""
213 | 238 |         Converts a string to a list of tokens.
214 | 239 |
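For reference, below is a minimal usage sketch of the refactored keyword-style `__call__` API. It is illustrative only: it assumes a PaddleNLP build that already contains this change, and that the `ernie-m-base` pretrained name resolves via `ErnieMTokenizer.from_pretrained`; neither is shown in this diff.

    # Hypothetical usage: the new padding/truncation keywords replace the old
    # pad_to_max_seq_len / truncation_strategy flags removed in this diff.
    from paddlenlp.transformers import ErnieMTokenizer

    # "ernie-m-base" is an assumed pretrained name, not confirmed by this diff.
    tokenizer = ErnieMTokenizer.from_pretrained("ernie-m-base")

    encoded = tokenizer(
        ["PaddleNLP makes NLP easy.", "Ernie-M is a multilingual model."],
        max_length=32,
        padding="max_length",   # maps to PaddingStrategy.MAX_LENGTH
        truncation=True,        # maps to TruncationStrategy.LONGEST_FIRST
        return_attention_mask=True)

    print(encoded["input_ids"])       # token id lists, padded to max_length
    print(encoded["attention_mask"])  # 1 for real tokens, 0 for padding

Note that `return_token_type_ids` is hard-coded to False in the super call because Ernie-M has no token_type embedding, so no `token_type_ids` key will appear in the output regardless of what the caller passes.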