Commit 912e027

Fix ernie-m tokenizer defalut args (#2916)
1 parent 95c5e00 commit 912e027

File tree: 3 files changed (+99, -24 lines)

paddlenlp/transformers/ernie_m/faster_tokenizer.py

Lines changed: 48 additions & 1 deletion
@@ -15,11 +15,12 @@

 import os
 import json
-from typing import List, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union
 from shutil import copyfile

 from faster_tokenizer import normalizers
 from ..tokenizer_utils_faster import PretrainedFasterTokenizer
+from ..tokenizer_utils_base import TensorType, PaddingStrategy, TruncationStrategy
 from .tokenizer import ErnieMTokenizer
 from ...utils.log import logger

@@ -87,3 +88,49 @@ def save_vocabulary(self,
             copyfile(self.sentencepiece_model_file,
                      out_sentencepiece_model_file)
         return (out_sentencepiece_model_file, )
+
+    def __call__(self,
+                 text: Union[str, List[str], List[List[str]]],
+                 text_pair: Optional[Union[str, List[str],
+                                           List[List[str]]]] = None,
+                 max_length: Optional[int] = None,
+                 stride: int = 0,
+                 is_split_into_words: bool = False,
+                 padding: Union[bool, str, PaddingStrategy] = False,
+                 truncation: Union[bool, str, TruncationStrategy] = False,
+                 return_position_ids: bool = True,
+                 return_token_type_ids: bool = False,
+                 return_attention_mask: bool = True,
+                 return_length: bool = False,
+                 return_overflowing_tokens: bool = False,
+                 return_special_tokens_mask: bool = False,
+                 return_dict: bool = True,
+                 return_offsets_mapping: bool = False,
+                 add_special_tokens: bool = True,
+                 pad_to_multiple_of: Optional[int] = None,
+                 return_tensors: Optional[Union[str, TensorType]] = None,
+                 verbose: bool = True,
+                 **kwargs):
+        return super(ErnieMFasterTokenizer, self).__call__(
+            text=text,
+            text_pair=text_pair,
+            max_length=max_length,
+            stride=stride,
+            is_split_into_words=is_split_into_words,
+            padding=padding,
+            truncation=truncation,
+            return_position_ids=return_position_ids,
+            # Ernie-M model doesn't have token_type embedding.
+            # So set "return_token_type_ids" to False.
+            return_token_type_ids=False,
+            return_attention_mask=return_attention_mask,
+            return_length=return_length,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_dict=return_dict,
+            return_offsets_mapping=return_offsets_mapping,
+            add_special_tokens=add_special_tokens,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_tensors=return_tensors,
+            verbose=verbose,
+            **kwargs)
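
As a usage note (not part of the diff): with this override, the faster tokenizer accepts the unified keyword arguments and never returns token_type_ids. A minimal sketch, assuming the class is exported from paddlenlp.transformers like the other tokenizers and that "ernie-m-base" is a valid pretrained name:

    # Hedged example; the import path and checkpoint name are assumptions.
    from paddlenlp.transformers import ErnieMFasterTokenizer

    tokenizer = ErnieMFasterTokenizer.from_pretrained("ernie-m-base")

    # With the defaults set by the override, the output should contain
    # input_ids, position_ids and attention_mask, but no token_type_ids,
    # because return_token_type_ids is pinned to False in the super() call.
    encoded = tokenizer("PaddleNLP makes NLP easy.",
                        max_length=16,
                        padding="max_length",
                        truncation=True)
    print(encoded.keys())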

paddlenlp/transformers/ernie_m/tokenizer.py

Lines changed: 46 additions & 21 deletions
@@ -16,7 +16,9 @@

 import sentencepiece as spm
 import unicodedata
+from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union

+from ..tokenizer_utils_base import TensorType, PaddingStrategy, TruncationStrategy
 from .. import PretrainedTokenizer

 __all__ = ['ErnieMTokenizer']
@@ -114,27 +116,50 @@ def __init__(self,
             self.SP_CHAR_MAPPING[chr(ch)] = chr(ch - 65248)

     def __call__(self,
-                 text,
-                 text_pair=None,
-                 max_seq_len=None,
-                 stride=0,
-                 is_split_into_words=False,
-                 pad_to_max_seq_len=False,
-                 truncation_strategy="longest_first",
-                 return_position_ids=True,
-                 return_token_type_ids=False,
-                 return_attention_mask=True,
-                 return_length=False,
-                 return_overflowing_tokens=False,
-                 return_special_tokens_mask=False,
-                 max_length=None):
-        if max_length is None:
-            max_length = max_seq_len
+                 text: Union[str, List[str], List[List[str]]],
+                 text_pair: Optional[Union[str, List[str],
+                                           List[List[str]]]] = None,
+                 max_length: Optional[int] = None,
+                 stride: int = 0,
+                 is_split_into_words: bool = False,
+                 padding: Union[bool, str, PaddingStrategy] = False,
+                 truncation: Union[bool, str, TruncationStrategy] = False,
+                 return_position_ids: bool = True,
+                 return_token_type_ids: bool = False,
+                 return_attention_mask: bool = True,
+                 return_length: bool = False,
+                 return_overflowing_tokens: bool = False,
+                 return_special_tokens_mask: bool = False,
+                 return_dict: bool = True,
+                 return_offsets_mapping: bool = False,
+                 add_special_tokens: bool = True,
+                 pad_to_multiple_of: Optional[int] = None,
+                 return_tensors: Optional[Union[str, TensorType]] = None,
+                 verbose: bool = True,
+                 **kwargs):
         return super(ErnieMTokenizer, self).__call__(
-            text, text_pair, max_length, stride, is_split_into_words,
-            pad_to_max_seq_len, truncation_strategy, return_position_ids,
-            return_token_type_ids, return_attention_mask, return_length,
-            return_overflowing_tokens, return_special_tokens_mask)
+            text=text,
+            text_pair=text_pair,
+            max_length=max_length,
+            stride=stride,
+            is_split_into_words=is_split_into_words,
+            padding=padding,
+            truncation=truncation,
+            return_position_ids=return_position_ids,
+            # Ernie-M model doesn't have token_type embedding.
+            # So set "return_token_type_ids" to False.
+            return_token_type_ids=False,
+            return_attention_mask=return_attention_mask,
+            return_length=return_length,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_dict=return_dict,
+            return_offsets_mapping=return_offsets_mapping,
+            add_special_tokens=add_special_tokens,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_tensors=return_tensors,
+            verbose=verbose,
+            **kwargs)

     def get_offset_mapping(self, text):
         split_tokens = self._tokenize(text)
@@ -208,7 +233,7 @@ def _tokenize(self, text, sample=False):
             new_pieces.append(piece[lst_i:])
         return new_pieces

-    def tokenize(self, text):
+    def tokenize(self, text, **kwargs):
         r"""
         Converts a string to a list of tokens.
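
To illustrate the signature change, a hedged sketch (not part of the commit; "ernie-m-base" is an assumed checkpoint name): call sites that previously passed max_seq_len, pad_to_max_seq_len and truncation_strategy now use the unified max_length, padding and truncation keywords.

    # Hedged example of the new calling convention.
    from paddlenlp.transformers import ErnieMTokenizer

    tokenizer = ErnieMTokenizer.from_pretrained("ernie-m-base")

    features = tokenizer("It is a nice day.",
                         text_pair="Let's go hiking.",
                         max_length=32,
                         padding="max_length",  # replaces pad_to_max_seq_len=True
                         truncation=True,       # replaces truncation_strategy="longest_first"
                         return_attention_mask=True)

    # ERNIE-M has no token_type embedding, so the override hard-codes
    # return_token_type_ids=False; no token_type_ids key is expected here.
    print(features.keys())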

paddlenlp/transformers/tokenizer_utils_faster.py

Lines changed: 5 additions & 2 deletions
@@ -138,6 +138,7 @@ def _convert_encoding(
         return_special_tokens_mask: bool = False,
         return_offsets_mapping: bool = False,
         return_length: bool = False,
+        return_position_ids: bool = False,
         verbose: bool = True,
     ) -> Tuple[Dict[str, Any], List[FasterEncoding]]:
         """
@@ -174,7 +175,8 @@ def _convert_encoding(
                 encoding_dict["offset_mapping"].append(e.offsets)
             if return_length:
                 encoding_dict["length"].append(len(e.ids))
-
+            if return_position_ids:
+                encoding_dict["position_ids"].append(list(range(len(e.ids))))
         return encoding_dict, encodings

     def convert_tokens_to_ids(
@@ -317,7 +319,7 @@ def set_truncation_and_padding(
                 "direction": self.padding_side,
                 "pad_id": self.pad_token_id,
                 "pad_token": self.pad_token,
-                "pad_token_type_id": self.pad_token_type_id,
+                "pad_type_id": self.pad_token_type_id,
                 "pad_to_multiple_of": pad_to_multiple_of,
             }
             if _padding != target:
@@ -384,6 +386,7 @@ def _batch_encode_plus(
                 return_special_tokens_mask=return_special_tokens_mask,
                 return_offsets_mapping=return_offsets_mapping,
                 return_length=return_length,
+                return_position_ids=return_position_ids,
                 verbose=verbose,
             ) for encoding in encodings
         ]
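
Two things happen in this file: _convert_encoding gains an optional return_position_ids path that builds position ids as consecutive integers per encoding (and _batch_encode_plus now forwards the flag), and the padding target dict key is renamed from "pad_token_type_id" to "pad_type_id", presumably to match the key the faster_tokenizer padding backend expects. A standalone sketch of the position-ids logic, using a hypothetical stand-in for faster_tokenizer's FasterEncoding:

    # Standalone sketch of the added return_position_ids branch; Encoding is a
    # hypothetical stand-in exposing only the .ids field the logic reads.
    from typing import Dict, List

    class Encoding:
        def __init__(self, ids: List[int]):
            self.ids = ids

    def build_position_ids(encodings: List[Encoding]) -> Dict[str, List[List[int]]]:
        encoding_dict: Dict[str, List[List[int]]] = {"position_ids": []}
        for e in encodings:
            # Mirrors the new branch: position ids are 0 .. len(ids)-1
            # for each encoding in the batch.
            encoding_dict["position_ids"].append(list(range(len(e.ids))))
        return encoding_dict

    print(build_position_ids([Encoding([1, 5, 9, 2])]))
    # -> {'position_ids': [[0, 1, 2, 3]]}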
