Commit 87f342c

fix roberta tokenizer (#1798)
1 parent 5bcc708 commit 87f342c

File tree

2 files changed, 8 insertions(+), 2 deletions(-)


paddlenlp/transformers/roberta/tokenizer.py

Lines changed: 6 additions & 0 deletions
@@ -176,6 +176,9 @@ def __init__(self,
                 "and 'merges_file' to construct an roberta BPE tokenizer."
                 "Specify 'vocal_file' for Chinese tokenizer")
 
+    def __getattr__(self, name):
+        return self.tokenizer.__getattr__(name)
+
     @property
     def vocab_size(self):
         """
@@ -324,6 +327,9 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
         return self.tokenizer.build_inputs_with_special_tokens(
             token_ids_0, token_ids_1=token_ids_1)
 
+    def _convert_token_to_id(self, token):
+        return self.tokenizer._convert_token_to_id(token)
+
     def build_offset_mapping_with_special_tokens(self,
                                                  offset_mapping_0,
                                                  offset_mapping_1=None):
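
This change makes the RoBERTa wrapper forward unresolved attribute lookups and token-to-id conversion to the inner tokenizer it holds in self.tokenizer. Below is a minimal sketch of that delegation pattern; the class names and toy vocabulary are illustrative only, and the forwarding uses getattr() rather than PaddleNLP's exact code.

# Minimal sketch (not PaddleNLP's actual classes) of the delegation the
# commit adds: a wrapper tokenizer forwards attribute access and
# token-to-id conversion to the real tokenizer stored in self.tokenizer.
class _InnerTokenizer:
    def __init__(self):
        self.vocab = {"<s>": 0, "</s>": 2, "hello": 31414}
        self.unk_token_id = 3

    def _convert_token_to_id(self, token):
        return self.vocab.get(token, self.unk_token_id)


class WrapperTokenizer:
    def __init__(self):
        # the wrapper owns an inner tokenizer and delegates work to it
        self.tokenizer = _InnerTokenizer()

    def __getattr__(self, name):
        # invoked only when normal attribute lookup fails, so attributes
        # set in __init__ (like self.tokenizer) resolve the usual way and
        # everything else falls through to the inner tokenizer
        return getattr(self.tokenizer, name)

    def _convert_token_to_id(self, token):
        # explicit delegation for a method the surrounding tokenizer
        # machinery calls directly on the wrapper
        return self.tokenizer._convert_token_to_id(token)


wrapper = WrapperTokenizer()
print(wrapper.unk_token_id)                   # 3, forwarded via __getattr__
print(wrapper._convert_token_to_id("hello"))  # 31414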

paddlenlp/transformers/tokenizer_utils.py

Lines changed: 2 additions & 2 deletions
@@ -1702,8 +1702,8 @@ def get_input_ids(text):
 
             else:
                 encoded_inputs = self.encode(
-                    first_ids,
-                    second_ids,
+                    text,
+                    text_pair,
                     max_seq_len=max_seq_len,
                     pad_to_max_seq_len=pad_to_max_seq_len,
                     truncation_strategy=truncation_strategy,
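
Here first_ids and second_ids already hold converted token ids, while the fix passes the raw text and text_pair through to self.encode instead, which suggests this call path expects raw input. The toy example below (not PaddleNLP's actual encode) illustrates why feeding already-converted ids into a text-expecting encode call corrupts the result.

# Toy illustration only: encode expects raw text, so handing it ids that
# were already converted re-tokenizes them as text and produces garbage.
def toy_tokenize(text):
    return str(text).split()

def toy_encode(text, vocab, unk_id=0):
    # tokenize raw text, then map each token to an id
    return [vocab.get(tok, unk_id) for tok in toy_tokenize(text)]

vocab = {"hello": 7, "world": 9}
text = "hello world"
first_ids = [vocab[tok] for tok in toy_tokenize(text)]   # [7, 9]

print(toy_encode(text, vocab))       # [7, 9]   ids derived once from text
print(toy_encode(first_ids, vocab))  # [0, 0]   ids re-tokenized as text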
