Commit 15a074b

Fix tokenizer vocab replace bug (#878)
Fix #875
1 parent d4cf11a commit 15a074b

File tree

1 file changed: +8, -8 lines


paddlenlp/transformers/tokenizer_utils.py

Lines changed: 8 additions & 8 deletions

@@ -1239,15 +1239,15 @@ def get_input_ids(text):
                     if i == len(sub_tokens) - 1:
                         token_offset_mapping.append(
                             (token_start_offset, token_start_offset +
-                             len(sub_tokens[i].strip("##"))))
+                             len(sub_tokens[i].replace("##", ""))))
                         token_start_offset += (
-                            len(sub_tokens[i].strip("##")) + 1)
+                            len(sub_tokens[i].replace("##", "")) + 1)
                     else:
                         token_offset_mapping.append(
                             (token_start_offset, token_start_offset +
-                             len(sub_tokens[i].strip("##"))))
+                             len(sub_tokens[i].replace("##", ""))))
                         token_start_offset += (
-                            len(sub_tokens[i].strip("##")))
+                            len(sub_tokens[i].replace("##", "")))

                 token_start_offset = 0
                 for token in token_pair:
@@ -1261,15 +1261,15 @@ def get_input_ids(text):
                     if i == len(sub_tokens) - 1:
                         token_pair_offset_mapping.append(
                             (token_start_offset, token_start_offset +
-                             len(sub_tokens[i].strip("##"))))
+                             len(sub_tokens[i].replace("##", ""))))
                         token_start_offset += (
-                            len(sub_tokens[i].strip("##")) + 1)
+                            len(sub_tokens[i].replace("##", "")) + 1)
                     else:
                         token_pair_offset_mapping.append(
                             (token_start_offset, token_start_offset +
-                             len(sub_tokens[i].strip("##"))))
+                             len(sub_tokens[i].replace("##", ""))))
                         token_start_offset += (
-                            len(sub_tokens[i].strip("##")))
+                            len(sub_tokens[i].replace("##", "")))

                 offset = 0
                 while offset < len(second_ids):
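
Background note (not part of the commit message): Python's str.strip("##") treats "##" as a set of characters and trims every '#' from both ends of a sub-token, not just the WordPiece "##" continuation prefix, so the offset arithmetic undercounts characters for sub-tokens that legitimately start or end with '#'. str.replace("##", "") removes only the literal "##" marker. A minimal sketch, using a few hypothetical sub-tokens to contrast the two calls:

    # Minimal sketch (not from the repository) of the length difference
    # that drives the offset-mapping bug fixed in this commit.
    for tok in ["##ing", "c#", "###"]:
        print(tok,
              len(tok.strip("##")),        # "c#" -> 1: trailing '#' wrongly dropped
              len(tok.replace("##", "")))  # "c#" -> 2: trailing '#' preserved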
