@@ -1239,15 +1239,15 @@ def get_input_ids(text):
                 if i == len(sub_tokens) - 1:
                     token_offset_mapping.append(
                         (token_start_offset, token_start_offset +
-                         len(sub_tokens[i].strip("##"))))
+                         len(sub_tokens[i].replace("##", " "))))
                     token_start_offset += (
-                        len(sub_tokens[i].strip("##")) + 1)
+                        len(sub_tokens[i].replace("##", " ")) + 1)
                 else:
                     token_offset_mapping.append(
                         (token_start_offset, token_start_offset +
-                         len(sub_tokens[i].strip("##"))))
+                         len(sub_tokens[i].replace("##", " "))))
                     token_start_offset += (
-                        len(sub_tokens[i].strip("##")))
+                        len(sub_tokens[i].replace("##", " ")))

         token_start_offset = 0
         for token in token_pair:
@@ -1261,15 +1261,15 @@ def get_input_ids(text):
                 if i == len(sub_tokens) - 1:
                     token_pair_offset_mapping.append(
                         (token_start_offset, token_start_offset +
-                         len(sub_tokens[i].strip("##"))))
+                         len(sub_tokens[i].replace("##", " "))))
                     token_start_offset += (
-                        len(sub_tokens[i].strip("##")) + 1)
+                        len(sub_tokens[i].replace("##", " ")) + 1)
                 else:
                     token_pair_offset_mapping.append(
                         (token_start_offset, token_start_offset +
-                         len(sub_tokens[i].strip("##"))))
+                         len(sub_tokens[i].replace("##", " "))))
                     token_start_offset += (
-                        len(sub_tokens[i].strip("##")))
+                        len(sub_tokens[i].replace("##", " ")))

         offset = 0
         while offset < len(second_ids):
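Both hunks make the same change when computing sub-token lengths for the offset mappings: `str.strip("##")` is swapped for `str.replace("##", " ")`. The difference matters because `strip("##")` removes any run of leading or trailing `#` characters (not the literal `##` WordPiece prefix), so tokens that legitimately contain `#` lose characters, while `replace("##", " ")` only rewrites the exact `##` sequence and leaves the string one character longer than outright removal for genuine continuation pieces. A minimal sketch of the behavioral difference (the token strings below are hypothetical WordPiece output, not taken from this PR):

```python
# Hypothetical WordPiece sub-tokens; "#" and "c#" stand in for tokens
# that legitimately contain literal '#' characters.
sub_tokens = ["play", "##ing", "#", "c#"]

for tok in sub_tokens:
    stripped = tok.strip("##")         # old behavior: strips '#' chars from both ends
    replaced = tok.replace("##", " ")  # new behavior: rewrites only the literal "##"
    print(f"{tok!r:8} strip -> {stripped!r:8} replace -> {replaced!r}")

# 'play'   strip -> 'play'    replace -> 'play'
# '##ing'  strip -> 'ing'     replace -> ' ing'
# '#'      strip -> ''        replace -> '#'
# 'c#'     strip -> 'c'       replace -> 'c#'
```

The lengths fed into `token_offset_mapping` / `token_pair_offset_mapping` therefore stay intact for tokens containing literal `#`, and for true `##` continuation pieces they come out one character larger than the stripped variant.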