File tree Expand file tree Collapse file tree 1 file changed +8
-1
lines changed Expand file tree Collapse file tree 1 file changed +8
-1
lines changed Original file line number Diff line number Diff line change @@ -1363,7 +1363,14 @@ def get_offset_mapping(self, text):
1363
1363
if token in self .all_special_tokens :
1364
1364
token = token .lower () if hasattr (
1365
1365
self , "do_lower_case" ) and self .do_lower_case else token
1366
- start = text [offset :].index (token ) + offset
1366
+ # The Greek letter sigma has two lowercase forms: σ and ς.
1367
+ # The final form (ς) is used at the end of a word and σ everywhere else, so both are normalized to σ before searching.
1368
+ # https://latin.stackexchange.com/questions/6168/how-and-when-did-we-get-two-forms-of-sigma
1369
+ if "σ" in token or "ς" in token :
1370
+ start = text [offset :].replace ("ς" , "σ" ).index (
1371
+ token .replace ("ς" , "σ" )) + offset
1372
+ else :
1373
+ start = text [offset :].index (token ) + offset
1367
1374
1368
1375
end = start + len (token )
1369
1376
You can’t perform that action at this time.
0 commit comments