Skip to content

Commit ba6a8ad

Browse files
jirkamarsik authored and otethal committed
Pass PythonLocale to TRegex when compiling locale-sensitive regexps
1 parent 074f3ae commit ba6a8ad

File tree

1 file changed

+50
-4
lines changed

1 file changed

+50
-4
lines changed

graalpython/lib-graalpython/_sre.py

Lines changed: 50 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,15 @@ def _normalize_bounds(string, pos, endpos):
6767
def _is_bytes_like(object):
    """Return True if *object* is an instance of one of the bytes-like types
    recognized by this module: bytes, bytearray, memoryview, array or mmap."""
    bytes_like_types = (bytes, bytearray, memoryview, array, mmap)
    return isinstance(object, bytes_like_types)
6969

70+
def _getlocale():
    """Return the current locale as a single string built from `locale.getlocale()`.

    The result is:

    * ``'C'`` when neither a language code nor an encoding is set,
    * ``'<lang>.<encoding>'`` when an encoding is set, with ``'en_US'`` substituted
      for a missing language code,
    * ``'<lang>'`` alone when a language code is set but the encoding is not.
    """
    from locale import getlocale
    lang, encoding = getlocale()
    # The original tested an undefined name (`charset`) here, which raised NameError.
    if lang is None and encoding is None:
        return 'C'
    if lang is None:
        lang = 'en_US'
    if encoding is None:
        # str.join() raises TypeError on a None item, so fall back to the bare language code.
        # NOTE(review): confirm the consumer of this string accepts a locale without
        # an encoding part.
        return lang
    return '.'.join((lang, encoding))
78+
7079
def _new_compile(p, flags=0):
7180
if _with_tregex and isinstance(p, (str, bytes, bytearray, memoryview, array, mmap)):
7281
return _t_compile(p, flags)
@@ -237,6 +246,7 @@ def __init__(self, pattern, flags):
237246
self.__binary = _is_bytes_like(pattern)
238247
self.pattern = pattern
239248
self.__input_flags = flags
249+
self.__locale_sensitive = self.__is_locale_sensitive(pattern, flags)
240250
flags_str = []
241251
for char, flag in FLAGS.items():
242252
if flags & flag:
@@ -290,11 +300,19 @@ def __check_input_type(self, input):
290300
raise TypeError("cannot use a bytes pattern on a string-like object")
291301

292302
def __tregex_compile(self, method="search", must_advance=False):
293-
if (method, must_advance) not in self.__compiled_regexes:
303+
if self.__locale_sensitive:
304+
key = (method, must_advance, _getlocale())
305+
else:
306+
key = (method, must_advance)
307+
if key not in self.__compiled_regexes:
294308
try:
295-
extra_options = f"PythonMethod={method},MustAdvance={'true' if must_advance else 'false'}"
309+
if self.__locale_sensitive:
310+
locale_option = ",PythonLocale=" + key[2]
311+
else:
312+
locale_option = ""
313+
extra_options = f"PythonMethod={method},MustAdvance={'true' if must_advance else 'false'}{locale_option}"
296314
compiled_regex = tregex_compile_internal(self.pattern, self.__flags_str, extra_options)
297-
self.__compiled_regexes[(method, must_advance)] = compiled_regex
315+
self.__compiled_regexes[key] = compiled_regex
298316
except ValueError as e:
299317
if len(e.args) == 2:
300318
msg = e.args[0]
@@ -307,7 +325,35 @@ def __tregex_compile(self, method="search", must_advance=False):
307325
raise ValueError(msg) from None
308326
raise error(msg, self.pattern, e.args[1]) from None
309327
raise
310-
return self.__compiled_regexes[(method, must_advance)]
328+
return self.__compiled_regexes[key]
329+
330+
def __is_locale_sensitive(self, pattern, flags):
    """Tests whether the regex is locale-sensitive. It is not completely precise. In some
    instances, it will return `True` even though the regex is *not* locale-sensitive. This is
    the case when sequences resembling inline flags appear in character classes or comments.

    A pattern is reported as locale-sensitive when it is bytes-like and either `flags`
    contains `FLAG_LOCALE` or an unescaped ``(?`` is directly followed by a run of
    inline-flag letters (``aiLmsux``) that contains ``L``. Non-bytes-like patterns always
    yield `False`.
    """
    if not _is_bytes_like(pattern):
        return False
    if flags & FLAG_LOCALE != 0:
        return True
    # `_is_bytes_like` also accepts memoryview, array and mmap objects, which have no
    # `.decode()` method; copy through `bytes()` first so every accepted type can be scanned.
    # LATIN-1 maps each byte to exactly one character, so positions are preserved.
    pattern = bytes(pattern).decode(encoding='LATIN-1')
    position = 0
    while position < len(pattern):
        position = pattern.find('(?', position)
        if position == -1:
            break
        # Walk left over the run of backslashes immediately preceding the '('.
        backslash_position = position - 1
        while backslash_position >= 0 and pattern[backslash_position] == '\\':
            backslash_position = backslash_position - 1
        # jump over '(?'
        position = position + 2
        # `position - backslash_position` equals 3 + (number of backslashes), so an even
        # difference means the count of backslashes is odd.
        if (position - backslash_position) % 2 == 0:
            # found odd number of backslashes, the parentheses is a literal
            continue
        # Scan the inline-flag letters that directly follow '(?'.
        while position < len(pattern) and pattern[position] in 'aiLmsux':
            if pattern[position] == 'L':
                return True
            position = position + 1
    return False
311357

312358
def __fallback_compile(self):
313359
if self.__compiled_fallback is None:

0 commit comments

Comments
 (0)