1
- # Copyright (c) 2018, 2022 , Oracle and/or its affiliates. All rights reserved.
1
+ # Copyright (c) 2018, 2023 , Oracle and/or its affiliates. All rights reserved.
2
2
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
3
3
#
4
4
# The Universal Permissive License (UPL), Version 1.0
@@ -67,6 +67,15 @@ def _normalize_bounds(string, pos, endpos):
67
67
def _is_bytes_like(object):
    """Return True when *object* is one of the binary buffer types accepted as input.

    The accepted types are bytes, bytearray, memoryview, array and mmap.
    """
    accepted_types = (bytes, bytearray, memoryview, array, mmap)
    return isinstance(object, accepted_types)
69
69
70
def _getlocale():
    """Return the current locale as a single string.

    Returns:
        'C' when `locale.getlocale()` reports neither a language nor an encoding;
        otherwise '<lang>.<encoding>', with the language defaulting to 'en_US' when
        only the encoding is known, and just '<lang>' when only the language is known.
    """
    from locale import getlocale
    lang, encoding = getlocale()
    # The original tested an undefined name (`charset`) here, which raised NameError
    # whenever `lang` was None.
    if lang is None and encoding is None:
        return 'C'
    if lang is None:
        lang = 'en_US'
    if encoding is None:
        # str.join would raise TypeError on a None component.
        return lang
    return '.'.join((lang, encoding))
78
+
70
79
def _new_compile (p , flags = 0 ):
71
80
if _with_tregex and isinstance (p , (str , bytes , bytearray , memoryview , array , mmap )):
72
81
return _t_compile (p , flags )
@@ -237,6 +246,7 @@ def __init__(self, pattern, flags):
237
246
self .__binary = _is_bytes_like (pattern )
238
247
self .pattern = pattern
239
248
self .__input_flags = flags
249
+ self .__locale_sensitive = self .__is_locale_sensitive (pattern , flags )
240
250
flags_str = []
241
251
for char , flag in FLAGS .items ():
242
252
if flags & flag :
@@ -290,11 +300,19 @@ def __check_input_type(self, input):
290
300
raise TypeError ("cannot use a bytes pattern on a string-like object" )
291
301
292
302
def __tregex_compile (self , method = "search" , must_advance = False ):
293
- if (method , must_advance ) not in self .__compiled_regexes :
303
+ if self .__locale_sensitive :
304
+ key = (method , must_advance , _getlocale ())
305
+ else :
306
+ key = (method , must_advance )
307
+ if key not in self .__compiled_regexes :
294
308
try :
295
- extra_options = f"PythonMethod={ method } ,MustAdvance={ 'true' if must_advance else 'false' } "
309
+ if self .__locale_sensitive :
310
+ locale_option = ",PythonLocale=" + key [2 ]
311
+ else :
312
+ locale_option = ""
313
+ extra_options = f"PythonMethod={ method } ,MustAdvance={ 'true' if must_advance else 'false' } { locale_option } "
296
314
compiled_regex = tregex_compile_internal (self .pattern , self .__flags_str , extra_options )
297
- self .__compiled_regexes [( method , must_advance ) ] = compiled_regex
315
+ self .__compiled_regexes [key ] = compiled_regex
298
316
except ValueError as e :
299
317
if len (e .args ) == 2 :
300
318
msg = e .args [0 ]
@@ -307,7 +325,35 @@ def __tregex_compile(self, method="search", must_advance=False):
307
325
raise ValueError (msg ) from None
308
326
raise error (msg , self .pattern , e .args [1 ]) from None
309
327
raise
310
- return self .__compiled_regexes [(method , must_advance )]
328
+ return self .__compiled_regexes [key ]
329
+
330
def __is_locale_sensitive(self, pattern, flags):
    """Tests whether the regex is locale-sensitive. It is not completely precise. In some
    instances, it will return `True` even though the regex is *not* locale-sensitive. This is
    the case when sequences resembling inline flags appear in character classes or comments.

    A regex is considered locale-sensitive when it is a bytes-like pattern and either
    `FLAG_LOCALE` is set in `flags` or an unescaped inline flag group such as `(?L)`
    contains the `L` flag. String patterns always yield `False`.
    """
    if not _is_bytes_like(pattern):
        return False
    if flags & FLAG_LOCALE != 0:
        return True
    # `_is_bytes_like` also admits memoryview, array and mmap objects, which have no
    # `.decode` method; going through `bytes` handles every accepted buffer type.
    pattern = bytes(pattern).decode(encoding='LATIN-1')
    position = 0
    while position < len(pattern):
        position = pattern.find('(?', position)
        if position == -1:
            break
        # Walk left over the run of backslashes immediately preceding the '('.
        backslash_position = position - 1
        while backslash_position >= 0 and pattern[backslash_position] == '\\':
            backslash_position = backslash_position - 1
        # jump over '(?'
        position = position + 2
        # position - backslash_position == 3 + <number of backslashes>, so an even
        # difference means an odd number of backslashes.
        if (position - backslash_position) % 2 == 0:
            # found odd number of backslashes, the parenthesis is a literal
            continue
        # Scan the inline flag letters that follow '(?'.
        while position < len(pattern) and pattern[position] in 'aiLmsux':
            if pattern[position] == 'L':
                return True
            position = position + 1
    return False
311
357
312
358
def __fallback_compile (self ):
313
359
if self .__compiled_fallback is None :
0 commit comments