Skip to content

Commit eb27301

Browse files
committed
use sre_*.py from 3.6.5 so they agree with our sre C module
1 parent 1abbbbb commit eb27301

File tree

3 files changed

+206
-143
lines changed

3 files changed

+206
-143
lines changed

graalpython/lib-python/3/sre_compile.py

Lines changed: 84 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,8 @@ def _compile(code, pattern, flags):
7171
ASSERT_CODES = _ASSERT_CODES
7272
if (flags & SRE_FLAG_IGNORECASE and
7373
not (flags & SRE_FLAG_LOCALE) and
74-
flags & SRE_FLAG_UNICODE):
74+
flags & SRE_FLAG_UNICODE and
75+
not (flags & SRE_FLAG_ASCII)):
7576
fixes = _ignorecase_fixes
7677
else:
7778
fixes = None
@@ -137,14 +138,15 @@ def fixup(literal, flags=flags):
137138
else:
138139
emit(MIN_UNTIL)
139140
elif op is SUBPATTERN:
140-
if av[0]:
141+
group, add_flags, del_flags, p = av
142+
if group:
141143
emit(MARK)
142-
emit((av[0]-1)*2)
143-
# _compile_info(code, av[1], flags)
144-
_compile(code, av[1], flags)
145-
if av[0]:
144+
emit((group-1)*2)
145+
# _compile_info(code, p, (flags | add_flags) & ~del_flags)
146+
_compile(code, p, (flags | add_flags) & ~del_flags)
147+
if group:
146148
emit(MARK)
147-
emit((av[0]-1)*2+1)
149+
emit((group-1)*2+1)
148150
elif op in SUCCESS_CODES:
149151
emit(op)
150152
elif op in ASSERT_CODES:
@@ -172,7 +174,7 @@ def fixup(literal, flags=flags):
172174
av = AT_MULTILINE.get(av, av)
173175
if flags & SRE_FLAG_LOCALE:
174176
av = AT_LOCALE.get(av, av)
175-
elif flags & SRE_FLAG_UNICODE:
177+
elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
176178
av = AT_UNICODE.get(av, av)
177179
emit(av)
178180
elif op is BRANCH:
@@ -193,7 +195,7 @@ def fixup(literal, flags=flags):
193195
emit(op)
194196
if flags & SRE_FLAG_LOCALE:
195197
av = CH_LOCALE[av]
196-
elif flags & SRE_FLAG_UNICODE:
198+
elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
197199
av = CH_UNICODE[av]
198200
emit(av)
199201
elif op is GROUPREF:
@@ -220,8 +222,6 @@ def fixup(literal, flags=flags):
220222

221223
def _compile_charset(charset, flags, code, fixup=None, fixes=None):
222224
# compile charset subprogram
223-
# TODO: Truffle revert-me
224-
return None
225225
emit = code.append
226226
for op, av in _optimize_charset(charset, fixup, fixes):
227227
emit(op)
@@ -239,7 +239,7 @@ def _compile_charset(charset, flags, code, fixup=None, fixes=None):
239239
elif op is CATEGORY:
240240
if flags & SRE_FLAG_LOCALE:
241241
emit(CH_LOCALE[av])
242-
elif flags & SRE_FLAG_UNICODE:
242+
elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
243243
emit(CH_UNICODE[av])
244244
else:
245245
emit(av)
@@ -380,11 +380,7 @@ def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
380380

381381
def _bytes_to_codes(b):
382382
# Convert block indices to word array
383-
if _sre.CODESIZE == 2:
384-
code = 'H'
385-
else:
386-
code = 'I'
387-
a = memoryview(b).cast(code)
383+
a = memoryview(b).cast('I')
388384
assert a.itemsize == _sre.CODESIZE
389385
assert len(a) * a.itemsize == len(b)
390386
return a.tolist()
@@ -415,42 +411,42 @@ def _generate_overlap_table(prefix):
415411
table[i] = idx + 1
416412
return table
417413

418-
def _compile_info(code, pattern, flags):
419-
# internal: compile an info block. in the current version,
420-
# this contains min/max pattern width, and an optional literal
421-
# prefix or a character map
422-
lo, hi = pattern.getwidth()
423-
if hi > MAXCODE:
424-
hi = MAXCODE
425-
if lo == 0:
426-
code.extend([INFO, 4, 0, lo, hi])
427-
return
428-
# look for a literal prefix
414+
def _get_literal_prefix(pattern):
415+
# look for literal prefix
429416
prefix = []
430417
prefixappend = prefix.append
431-
prefix_skip = 0
418+
prefix_skip = None
419+
for op, av in pattern.data:
420+
if op is LITERAL:
421+
prefixappend(av)
422+
elif op is SUBPATTERN:
423+
group, add_flags, del_flags, p = av
424+
if add_flags & SRE_FLAG_IGNORECASE:
425+
break
426+
prefix1, prefix_skip1, got_all = _get_literal_prefix(p)
427+
if prefix_skip is None:
428+
if group is not None:
429+
prefix_skip = len(prefix)
430+
elif prefix_skip1 is not None:
431+
prefix_skip = len(prefix) + prefix_skip1
432+
prefix.extend(prefix1)
433+
if not got_all:
434+
break
435+
else:
436+
break
437+
else:
438+
return prefix, prefix_skip, True
439+
return prefix, prefix_skip, False
440+
441+
def _get_charset_prefix(pattern):
432442
charset = [] # not used
433443
charsetappend = charset.append
434-
if not (flags & SRE_FLAG_IGNORECASE):
435-
# look for literal prefix
436-
for op, av in pattern.data:
437-
if op is LITERAL:
438-
if len(prefix) == prefix_skip:
439-
prefix_skip = prefix_skip + 1
440-
prefixappend(av)
441-
elif op is SUBPATTERN and len(av[1]) == 1:
442-
op, av = av[1][0]
443-
if op is LITERAL:
444-
prefixappend(av)
445-
else:
446-
break
447-
else:
448-
break
449-
# if no prefix, look for charset prefix
450-
if not prefix and pattern.data:
451-
op, av = pattern.data[0]
452-
if op is SUBPATTERN and av[1]:
453-
op, av = av[1][0]
444+
if pattern.data:
445+
op, av = pattern.data[0]
446+
if op is SUBPATTERN:
447+
group, add_flags, del_flags, p = av
448+
if p and not (add_flags & SRE_FLAG_IGNORECASE):
449+
op, av = p[0]
454450
if op is LITERAL:
455451
charsetappend((op, av))
456452
elif op is BRANCH:
@@ -466,21 +462,43 @@ def _compile_info(code, pattern, flags):
466462
break
467463
else:
468464
charset = c
469-
elif op is BRANCH:
470-
c = []
471-
cappend = c.append
472-
for p in av[1]:
473-
if not p:
474-
break
475-
op, av = p[0]
476-
if op is LITERAL:
477-
cappend((op, av))
478-
else:
479-
break
465+
elif op is BRANCH:
466+
c = []
467+
cappend = c.append
468+
for p in av[1]:
469+
if not p:
470+
break
471+
op, av = p[0]
472+
if op is LITERAL:
473+
cappend((op, av))
480474
else:
481-
charset = c
482-
elif op is IN:
483-
charset = av
475+
break
476+
else:
477+
charset = c
478+
elif op is IN:
479+
charset = av
480+
return charset
481+
482+
def _compile_info(code, pattern, flags):
483+
# internal: compile an info block. in the current version,
484+
# this contains min/max pattern width, and an optional literal
485+
# prefix or a character map
486+
lo, hi = pattern.getwidth()
487+
if hi > MAXCODE:
488+
hi = MAXCODE
489+
if lo == 0:
490+
code.extend([INFO, 4, 0, lo, hi])
491+
return
492+
# look for a literal prefix
493+
prefix = []
494+
prefix_skip = 0
495+
charset = [] # not used
496+
if not (flags & SRE_FLAG_IGNORECASE):
497+
# look for literal prefix
498+
prefix, prefix_skip, got_all = _get_literal_prefix(pattern)
499+
# if no prefix, look for charset prefix
500+
if not prefix:
501+
charset = _get_charset_prefix(pattern)
484502
## if prefix:
485503
## print("*** PREFIX", prefix, prefix_skip)
486504
## if charset:
@@ -493,7 +511,7 @@ def _compile_info(code, pattern, flags):
493511
mask = 0
494512
if prefix:
495513
mask = SRE_INFO_PREFIX
496-
if len(prefix) == prefix_skip == len(pattern.data):
514+
if prefix_skip is None and got_all:
497515
mask = mask | SRE_INFO_LITERAL
498516
elif charset:
499517
mask = mask | SRE_INFO_CHARSET
@@ -508,6 +526,8 @@ def _compile_info(code, pattern, flags):
508526
# add literal prefix
509527
if prefix:
510528
emit(len(prefix)) # length
529+
if prefix_skip is None:
530+
prefix_skip = len(prefix)
511531
emit(prefix_skip) # skip
512532
code.extend(prefix)
513533
# generate overlap table

graalpython/lib-python/3/sre_constants.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,17 @@
2121
# should this really be here?
2222

2323
class error(Exception):
24+
"""Exception raised for invalid regular expressions.
25+
26+
Attributes:
27+
28+
msg: The unformatted error message
29+
pattern: The regular expression pattern
30+
pos: The index in the pattern where compilation failed (may be None)
31+
lineno: The line corresponding to pos (may be None)
32+
colno: The column corresponding to pos (may be None)
33+
"""
34+
2435
def __init__(self, msg, pattern=None, pos=None):
2536
self.msg = msg
2637
self.pattern = pattern

0 commit comments

Comments
 (0)