@@ -71,7 +71,8 @@ def _compile(code, pattern, flags):
71
71
ASSERT_CODES = _ASSERT_CODES
72
72
if (flags & SRE_FLAG_IGNORECASE and
73
73
not (flags & SRE_FLAG_LOCALE ) and
74
- flags & SRE_FLAG_UNICODE ):
74
+ flags & SRE_FLAG_UNICODE and
75
+ not (flags & SRE_FLAG_ASCII )):
75
76
fixes = _ignorecase_fixes
76
77
else :
77
78
fixes = None
@@ -137,14 +138,15 @@ def fixup(literal, flags=flags):
137
138
else :
138
139
emit (MIN_UNTIL )
139
140
elif op is SUBPATTERN :
140
- if av [0 ]:
141
+ group , add_flags , del_flags , p = av
142
+ if group :
141
143
emit (MARK )
142
- emit ((av [ 0 ] - 1 )* 2 )
143
- # _compile_info(code, av[1], flags)
144
- _compile (code , av [ 1 ], flags )
145
- if av [ 0 ] :
144
+ emit ((group - 1 )* 2 )
145
+ # _compile_info(code, p, ( flags | add_flags) & ~del_flags )
146
+ _compile (code , p , ( flags | add_flags ) & ~ del_flags )
147
+ if group :
146
148
emit (MARK )
147
- emit ((av [ 0 ] - 1 )* 2 + 1 )
149
+ emit ((group - 1 )* 2 + 1 )
148
150
elif op in SUCCESS_CODES :
149
151
emit (op )
150
152
elif op in ASSERT_CODES :
@@ -172,7 +174,7 @@ def fixup(literal, flags=flags):
172
174
av = AT_MULTILINE .get (av , av )
173
175
if flags & SRE_FLAG_LOCALE :
174
176
av = AT_LOCALE .get (av , av )
175
- elif flags & SRE_FLAG_UNICODE :
177
+ elif ( flags & SRE_FLAG_UNICODE ) and not ( flags & SRE_FLAG_ASCII ) :
176
178
av = AT_UNICODE .get (av , av )
177
179
emit (av )
178
180
elif op is BRANCH :
@@ -193,7 +195,7 @@ def fixup(literal, flags=flags):
193
195
emit (op )
194
196
if flags & SRE_FLAG_LOCALE :
195
197
av = CH_LOCALE [av ]
196
- elif flags & SRE_FLAG_UNICODE :
198
+ elif ( flags & SRE_FLAG_UNICODE ) and not ( flags & SRE_FLAG_ASCII ) :
197
199
av = CH_UNICODE [av ]
198
200
emit (av )
199
201
elif op is GROUPREF :
@@ -220,8 +222,6 @@ def fixup(literal, flags=flags):
220
222
221
223
def _compile_charset (charset , flags , code , fixup = None , fixes = None ):
222
224
# compile charset subprogram
223
- # TODO: Truffle revert-me
224
- return None
225
225
emit = code .append
226
226
for op , av in _optimize_charset (charset , fixup , fixes ):
227
227
emit (op )
@@ -239,7 +239,7 @@ def _compile_charset(charset, flags, code, fixup=None, fixes=None):
239
239
elif op is CATEGORY :
240
240
if flags & SRE_FLAG_LOCALE :
241
241
emit (CH_LOCALE [av ])
242
- elif flags & SRE_FLAG_UNICODE :
242
+ elif ( flags & SRE_FLAG_UNICODE ) and not ( flags & SRE_FLAG_ASCII ) :
243
243
emit (CH_UNICODE [av ])
244
244
else :
245
245
emit (av )
@@ -380,11 +380,7 @@ def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
380
380
381
381
def _bytes_to_codes (b ):
382
382
# Convert block indices to word array
383
- if _sre .CODESIZE == 2 :
384
- code = 'H'
385
- else :
386
- code = 'I'
387
- a = memoryview (b ).cast (code )
383
+ a = memoryview (b ).cast ('I' )
388
384
assert a .itemsize == _sre .CODESIZE
389
385
assert len (a ) * a .itemsize == len (b )
390
386
return a .tolist ()
@@ -415,42 +411,42 @@ def _generate_overlap_table(prefix):
415
411
table [i ] = idx + 1
416
412
return table
417
413
418
- def _compile_info (code , pattern , flags ):
419
- # internal: compile an info block. in the current version,
420
- # this contains min/max pattern width, and an optional literal
421
- # prefix or a character map
422
- lo , hi = pattern .getwidth ()
423
- if hi > MAXCODE :
424
- hi = MAXCODE
425
- if lo == 0 :
426
- code .extend ([INFO , 4 , 0 , lo , hi ])
427
- return
428
- # look for a literal prefix
414
+ def _get_literal_prefix (pattern ):
415
+ # look for literal prefix
429
416
prefix = []
430
417
prefixappend = prefix .append
431
- prefix_skip = 0
418
+ prefix_skip = None
419
+ for op , av in pattern .data :
420
+ if op is LITERAL :
421
+ prefixappend (av )
422
+ elif op is SUBPATTERN :
423
+ group , add_flags , del_flags , p = av
424
+ if add_flags & SRE_FLAG_IGNORECASE :
425
+ break
426
+ prefix1 , prefix_skip1 , got_all = _get_literal_prefix (p )
427
+ if prefix_skip is None :
428
+ if group is not None :
429
+ prefix_skip = len (prefix )
430
+ elif prefix_skip1 is not None :
431
+ prefix_skip = len (prefix ) + prefix_skip1
432
+ prefix .extend (prefix1 )
433
+ if not got_all :
434
+ break
435
+ else :
436
+ break
437
+ else :
438
+ return prefix , prefix_skip , True
439
+ return prefix , prefix_skip , False
440
+
441
+ def _get_charset_prefix (pattern ):
432
442
charset = [] # not used
433
443
charsetappend = charset .append
434
- if not (flags & SRE_FLAG_IGNORECASE ):
435
- # look for literal prefix
436
- for op , av in pattern .data :
437
- if op is LITERAL :
438
- if len (prefix ) == prefix_skip :
439
- prefix_skip = prefix_skip + 1
440
- prefixappend (av )
441
- elif op is SUBPATTERN and len (av [1 ]) == 1 :
442
- op , av = av [1 ][0 ]
443
- if op is LITERAL :
444
- prefixappend (av )
445
- else :
446
- break
447
- else :
448
- break
449
- # if no prefix, look for charset prefix
450
- if not prefix and pattern .data :
451
- op , av = pattern .data [0 ]
452
- if op is SUBPATTERN and av [1 ]:
453
- op , av = av [1 ][0 ]
444
+ if pattern .data :
445
+ op , av = pattern .data [0 ]
446
+ if op is SUBPATTERN :
447
+ group , add_flags , del_flags , p = av
448
+ if p and not (add_flags & SRE_FLAG_IGNORECASE ):
449
+ op , av = p [0 ]
454
450
if op is LITERAL :
455
451
charsetappend ((op , av ))
456
452
elif op is BRANCH :
@@ -466,21 +462,43 @@ def _compile_info(code, pattern, flags):
466
462
break
467
463
else :
468
464
charset = c
469
- elif op is BRANCH :
470
- c = []
471
- cappend = c .append
472
- for p in av [1 ]:
473
- if not p :
474
- break
475
- op , av = p [0 ]
476
- if op is LITERAL :
477
- cappend ((op , av ))
478
- else :
479
- break
465
+ elif op is BRANCH :
466
+ c = []
467
+ cappend = c .append
468
+ for p in av [1 ]:
469
+ if not p :
470
+ break
471
+ op , av = p [0 ]
472
+ if op is LITERAL :
473
+ cappend ((op , av ))
480
474
else :
481
- charset = c
482
- elif op is IN :
483
- charset = av
475
+ break
476
+ else :
477
+ charset = c
478
+ elif op is IN :
479
+ charset = av
480
+ return charset
481
+
482
+ def _compile_info (code , pattern , flags ):
483
+ # internal: compile an info block. in the current version,
484
+ # this contains min/max pattern width, and an optional literal
485
+ # prefix or a character map
486
+ lo , hi = pattern .getwidth ()
487
+ if hi > MAXCODE :
488
+ hi = MAXCODE
489
+ if lo == 0 :
490
+ code .extend ([INFO , 4 , 0 , lo , hi ])
491
+ return
492
+ # look for a literal prefix
493
+ prefix = []
494
+ prefix_skip = 0
495
+ charset = [] # not used
496
+ if not (flags & SRE_FLAG_IGNORECASE ):
497
+ # look for literal prefix
498
+ prefix , prefix_skip , got_all = _get_literal_prefix (pattern )
499
+ # if no prefix, look for charset prefix
500
+ if not prefix :
501
+ charset = _get_charset_prefix (pattern )
484
502
## if prefix:
485
503
## print("*** PREFIX", prefix, prefix_skip)
486
504
## if charset:
@@ -493,7 +511,7 @@ def _compile_info(code, pattern, flags):
493
511
mask = 0
494
512
if prefix :
495
513
mask = SRE_INFO_PREFIX
496
- if len ( prefix ) == prefix_skip == len ( pattern . data ) :
514
+ if prefix_skip is None and got_all :
497
515
mask = mask | SRE_INFO_LITERAL
498
516
elif charset :
499
517
mask = mask | SRE_INFO_CHARSET
@@ -508,6 +526,8 @@ def _compile_info(code, pattern, flags):
508
526
# add literal prefix
509
527
if prefix :
510
528
emit (len (prefix )) # length
529
+ if prefix_skip is None :
530
+ prefix_skip = len (prefix )
511
531
emit (prefix_skip ) # skip
512
532
code .extend (prefix )
513
533
# generate overlap table
0 commit comments