Skip to content

Commit 3284ccc

Browse files
authored
Support small offsets in capture group list (#563)
1 parent e0d4eee commit 3284ccc

File tree

5 files changed

+125
-61
lines changed

5 files changed

+125
-61
lines changed

HACKING

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -768,9 +768,15 @@ In ASCII or UTF-32 mode, the character counts in OP_REVERSE and OP_VREVERSE are
768768
also the number of code units, but in UTF-8/16 mode each character may occupy
769769
more than one code unit.
770770

771-
The "scan substring" assertion compiles as OP_ASSERT_SCS. What follows takes
772-
the same form as a conditional subpattern with a back reference condition (see
773-
next section).
771+
The "scan substring" assertion compiles as OP_ASSERT_SCS. This opcode is
772+
followed by a list of arguments. Each argument is either an OP_CREF or
773+
OP_DNCREF byte code sequence. The details of these sequences are described
774+
in the next section.
775+
776+
For example, (*scs:(1,'NAME')...PATTERN...) is translated to:
777+
[OP_ASSERT_SCS] [OP_CREF] [OP_CREF] ...PATTERN... [OP_KET]
778+
779+
If 'NAME' is a duplicated name, the second [OP_CREF] is [OP_DNCREF] instead.
774780

775781

776782
Conditional subpatterns

src/pcre2_compile.c

Lines changed: 73 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -214,10 +214,10 @@ static unsigned char meta_extra_lengths[] = {
214214
1+SIZEOFFSET, /* META_COND_RNAME */
215215
1+SIZEOFFSET, /* META_COND_RNUMBER */
216216
3, /* META_COND_VERSION */
217-
1+SIZEOFFSET, /* META_SCS_NAME */
218-
1+SIZEOFFSET, /* META_SCS_NUMBER */
219-
1+SIZEOFFSET, /* META_SCS_NEXT_NAME */
220-
1+SIZEOFFSET, /* META_SCS_NEXT_NUMBER */
217+
SIZEOFFSET, /* META_OFFSET */
218+
0, /* META_SCS */
219+
1, /* META_SCS_NAME */
220+
1, /* META_SCS_NUMBER */
221221
0, /* META_DOLLAR */
222222
0, /* META_DOT */
223223
0, /* META_ESCAPE - one more for ESC_P and ESC_p */
@@ -1024,30 +1024,22 @@ for (;;)
10241024
fprintf(stderr, "%zd", offset);
10251025
break;
10261026

1027-
case META_SCS_NAME:
1028-
fprintf(stderr, "META (*scan_substring:(<name>) length=%d offset=", *pptr++);
1027+
case META_OFFSET:
1028+
fprintf(stderr, "META_OFFSET offset=");
10291029
GETOFFSET(offset, pptr);
10301030
fprintf(stderr, "%zd", offset);
10311031
break;
10321032

1033-
case META_SCS_NUMBER:
1034-
fprintf(stderr, "META_SCS_NUMBER %d offset=", pptr[SIZEOFFSET]);
1035-
GETOFFSET(offset, pptr);
1036-
fprintf(stderr, "%zd", offset);
1037-
pptr++;
1033+
case META_SCS:
1034+
fprintf(stderr, "META (*scan_substring:");
10381035
break;
10391036

1040-
case META_SCS_NEXT_NAME:
1041-
fprintf(stderr, "META_SCS_NEXT_NAME length=%d offset=", *pptr++);
1042-
GETOFFSET(offset, pptr);
1043-
fprintf(stderr, "%zd", offset);
1037+
case META_SCS_NAME:
1038+
fprintf(stderr, "META_SCS_NAME length=%d relative_offset=%d", *pptr++, (int)meta_arg);
10441039
break;
10451040

1046-
case META_SCS_NEXT_NUMBER:
1047-
fprintf(stderr, "META_SCS_NEXT_NUMBER %d offset=", pptr[SIZEOFFSET]);
1048-
GETOFFSET(offset, pptr);
1049-
fprintf(stderr, "%zd", offset);
1050-
pptr++;
1041+
case META_SCS_NUMBER:
1042+
fprintf(stderr, "META_SCS_NUMBER %d relative_offset=%d", *pptr++, (int)meta_arg);
10511043
break;
10521044

10531045
case META_MARK:
@@ -4504,11 +4496,14 @@ while (ptr < ptrend)
45044496
}
45054497

45064498
ptr++;
4499+
*parsed_pattern++ = META_SCS;
45074500
/* Temporary variable, zero in the first iteration. */
4508-
meta = 0;
4501+
offset = 0;
45094502

45104503
for (;;)
45114504
{
4505+
PCRE2_SIZE next_offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4506+
45124507
/* Handle (scan_substring:([+-]number)... */
45134508
if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61,
45144509
&i, &errorcode))
@@ -4519,10 +4514,8 @@ while (ptr < ptrend)
45194514
errorcode = ERR15;
45204515
goto FAILED;
45214516
}
4522-
*parsed_pattern++ = meta ? META_SCS_NEXT_NUMBER : META_SCS_NUMBER;
4523-
offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4524-
PUTOFFSET(offset, parsed_pattern);
4525-
*parsed_pattern++ = i;
4517+
meta = META_SCS_NUMBER;
4518+
namelen = (uint32_t)i;
45264519
}
45274520
else if (errorcode != 0) goto FAILED; /* Number too big */
45284521
else
@@ -4540,14 +4533,28 @@ while (ptr < ptrend)
45404533
goto FAILED;
45414534
}
45424535

4543-
if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name,
4544-
&namelen, &errorcode, cb)) goto FAILED;
4536+
if (!read_name(&ptr, ptrend, utf, terminator, &next_offset,
4537+
&name, &namelen, &errorcode, cb)) goto FAILED;
45454538

4546-
*parsed_pattern++ = meta ? META_SCS_NEXT_NAME : META_SCS_NAME;
4547-
*parsed_pattern++ = namelen;
4548-
PUTOFFSET(offset, parsed_pattern);
4539+
meta = META_SCS_NAME;
4540+
}
4541+
4542+
PCRE2_ASSERT(next_offset > 0);
4543+
if (offset == 0 || (next_offset - offset) >= 0x10000)
4544+
{
4545+
*parsed_pattern++ = META_OFFSET;
4546+
PUTOFFSET(next_offset, parsed_pattern);
4547+
offset = next_offset;
45494548
}
45504549

4550+
/* The offset is encoded as a relative offset, because for some
4551+
inputs such as ",2" in (*scs:(1,2,3)...), we only have space for
4552+
two uint32_t values, and an opcode and absolute offset may require
4553+
three uint32_t values. */
4554+
*parsed_pattern++ = meta | (uint32_t)(next_offset - offset);
4555+
*parsed_pattern++ = namelen;
4556+
offset = next_offset;
4557+
45514558
if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
45524559

45534560
if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
@@ -4559,7 +4566,6 @@ while (ptr < ptrend)
45594566
}
45604567

45614568
ptr++;
4562-
meta = 1;
45634569
}
45644570
ptr++;
45654571
goto POST_ASSERTION;
@@ -5807,6 +5813,8 @@ uint32_t meta, meta_arg;
58075813
uint32_t firstcuflags, reqcuflags;
58085814
uint32_t zeroreqcuflags, zerofirstcuflags;
58095815
uint32_t req_caseopt, reqvary, tempreqvary;
5816+
/* Some opcodes, such as META_SCS_NUMBER or META_SCS_NAME,
5817+
depend on the previous value of offset. */
58105818
PCRE2_SIZE offset = 0;
58115819
PCRE2_SIZE length_prevgroup = 0;
58125820
PCRE2_UCHAR *code = *codeptr;
@@ -6294,6 +6302,15 @@ for (;; pptr++)
62946302
req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
62956303
break;
62966304

6305+
case META_OFFSET:
6306+
GETPLUSOFFSET(offset, pptr);
6307+
break;
6308+
6309+
case META_SCS:
6310+
bravalue = OP_ASSERT_SCS;
6311+
cb->assert_depth += 1;
6312+
goto GROUP_PROCESS;
6313+
62976314

62986315
/* ===================================================================*/
62996316
/* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
@@ -6305,17 +6322,19 @@ for (;; pptr++)
63056322
case META_COND_RNUMBER: /* (?(Rdigits) */
63066323
case META_COND_NAME: /* (?(name) or (?'name') or ?(<name>) */
63076324
case META_COND_RNAME: /* (?(R&name) - test for recursion */
6308-
case META_SCS_NAME: /* (*scan_substring:'name') or (*scan_substring:(<name>)) */
6309-
case META_SCS_NEXT_NAME: /* More names for scan substring. */
6310-
bravalue = meta == META_SCS_NAME ? OP_ASSERT_SCS : OP_COND;
6325+
case META_SCS_NAME: /* Name of scan substring */
6326+
bravalue = OP_COND;
63116327
{
63126328
int count, index;
63136329
unsigned int i;
63146330
PCRE2_SPTR name;
63156331
named_group *ng = cb->named_groups;
63166332
uint32_t length = *(++pptr);
63176333

6318-
GETPLUSOFFSET(offset, pptr);
6334+
if (meta == META_SCS_NAME)
6335+
offset += meta_arg;
6336+
else
6337+
GETPLUSOFFSET(offset, pptr);
63196338
name = cb->start_pattern + offset;
63206339

63216340
/* In the first pass, the names generated in the pre-pass are available,
@@ -6371,7 +6390,7 @@ for (;; pptr++)
63716390
/* Otherwise found a duplicated name */
63726391
if (ng->number > cb->top_backref) cb->top_backref = ng->number;
63736392

6374-
if (meta == META_SCS_NEXT_NAME)
6393+
if (meta == META_SCS_NAME)
63756394
{
63766395
code[0] = OP_CREF;
63776396
PUT2(code, 1, ng->number);
@@ -6395,7 +6414,7 @@ for (;; pptr++)
63956414
if (lengthptr == NULL && !find_dupname_details(name, length, &index,
63966415
&count, errorcodeptr, cb)) return 0;
63976416

6398-
if (meta == META_SCS_NEXT_NAME)
6417+
if (meta == META_SCS_NAME)
63996418
{
64006419
code[0] = OP_DNCREF;
64016420
PUT2(code, 1, index);
@@ -6415,9 +6434,8 @@ for (;; pptr++)
64156434
PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
64166435
}
64176436

6418-
if (meta != META_SCS_NAME) goto GROUP_PROCESS_NOTE_EMPTY;
6419-
cb->assert_depth += 1;
6420-
goto GROUP_PROCESS;
6437+
PCRE2_ASSERT(meta != META_SCS_NAME);
6438+
goto GROUP_PROCESS_NOTE_EMPTY;
64216439

64226440
/* The DEFINE condition is always false. Its internal groups may never
64236441
be called, so matched_char must remain false, hence the jump to
@@ -6434,9 +6452,12 @@ for (;; pptr++)
64346452

64356453
case META_COND_NUMBER:
64366454
case META_SCS_NUMBER:
6437-
case META_SCS_NEXT_NUMBER:
6438-
bravalue = meta == META_SCS_NUMBER ? OP_ASSERT_SCS : OP_COND;
6439-
GETPLUSOFFSET(offset, pptr);
6455+
bravalue = OP_COND;
6456+
if (meta == META_SCS_NUMBER)
6457+
offset += meta_arg;
6458+
else
6459+
GETPLUSOFFSET(offset, pptr);
6460+
64406461
groupnumber = *(++pptr);
64416462
if (groupnumber > cb->bracount)
64426463
{
@@ -6446,7 +6467,7 @@ for (;; pptr++)
64466467
}
64476468
if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
64486469

6449-
if (meta == META_SCS_NEXT_NUMBER)
6470+
if (meta == META_SCS_NUMBER)
64506471
{
64516472
code[0] = OP_CREF;
64526473
PUT2(code, 1, groupnumber);
@@ -6455,13 +6476,11 @@ for (;; pptr++)
64556476
}
64566477

64576478
/* Point at initial ( for too many branches error */
6458-
if (meta != META_SCS_NUMBER) offset -= 2;
6479+
offset -= 2;
64596480
code[1+LINK_SIZE] = OP_CREF;
64606481
skipunits = 1+IMM2_SIZE;
64616482
PUT2(code, 2+LINK_SIZE, groupnumber);
6462-
if (meta != META_SCS_NUMBER) goto GROUP_PROCESS_NOTE_EMPTY;
6463-
cb->assert_depth += 1;
6464-
goto GROUP_PROCESS;
6483+
goto GROUP_PROCESS_NOTE_EMPTY;
64656484

64666485
/* Test for the PCRE2 version. */
64676486

@@ -9089,6 +9108,7 @@ for (;; pptr++)
90899108
case META_COND_RNAME:
90909109
case META_COND_RNUMBER:
90919110
case META_COND_VERSION:
9111+
case META_SCS:
90929112
case META_LOOKAHEAD:
90939113
case META_LOOKAHEADNOT:
90949114
case META_LOOKAHEAD_NA:
@@ -9350,6 +9370,7 @@ for (;; pptr++)
93509370
case META_LOOKAHEAD:
93519371
case META_LOOKAHEADNOT:
93529372
case META_LOOKAHEAD_NA:
9373+
case META_SCS:
93539374
*errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr);
93549375
if (*errcodeptr != 0) return -1;
93559376

@@ -9781,8 +9802,7 @@ for (; *pptr != META_END; pptr++)
97819802
case META_ATOMIC:
97829803
case META_CAPTURE:
97839804
case META_COND_ASSERT:
9784-
case META_SCS_NAME:
9785-
case META_SCS_NUMBER:
9805+
case META_SCS:
97869806
case META_LOOKAHEAD:
97879807
case META_LOOKAHEADNOT:
97889808
case META_LOOKAHEAD_NA:
@@ -9820,6 +9840,7 @@ for (; *pptr != META_END; pptr++)
98209840
case META_THEN:
98219841
break;
98229842

9843+
case META_OFFSET:
98239844
case META_RECURSE:
98249845
pptr += SIZEOFFSET;
98259846
break;
@@ -9838,8 +9859,6 @@ for (; *pptr != META_END; pptr++)
98389859
case META_COND_NUMBER:
98399860
case META_COND_RNAME:
98409861
case META_COND_RNUMBER:
9841-
case META_SCS_NEXT_NAME:
9842-
case META_SCS_NEXT_NUMBER:
98439862
pptr += 1 + SIZEOFFSET;
98449863
nestlevel++;
98459864
break;
@@ -9856,6 +9875,8 @@ for (; *pptr != META_END; pptr++)
98569875
case META_BIGVALUE:
98579876
case META_POSIX:
98589877
case META_POSIX_NEG:
9878+
case META_SCS_NAME:
9879+
case META_SCS_NUMBER:
98599880
pptr += 1;
98609881
break;
98619882

src/pcre2_compile.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -96,10 +96,11 @@ code (meta_extra_lengths) must be updated to remain in step. */
9696
#define META_COND_RNAME 0x80130000u /* (?(R&name)... */
9797
#define META_COND_RNUMBER 0x80140000u /* (?(Rdigits)... */
9898
#define META_COND_VERSION 0x80150000u /* (?(VERSION<op>x.y)... */
99-
#define META_SCS_NAME 0x80160000u /* (*scan_substring:(<name>)... */
100-
#define META_SCS_NUMBER 0x80170000u /* (*scan_substring:(digits)... */
101-
#define META_SCS_NEXT_NAME 0x80180000u /* Next <name> of scan_substring */
102-
#define META_SCS_NEXT_NUMBER 0x80190000u /* Next digits of scan_substring */
99+
#define META_OFFSET 0x80160000u /* Setting offset for various
100+
META codes (e.g. META_SCS_NAME) */
101+
#define META_SCS 0x80170000u /* (*scan_substring:... */
102+
#define META_SCS_NAME 0x80180000u /* A <name> argument of scan_substring */
103+
#define META_SCS_NUMBER 0x80190000u /* A digits argument of scan_substring */
103104
#define META_DOLLAR 0x801a0000u /* $ metacharacter */
104105
#define META_DOT 0x801b0000u /* . metacharacter */
105106
#define META_ESCAPE 0x801c0000u /* \d and friends */

testdata/testinput2

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6369,6 +6369,10 @@ a)"xI
63696369

63706370
/()(*scs:(1)+a)/
63716371

6372+
/()(*scs:(1,1,1,1,1,1,1,1,2))/
6373+
6374+
/()()(*scs:(1,2,1,2,1,2,2,'XYZ'))/
6375+
63726376
# Tests for iterating scan_substring
63736377

63746378
/(a)(*scs:(1)b)*c/B
@@ -6580,6 +6584,13 @@ a)"xI
65806584
ab
65816585
ac
65826586

6587+
/()()()(?<=ab(*scs:(1,2,3))cd)xyz/
6588+
abcdxyz
6589+
6590+
/()()()(?<=ab(*ACCEPT)(*scs:(1,2,3))cd|efg)xyz/
6591+
abxyz
6592+
efgxyz
6593+
65836594
# Tests for pcre2_set_optimize()
65846595

65856596
/abc/I,optimization_none

0 commit comments

Comments
 (0)