Skip to content

Commit 86919c9

Browse files
committed
Re-factor handling of whole-pattern recursion in the interpreter
1 parent 198379c commit 86919c9

File tree

4 files changed

+87
-17
lines changed

4 files changed

+87
-17
lines changed

ChangeLog

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,11 @@ undefined behaviour.
170170

171171
46. Fix backref iterators when PCRE2_MATCH_UNSET_BACKREF is set in JIT.
172172

173+
47. Refactor the handling of whole-pattern recursion (?0) in pcre2_match() so
174+
that its end is handled similarly to other recursions. This has altered the
175+
behaviour of /|(?0)./endanchored, which was previously incorrect. However,
176+
it still differs from JIT.
177+
173178

174179
Version 10.42 11-December-2022
175180
------------------------------

src/pcre2_match.c

Lines changed: 49 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -838,17 +838,15 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
838838
assert_accept_frame = F;
839839
RRETURN(MATCH_ACCEPT);
840840

841-
/* If recursing, we have to find the most recent recursion. */
841+
/* For ACCEPT within a recursion, we have to find the most recent
842+
recursion. If not in a recursion, fall through to code that is common with
843+
OP_END. */
842844

843845
case OP_ACCEPT:
844-
case OP_END:
845-
846-
/* Handle end of a recursion. */
847-
848846
if (Fcurrent_recurse != RECURSE_UNSET)
849847
{
850848
#ifdef DEBUG_SHOW_OPS
851-
fprintf(stderr, "++ End within recursion\n");
849+
fprintf(stderr, "++ Accept within recursion\n");
852850
#endif
853851
offset = Flast_group_offset;
854852
for(;;)
@@ -857,7 +855,6 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
857855
N = (heapframe *)((char *)match_data->heapframes + offset);
858856
P = (heapframe *)((char *)N - frame_size);
859857
if (GF_IDMASK(N->group_frame_type) == GF_RECURSE) break;
860-
861858
offset = P->last_group_offset;
862859
}
863860

@@ -873,11 +870,17 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
873870
Fecode += 1 + LINK_SIZE;
874871
continue;
875872
}
873+
/* Fall through */
876874

877-
/* Not a recursion. Fail for an empty string match if either PCRE2_NOTEMPTY
878-
is set, or if PCRE2_NOTEMPTY_ATSTART is set and we have matched at the
879-
start of the subject. In both cases, backtracking will then try other
880-
alternatives, if any. */
875+
/* OP_END itself can never be reached within a recursion because that is
876+
picked up when the OP_KET that always precedes OP_END is reached. */
877+
878+
case OP_END:
879+
880+
/* Fail for an empty string match if either PCRE2_NOTEMPTY is set, or if
881+
PCRE2_NOTEMPTY_ATSTART is set and we have matched at the start of the
882+
subject. In both cases, backtracking will then try other alternatives, if
883+
any. */
881884

882885
if (Feptr == Fstart_match &&
883886
((mb->moptions & PCRE2_NOTEMPTY) != 0 ||
@@ -5856,7 +5859,8 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
58565859
/* ===================================================================== */
58575860
/* The end of a parenthesized group. For all but OP_BRA and OP_COND, the
58585861
starting frame was added to the chained frames in order to remember the
5859-
starting subject position for the group. */
5862+
starting subject position for the group. (Not true for OP_BRA when it's a
5863+
whole-pattern recursion, but that is handled separately below.) */
58605864

58615865
case OP_KET:
58625866
case OP_KETRMIN:
@@ -5908,8 +5912,37 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
59085912

59095913
switch (*bracode)
59105914
{
5911-
case OP_BRA: /* No need to do anything for these */
5912-
case OP_COND:
5915+
/* Whole pattern recursion is handled as a recursion into group 0, but
5916+
the entire pattern is wrapped in OP_BRA/OP_KET rather than a capturing
5917+
group - a design mistake: it should perhaps have been capture group 0.
5918+
Anyway, that means the end of such recursion must be handled here. It is
5919+
detected by checking for an immediately following OP_END when we are
5920+
recursing in group 0. If this is not the end of a whole-pattern
5921+
recursion, there is nothing to be done. */
5922+
5923+
case OP_BRA:
5924+
if (Fcurrent_recurse != 0 || Fecode[1+LINK_SIZE] != OP_END) break;
5925+
5926+
/* It is the end of whole-pattern recursion. */
5927+
5928+
offset = Flast_group_offset;
5929+
if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
5930+
N = (heapframe *)((char *)match_data->heapframes + offset);
5931+
P = (heapframe *)((char *)N - frame_size);
5932+
Flast_group_offset = P->last_group_offset;
5933+
5934+
/* Reinstate the previous set of captures and then carry on after the
5935+
recursion call. */
5936+
5937+
memcpy((char *)F + offsetof(heapframe, ovector), P->ovector,
5938+
Foffset_top * sizeof(PCRE2_SIZE));
5939+
Foffset_top = P->offset_top;
5940+
Fcapture_last = P->capture_last;
5941+
Fcurrent_recurse = P->current_recurse;
5942+
Fecode = P->ecode + 1 + LINK_SIZE;
5943+
continue; /* With next opcode */
5944+
5945+
case OP_COND: /* No need to do anything for these */
59135946
case OP_SCOND:
59145947
break;
59155948

@@ -5976,9 +6009,8 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
59766009
if (!PRIV(script_run)(P->eptr, Feptr, utf)) RRETURN(MATCH_NOMATCH);
59776010
break;
59786011

5979-
/* Whole-pattern recursion is coded as a recurse into group 0, so it
5980-
won't be picked up here. Instead, we catch it when the OP_END is reached.
5981-
Other recursion is handled here. */
6012+
/* Whole-pattern recursion is coded as a recurse into group 0, and is
6013+
handled with OP_BRA above. Other recursion is handled here. */
59826014

59836015
case OP_CBRA:
59846016
case OP_CBRAPOS:

testdata/testinput2

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6066,4 +6066,19 @@ a)"xI
60666066
/\G(?:(?=(\1.|)(.))){1,13}?(?!.*\2.*\2)\1\K\2/g
60676067
aaabcccdeee
60686068

6069+
# This currently doesn't match JIT
6070+
6071+
/|(?0)./endanchored,aftertext
6072+
\= Expect error
6073+
abcd\=no_jit
6074+
6075+
/|a(?0)/endanchored
6076+
aaaa
6077+
6078+
# This currently doesn't match JIT
6079+
6080+
/(?:|(?0).)(?(R)|\z)/
6081+
\= Expect error
6082+
abcd\=no_jit
6083+
60696084
# End of testinput2

testdata/testoutput2

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17965,6 +17965,24 @@ No match
1796517965
1: ccc
1796617966
2: d
1796717967

17968+
# This currently doesn't match JIT
17969+
17970+
/|(?0)./endanchored,aftertext
17971+
\= Expect error
17972+
abcd\=no_jit
17973+
Failed: error -52: nested recursion at the same subject position
17974+
17975+
/|a(?0)/endanchored
17976+
aaaa
17977+
0: aaaa
17978+
17979+
# This currently doesn't match JIT
17980+
17981+
/(?:|(?0).)(?(R)|\z)/
17982+
\= Expect error
17983+
abcd\=no_jit
17984+
Failed: error -52: nested recursion at the same subject position
17985+
1796817986
# End of testinput2
1796917987
Error -70: PCRE2_ERROR_BADDATA (unknown error number)
1797017988
Error -62: bad serialized data

0 commit comments

Comments
 (0)