Skip to content

Commit c742166

Browse files
authored
Add runtime checks for invalid uses of \K in lookaround (#812)
If the match start is before the start offset, or if the match ends before it begins, then pcre2_match() and the JIT matcher reject the match with a new error, PCRE2_ERROR_BAD_BACKSLASH_K (unless the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option is set, which allows \K within lookarounds). The only way to reach this situation should be a subroutine call, or similar control flow, that jumps to a \K from inside a lookaround; a \K that is syntactically inside a lookaround is already rejected at compile time.
1 parent d59672a commit c742166

File tree

10 files changed

+146
-3
lines changed

10 files changed

+146
-3
lines changed

src/pcre2.h.generic

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,7 @@ released, the numbers must not be changed. */
440440
#define PCRE2_ERROR_DIFFSUBSSUBJECT (-72)
441441
#define PCRE2_ERROR_DIFFSUBSOFFSET (-73)
442442
#define PCRE2_ERROR_DIFFSUBSOPTIONS (-74)
443+
#define PCRE2_ERROR_BAD_BACKSLASH_K (-75)
443444

444445

445446
/* Request types for pcre2_pattern_info() */

src/pcre2.h.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,7 @@ released, the numbers must not be changed. */
440440
#define PCRE2_ERROR_DIFFSUBSSUBJECT (-72)
441441
#define PCRE2_ERROR_DIFFSUBSOFFSET (-73)
442442
#define PCRE2_ERROR_DIFFSUBSOPTIONS (-74)
443+
#define PCRE2_ERROR_BAD_BACKSLASH_K (-75)
443444

444445

445446
/* Request types for pcre2_pattern_info() */

src/pcre2_compile.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8370,6 +8370,10 @@ for (;; pptr++)
83708370
case ESC_A:
83718371
if (cb->max_lookbehind == 0) cb->max_lookbehind = 1;
83728372
break;
8373+
8374+
case ESC_K:
8375+
cb->external_flags |= PCRE2_HASBSK; /* Record */
8376+
break;
83738377
}
83748378

83758379
*code++ = meta_arg;

src/pcre2_error.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,7 @@ static const unsigned char match_error_texts[] =
305305
"substitute subject differs from prior match call\0"
306306
"substitute start offset differs from prior match call\0"
307307
"substitute options differ from prior match call\0"
308+
"disallowed use of \\K in lookaround\0"
308309
;
309310

310311

src/pcre2_internal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,7 @@ bytes in a code unit in that mode. */
525525
#define PCRE2_DUPCAPUSED 0x00200000u /* contains (?| */
526526
#define PCRE2_HASBKC 0x00400000u /* contains \C */
527527
#define PCRE2_HASACCEPT 0x00800000u /* contains (*ACCEPT) */
528+
#define PCRE2_HASBSK 0x01000000u /* contains \K */
528529

529530
#define PCRE2_MODE_MASK (PCRE2_MODE8 | PCRE2_MODE16 | PCRE2_MODE32)
530531

src/pcre2_intmodedep.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -969,7 +969,9 @@ typedef struct match_block {
969969
uint32_t match_call_count; /* Number of times a new frame is created */
970970
BOOL hitend; /* Hit the end of the subject at some point */
971971
BOOL hasthen; /* Pattern contains (*THEN) */
972+
BOOL hasbsk; /* Pattern contains \K */
972973
BOOL allowemptypartial; /* Allow empty hard partial */
974+
BOOL allowlookaroundbsk; /* Allow \K within lookarounds */
973975
const uint8_t *lcc; /* Points to lower casing table */
974976
const uint8_t *fcc; /* Points to case-flipping table */
975977
const uint8_t *ctypes; /* Points to table of type maps */

src/pcre2_jit_compile.c

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13772,6 +13772,34 @@ common->accept_label = LABEL();
1377213772
if (common->accept != NULL)
1377313773
set_jumps(common->accept, common->accept_label);
1377413774

13775+
/* Fail if we detect that the start position was moved to be either after
13776+
the end position (\K in lookahead) or before the start offset (\K in
13777+
lookbehind). */
13778+
13779+
if (common->has_set_som &&
13780+
(common->re->extra_options & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0)
13781+
{
13782+
if (HAS_VIRTUAL_REGISTERS)
13783+
{
13784+
OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0);
13785+
OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, str));
13786+
}
13787+
else
13788+
{
13789+
OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, str));
13790+
}
13791+
OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(0));
13792+
13793+
/* (ovector[0] < jit_arguments->str)? */
13794+
OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP2, 0, TMP1, 0);
13795+
/* Unconditionally set R0 (aka TMP1), in between the comparison that needs to
13796+
use TMP1, but before the jump. */
13797+
OP1(SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_IMM, PCRE2_ERROR_BAD_BACKSLASH_K);
13798+
add_jump(compiler, &common->abort, JUMP(SLJIT_LESS));
13799+
/* (ovector[0] > STR_PTR)? NB. ovector[1] hasn't yet been set to STR_PTR. */
13800+
add_jump(compiler, &common->abort, CMP(SLJIT_GREATER, TMP2, 0, STR_PTR, 0));
13801+
}
13802+
1377513803
/* This means we have a match. Update the ovector. */
1377613804
copy_ovector(common, re->top_bracket + 1);
1377713805
common->quit_label = common->abort_label = LABEL();

src/pcre2_match.c

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1010,11 +1010,28 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
10101010
}
10111011

10121012
#ifdef DEBUG_SHOW_OPS
1013-
fprintf(stderr, "++ Failed ACCEPT not at end (endanchnored set)\n");
1013+
fprintf(stderr, "++ Failed ACCEPT not at end (endanchored set)\n");
10141014
#endif
10151015
return MATCH_NOMATCH; /* (*ACCEPT) */
10161016
}
10171017

1018+
/* Fail if we detect that the start position was moved to be either after
1019+
the end position (\K in lookahead) or before the start offset (\K in
1020+
lookbehind). If this occurs, the pattern must have used \K in a somewhat
1021+
sneaky way (e.g. by pattern recursion), because if the \K is actually
1022+
syntactically inside the lookaround, it's blocked at compile-time. */
1023+
1024+
if (Fstart_match < mb->start_subject + mb->start_offset ||
1025+
Fstart_match > Feptr)
1026+
{
1027+
/* The \K expression is fairly rare. We assert it was used so that we
1028+
catch any unexpected invalid data in start_match. */
1029+
PCRE2_ASSERT(mb->hasbsk);
1030+
1031+
if (!mb->allowlookaroundbsk)
1032+
return PCRE2_ERROR_BAD_BACKSLASH_K;
1033+
}
1034+
10181035
/* We have a successful match of the whole pattern. Record the result and
10191036
then do a direct return from the function. If there is space in the offset
10201037
vector, set any pairs that follow the highest-numbered captured string but
@@ -7393,8 +7410,11 @@ mb->start_offset = start_offset;
73937410
mb->end_subject = end_subject;
73947411
mb->true_end_subject = true_end_subject;
73957412
mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0;
7413+
mb->hasbsk = (re->flags & PCRE2_HASBSK) != 0;
73967414
mb->allowemptypartial = (re->max_lookbehind > 0) ||
73977415
(re->flags & PCRE2_MATCH_EMPTY) != 0;
7416+
mb->allowlookaroundbsk =
7417+
(re->extra_options & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) != 0;
73987418
mb->poptions = re->overall_options; /* Pattern options */
73997419
mb->ignore_skip_arg = 0;
74007420
mb->mark = mb->nomatch_mark = NULL; /* In case never set */

testdata/testinput2

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6378,7 +6378,42 @@ a)"xI
63786378

63796379
/^abc(?<!b\Kq)d/,allow_lookaround_bsk
63806380
abcd
6381-
6381+
6382+
# PCRE2 now also rejects sneaky cases where the \K is inside a lookaround... but
6383+
# it's not always easy to detect this syntactically at compile-time (indeed,
6384+
# a conditional expression could dynamically invoke \K via a subroutine, based
6385+
# on the subject contents).
6386+
6387+
/(?(DEFINE)(?<sneaky>b\K))a(?=(?&sneaky))/g,allow_lookaround_bsk
6388+
ab
6389+
6390+
/(?(DEFINE)(?<sneaky>b\K))a(?=(?&sneaky))/g
6391+
ab
6392+
zz
6393+
6394+
/a|(?(DEFINE)(?<sneaky>\Ka))(?<=(?&sneaky))b/g,allow_lookaround_bsk
6395+
ab
6396+
6397+
/a|(?(DEFINE)(?<sneaky>\Ka))(?<=(?&sneaky))b/g
6398+
ab
6399+
zz
6400+
6401+
/a|(?(DEFINE)(?<sneaky>\K\Ga))(?<=(?&sneaky))b/g
6402+
ab
6403+
zz
6404+
6405+
/(?=.{10}(?1))x(\K){0}/
6406+
x1234567890
6407+
6408+
/(?=.{10}(.))(*scs:(1)(?2))x(\K){0}/
6409+
x1234567890
6410+
6411+
/(?=.{5}(?1))\d*(\K){0}/
6412+
\= Totally fine - pattern does nothing bad even though \K is reachable
6413+
1234567890
6414+
\= Not fine - the subject now causes the \K to misbehave
6415+
abcdefgh
6416+
63826417
# ---------
63836418

63846419
# Tests for zero-length NULL to be treated as an empty string.

testdata/testoutput2

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19062,7 +19062,57 @@ Failed: error 199 at offset 14: \K is not allowed in lookarounds (but see PCRE2_
1906219062
/^abc(?<!b\Kq)d/,allow_lookaround_bsk
1906319063
abcd
1906419064
0: abcd
19065-
19065+
19066+
# PCRE2 now also rejects sneaky cases where the \K is inside a lookaround... but
19067+
# it's not always easy to detect this syntactically at compile-time (indeed,
19068+
# a conditional expression could dynamically invoke \K via a subroutine, based
19069+
# on the subject contents).
19070+
19071+
/(?(DEFINE)(?<sneaky>b\K))a(?=(?&sneaky))/g,allow_lookaround_bsk
19072+
ab
19073+
Start of matched string is beyond its end - displaying from end to start.
19074+
0: b
19075+
19076+
/(?(DEFINE)(?<sneaky>b\K))a(?=(?&sneaky))/g
19077+
ab
19078+
Failed: error -75: disallowed use of \K in lookaround
19079+
zz
19080+
No match
19081+
19082+
/a|(?(DEFINE)(?<sneaky>\Ka))(?<=(?&sneaky))b/g,allow_lookaround_bsk
19083+
ab
19084+
0: a
19085+
0: ab
19086+
19087+
/a|(?(DEFINE)(?<sneaky>\Ka))(?<=(?&sneaky))b/g
19088+
ab
19089+
0: a
19090+
Failed: error -75: disallowed use of \K in lookaround
19091+
zz
19092+
No match
19093+
19094+
/a|(?(DEFINE)(?<sneaky>\K\Ga))(?<=(?&sneaky))b/g
19095+
ab
19096+
0: a
19097+
zz
19098+
No match
19099+
19100+
/(?=.{10}(?1))x(\K){0}/
19101+
x1234567890
19102+
Failed: error -75: disallowed use of \K in lookaround
19103+
19104+
/(?=.{10}(.))(*scs:(1)(?2))x(\K){0}/
19105+
x1234567890
19106+
Failed: error -75: disallowed use of \K in lookaround
19107+
19108+
/(?=.{5}(?1))\d*(\K){0}/
19109+
\= Totally fine - pattern does nothing bad even though \K is reachable
19110+
1234567890
19111+
0: 67890
19112+
\= Not fine - the subject now causes the \K to misbehave
19113+
abcdefgh
19114+
Failed: error -75: disallowed use of \K in lookaround
19115+
1906619116
# ---------
1906719117

1906819118
# Tests for zero-length NULL to be treated as an empty string.

0 commit comments

Comments
 (0)