Skip to content

Commit 82ebebf

Browse files
authored
Improve repeated matching of newlines (#554)
Fixed an old bug which allowed incorrect checking of the repeat maximum of UPTO operations. Co-authored-by: Zoltan Herczeg <[email protected]>
1 parent 6f36e8a commit 82ebebf

File tree

3 files changed

+47
-5
lines changed

3 files changed

+47
-5
lines changed

ChangeLog

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,9 @@ well as interpreting \b and \v as characters.
104104

105105
23. Updated perltest.sh to enable locale setting.
106106

107+
24. Fixed a bug in JIT affecting greedy bounded repeats. The upper limit of
108+
repeats inside a repeated bracket might be incorrectly checked.
109+
107110

108111
Version 10.44 07-June-2024
109112
--------------------------

src/pcre2_jit_compile.c

Lines changed: 42 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2005,13 +2005,13 @@ while (cc < ccend)
20052005

20062006
CASE_ITERATOR_TYPE_PRIVATE_DATA_2A
20072007
size = 1;
2008-
if (cc[1] != OP_ANYNL && cc[1] != OP_EXTUNI)
2008+
if (cc[1] != OP_EXTUNI)
20092009
space = 2;
20102010
break;
20112011

20122012
case OP_TYPEUPTO:
20132013
size = 1 + IMM2_SIZE;
2014-
if (cc[1 + IMM2_SIZE] != OP_ANYNL && cc[1 + IMM2_SIZE] != OP_EXTUNI)
2014+
if (cc[1 + IMM2_SIZE] != OP_EXTUNI)
20152015
space = 2;
20162016
break;
20172017

@@ -12411,7 +12411,7 @@ switch(opcode)
1241112411
case OP_UPTO:
1241212412
SLJIT_ASSERT(early_fail_ptr == 0 || opcode == OP_STAR);
1241312413

12414-
if (type == OP_ANYNL || type == OP_EXTUNI)
12414+
if (type == OP_EXTUNI)
1241512415
{
1241612416
SLJIT_ASSERT(private_data_ptr == 0);
1241712417
SLJIT_ASSERT(early_fail_ptr == 0);
@@ -12536,6 +12536,7 @@ switch(opcode)
1253612536
}
1253712537
}
1253812538

12539+
SLJIT_ASSERT(tmp_base == TMP3);
1253912540
if (charpos_enabled)
1254012541
{
1254112542
if (opcode == OP_UPTO)
@@ -12561,7 +12562,13 @@ switch(opcode)
1256112562
CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, charpos_char, label);
1256212563

1256312564
if (private_data_ptr == 0)
12565+
{
12566+
if (opcode == OP_UPTO)
12567+
OP1(SLJIT_MOV, TMP2, 0, tmp_base, tmp_offset);
1256412568
allocate_stack(common, 2);
12569+
if (opcode == OP_UPTO)
12570+
OP1(SLJIT_MOV, tmp_base, tmp_offset, TMP2, 0);
12571+
}
1256512572
OP1(SLJIT_MOV, base, offset0, STR_PTR, 0);
1256612573
OP1(SLJIT_MOV, base, offset1, STR_PTR, 0);
1256712574

@@ -13295,6 +13302,26 @@ SLJIT_ASSERT(cc == ccend);
1329513302

1329613303
#define CURRENT_AS(type) ((type *)current)
1329713304

13305+
/* Emits code that moves STR_PTR back by one extra code unit when the two
code units just before it are CHAR_CR followed by CHAR_NL, so that the
single-step move back done by the callers afterwards lands before the
whole CR LF pair instead of between its two halves.

Inputs read by the emitted code: STR_PTR (current position) and TMP2,
which is used as the lower limit; both call sites visible in this diff
have the limit in TMP2 before calling. If STR_PTR minus one code unit
is less than or equal to TMP2, STR_PTR is left unchanged.

Registers written: TMP1, SLJIT_TMP_DEST_REG, and (conditionally) STR_PTR. */
static void compile_newline_move_back(compiler_common *common)
13306+
{
13307+
DEFINE_COMPILER;
13308+
struct sljit_jump *jump;
13309+
/* TMP1 = STR_PTR - 1 code unit. When that is <= TMP2, skip the whole
check by jumping to the JUMPHERE at the end. */
13310+
OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
13311+
jump = CMP(SLJIT_LESS_EQUAL, TMP1, 0, TMP2, 0);
/* Load the code unit just before STR_PTR and compare it with CHAR_NL;
the result of the comparison is left in the zero flag. */
13312+
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
13313+
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, CHAR_NL);
/* Load the code unit two positions back before consuming the flag.
NOTE(review): this relies on the load not altering the zero flag set
by the compare above - confirm against the sljit documentation. */
13314+
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
/* SLJIT_TMP_DEST_REG = result of the CHAR_NL comparison
(presumably 1 when equal, 0 otherwise). */
13315+
OP_FLAGS(SLJIT_MOV, SLJIT_TMP_DEST_REG, 0, SLJIT_EQUAL);
/* Compare the second-previous code unit with CHAR_CR and AND that result
into SLJIT_TMP_DEST_REG, which is then non-zero only for CR LF. */
13316+
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, CHAR_CR);
13317+
OP_FLAGS(SLJIT_AND, SLJIT_TMP_DEST_REG, 0, SLJIT_EQUAL);
/* For 16 and 32 bit code units the value is shifted left by UCHAR_SHIFT
(presumably scaling the 0/1 result to the size of one code unit). */
13318+
#if PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32
13319+
OP2(SLJIT_SHL, SLJIT_TMP_DEST_REG, 0, SLJIT_TMP_DEST_REG, 0, SLJIT_IMM, UCHAR_SHIFT);
13320+
#endif
/* Subtract the computed amount from STR_PTR. */
13321+
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_TMP_DEST_REG, 0);
13322+
JUMPHERE(jump);
13323+
}
13324+
1329813325
static void compile_iterator_backtrackingpath(compiler_common *common, struct backtrack_common *current)
1329913326
{
1330013327
DEFINE_COMPILER;
@@ -13317,7 +13344,7 @@ switch(opcode)
1331713344
{
1331813345
case OP_STAR:
1331913346
case OP_UPTO:
13320-
if (type == OP_ANYNL || type == OP_EXTUNI)
13347+
if (type == OP_EXTUNI)
1332113348
{
1332213349
SLJIT_ASSERT(private_data_ptr == 0);
1332313350
set_jumps(CURRENT_AS(char_iterator_backtrack)->u.backtracks, LABEL());
@@ -13335,6 +13362,8 @@ switch(opcode)
1333513362

1333613363
jump = CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, TMP2, 0);
1333713364
label = LABEL();
13365+
if (type == OP_ANYNL)
13366+
compile_newline_move_back(common);
1333813367
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
1333913368
OP1(SLJIT_MOV, base, offset0, STR_PTR, 0);
1334013369
if (CURRENT_AS(char_iterator_backtrack)->u.charpos.othercasebit != 0)
@@ -13346,7 +13375,15 @@ switch(opcode)
1334613375
else
1334713376
{
1334813377
OP1(SLJIT_MOV, STR_PTR, 0, base, offset0);
13349-
jump = CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, base, offset1);
13378+
if (type == OP_ANYNL)
13379+
{
13380+
OP1(SLJIT_MOV, TMP2, 0, base, offset1);
13381+
jump = CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, TMP2, 0);
13382+
compile_newline_move_back(common);
13383+
}
13384+
else
13385+
jump = CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, base, offset1);
13386+
1335013387
move_back(common, NULL, TRUE);
1335113388
OP1(SLJIT_MOV, base, offset0, STR_PTR, 0);
1335213389
JUMPTO(SLJIT_JUMP, CURRENT_AS(char_iterator_backtrack)->matchingpath);

src/pcre2_jit_test.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,7 @@ static struct regression_test_case regression_test_cases[] = {
360360
{ MU, A, 0, 0, "(?P<size>\\d+)m|M", "4M" },
361361
{ M, PCRE2_NEWLINE_CRLF, 0, 0, "\\n?.+#", "\n,\n,#" },
362362
{ 0, A, 0, 0, "<(\\w+)[\\s\\w]+id>", "<br><div id>" },
363+
{ MU, A, 0, 0, "([a-z]{0,3}c;)+", "ccccc;c;cc;ccc;cccccccccccccccc;" },
363364

364365
/* Bracket repeats with limit. */
365366
{ MU, A, 0, 0, "(?:(ab){2}){5}M", "abababababababababababM" },
@@ -481,6 +482,7 @@ static struct regression_test_case regression_test_cases[] = {
481482
{ MU, A, 0, 0, "\\R{3,}\n", "\r\n\r\n\nab\n\n\n\r\r\n\n" },
482483
{ MU, A, 0, 0, "\\R{0,3}\n", "\r\n\r\n\r\n\n" },
483484
{ MU, A, 0, 0, "\\R{0,3}\n", "\r\n\r\n\r\n\r" },
485+
{ MU, A, 0, 0, "(\\R{0,3}\n;)+", "\r\n\r\n\r\n\r\n\n;\n;\n\n;\n\n\n;\n\n\n\n\n;" },
484486
{ MU, A, 0, 0 | F_NOMATCH, "\\R+\\R\\R", "\r\n\r\n" },
485487
{ MU, A, 0, 0, "\\R+\\R\\R", "\r\r\r" },
486488
{ MU, A, 0, 0, "\\R*\\R\\R", "\n\r" },

0 commit comments

Comments
 (0)