Skip to content

Commit 0cc80ce

Browse files
committed
Non-recursive scan prefix in JIT
1 parent a678783 commit 0cc80ce

File tree

2 files changed

+140
-49
lines changed

2 files changed

+140
-49
lines changed

src/pcre2_jit_compile.c

Lines changed: 139 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -5670,11 +5670,17 @@ if (last)
56705670
chars->last_count++;
56715671
}
56725672

5673-
static int scan_prefix(compiler_common *common, PCRE2_SPTR cc, fast_forward_char_data *chars, int max_chars, sljit_u32 *rec_count)
5673+
#define SCAN_PREFIX_STACK_END 32
5674+
5675+
static int scan_prefix(compiler_common *common, PCRE2_SPTR cc, fast_forward_char_data *chars)
56745676
{
5675-
/* Recursive function, which scans prefix literals. */
5677+
fast_forward_char_data *chars_start = chars;
5678+
fast_forward_char_data *chars_end = chars + MAX_N_CHARS;
5679+
PCRE2_SPTR cc_stack[SCAN_PREFIX_STACK_END];
5680+
fast_forward_char_data *chars_stack[SCAN_PREFIX_STACK_END];
5681+
sljit_u8 next_alternative_stack[SCAN_PREFIX_STACK_END];
56765682
BOOL last, any, class, caseless;
5677-
int len, repeat, len_save, consumed = 0;
5683+
int stack_ptr, rec_count, repeat, len, len_save;
56785684
sljit_u32 chr; /* Any unicode character. */
56795685
sljit_u8 *bytes, *bytes_end, byte;
56805686
PCRE2_SPTR alternative, cc_save, oc;
@@ -5687,11 +5693,44 @@ PCRE2_UCHAR othercase[1];
56875693
#endif
56885694

56895695
repeat = 1;
5696+
stack_ptr = 0;
5697+
rec_count = 10000;
56905698
while (TRUE)
56915699
{
5692-
if (*rec_count == 0)
5700+
if (--rec_count == 0)
56935701
return 0;
5694-
(*rec_count)--;
5702+
5703+
SLJIT_ASSERT(chars <= chars_start + MAX_N_CHARS);
5704+
5705+
if (chars >= chars_end)
5706+
{
5707+
if (stack_ptr == 0)
5708+
return chars_end - chars_start;
5709+
5710+
--stack_ptr;
5711+
cc = cc_stack[stack_ptr];
5712+
chars = chars_stack[stack_ptr];
5713+
5714+
if (chars >= chars_end)
5715+
continue;
5716+
5717+
if (next_alternative_stack[stack_ptr] != 0)
5718+
{
5719+
/* When an alternative is processed, the
5720+
next alternative is pushed onto the stack. */
5721+
SLJIT_ASSERT(*cc == OP_ALT);
5722+
alternative = cc + GET(cc, 1);
5723+
if (*alternative == OP_ALT)
5724+
{
5725+
SLJIT_ASSERT(stack_ptr < SCAN_PREFIX_STACK_END);
5726+
cc_stack[stack_ptr] = alternative;
5727+
chars_stack[stack_ptr] = chars;
5728+
next_alternative_stack[stack_ptr] = 1;
5729+
stack_ptr++;
5730+
}
5731+
cc += 1 + LINK_SIZE;
5732+
}
5733+
}
56955734

56965735
last = TRUE;
56975736
any = FALSE;
@@ -5768,9 +5807,17 @@ while (TRUE)
57685807
#ifdef SUPPORT_UNICODE
57695808
if (common->utf && HAS_EXTRALEN(*cc)) len += GET_EXTRALEN(*cc);
57705809
#endif
5771-
max_chars = scan_prefix(common, cc + len, chars, max_chars, rec_count);
5772-
if (max_chars == 0)
5773-
return consumed;
5810+
if (stack_ptr >= SCAN_PREFIX_STACK_END)
5811+
{
5812+
chars_end = chars;
5813+
continue;
5814+
}
5815+
5816+
cc_stack[stack_ptr] = cc + len;
5817+
chars_stack[stack_ptr] = chars;
5818+
next_alternative_stack[stack_ptr] = 0;
5819+
stack_ptr++;
5820+
57745821
last = FALSE;
57755822
break;
57765823

@@ -5788,12 +5835,18 @@ while (TRUE)
57885835
case OP_CBRA:
57895836
case OP_CBRAPOS:
57905837
alternative = cc + GET(cc, 1);
5791-
while (*alternative == OP_ALT)
5838+
if (*alternative == OP_ALT)
57925839
{
5793-
max_chars = scan_prefix(common, alternative + 1 + LINK_SIZE, chars, max_chars, rec_count);
5794-
if (max_chars == 0)
5795-
return consumed;
5796-
alternative += GET(alternative, 1);
5840+
if (stack_ptr >= SCAN_PREFIX_STACK_END)
5841+
{
5842+
chars_end = chars;
5843+
continue;
5844+
}
5845+
5846+
cc_stack[stack_ptr] = alternative;
5847+
chars_stack[stack_ptr] = chars;
5848+
next_alternative_stack[stack_ptr] = 1;
5849+
stack_ptr++;
57975850
}
57985851

57995852
if (*cc == OP_CBRA || *cc == OP_CBRAPOS)
@@ -5804,22 +5857,33 @@ while (TRUE)
58045857
case OP_CLASS:
58055858
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
58065859
if (common->utf && !is_char7_bitset((const sljit_u8 *)(cc + 1), FALSE))
5807-
return consumed;
5860+
{
5861+
chars_end = chars;
5862+
continue;
5863+
}
58085864
#endif
58095865
class = TRUE;
58105866
break;
58115867

58125868
case OP_NCLASS:
58135869
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
5814-
if (common->utf) return consumed;
5870+
if (common->utf)
5871+
{
5872+
chars_end = chars;
5873+
continue;
5874+
}
58155875
#endif
58165876
class = TRUE;
58175877
break;
58185878

58195879
#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
58205880
case OP_XCLASS:
58215881
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
5822-
if (common->utf) return consumed;
5882+
if (common->utf)
5883+
{
5884+
chars_end = chars;
5885+
continue;
5886+
}
58235887
#endif
58245888
any = TRUE;
58255889
cc += GET(cc, 1);
@@ -5829,7 +5893,10 @@ while (TRUE)
58295893
case OP_DIGIT:
58305894
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
58315895
if (common->utf && !is_char7_bitset((const sljit_u8 *)common->ctypes - cbit_length + cbit_digit, FALSE))
5832-
return consumed;
5896+
{
5897+
chars_end = chars;
5898+
continue;
5899+
}
58335900
#endif
58345901
any = TRUE;
58355902
cc++;
@@ -5838,7 +5905,10 @@ while (TRUE)
58385905
case OP_WHITESPACE:
58395906
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
58405907
if (common->utf && !is_char7_bitset((const sljit_u8 *)common->ctypes - cbit_length + cbit_space, FALSE))
5841-
return consumed;
5908+
{
5909+
chars_end = chars;
5910+
continue;
5911+
}
58425912
#endif
58435913
any = TRUE;
58445914
cc++;
@@ -5847,7 +5917,10 @@ while (TRUE)
58475917
case OP_WORDCHAR:
58485918
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
58495919
if (common->utf && !is_char7_bitset((const sljit_u8 *)common->ctypes - cbit_length + cbit_word, FALSE))
5850-
return consumed;
5920+
{
5921+
chars_end = chars;
5922+
continue;
5923+
}
58515924
#endif
58525925
any = TRUE;
58535926
cc++;
@@ -5863,7 +5936,11 @@ while (TRUE)
58635936
case OP_ANY:
58645937
case OP_ALLANY:
58655938
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
5866-
if (common->utf) return consumed;
5939+
if (common->utf)
5940+
{
5941+
chars_end = chars;
5942+
continue;
5943+
}
58675944
#endif
58685945
any = TRUE;
58695946
cc++;
@@ -5873,7 +5950,11 @@ while (TRUE)
58735950
case OP_NOTPROP:
58745951
case OP_PROP:
58755952
#if PCRE2_CODE_UNIT_WIDTH != 32
5876-
if (common->utf) return consumed;
5953+
if (common->utf)
5954+
{
5955+
chars_end = chars;
5956+
continue;
5957+
}
58775958
#endif
58785959
any = TRUE;
58795960
cc += 1 + 2;
@@ -5888,29 +5969,32 @@ while (TRUE)
58885969
case OP_NOTEXACT:
58895970
case OP_NOTEXACTI:
58905971
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
5891-
if (common->utf) return consumed;
5972+
if (common->utf)
5973+
{
5974+
chars_end = chars;
5975+
continue;
5976+
}
58925977
#endif
58935978
any = TRUE;
58945979
repeat = GET2(cc, 1);
58955980
cc += 1 + IMM2_SIZE + 1;
58965981
break;
58975982

58985983
default:
5899-
return consumed;
5984+
chars_end = chars;
5985+
continue;
59005986
}
59015987

5988+
SLJIT_ASSERT(chars < chars_end);
5989+
59025990
if (any)
59035991
{
59045992
do
59055993
{
59065994
chars->count = 255;
5907-
5908-
consumed++;
5909-
if (--max_chars == 0)
5910-
return consumed;
59115995
chars++;
59125996
}
5913-
while (--repeat > 0);
5997+
while (--repeat > 0 && chars < chars_end);
59145998

59155999
repeat = 1;
59166000
continue;
@@ -5929,9 +6013,16 @@ while (TRUE)
59296013
case OP_CRQUERY:
59306014
case OP_CRMINQUERY:
59316015
case OP_CRPOSQUERY:
5932-
max_chars = scan_prefix(common, cc + 1, chars, max_chars, rec_count);
5933-
if (max_chars == 0)
5934-
return consumed;
6016+
if (stack_ptr >= SCAN_PREFIX_STACK_END)
6017+
{
6018+
chars_end = chars;
6019+
continue;
6020+
}
6021+
6022+
cc_stack[stack_ptr] = cc + 1;
6023+
chars_stack[stack_ptr] = chars;
6024+
next_alternative_stack[stack_ptr] = 0;
6025+
stack_ptr++;
59356026
break;
59366027

59376028
default:
@@ -5945,7 +6036,10 @@ while (TRUE)
59456036
case OP_CRPOSRANGE:
59466037
repeat = GET2(cc, 1);
59476038
if (repeat <= 0)
5948-
return consumed;
6039+
{
6040+
chars_end = chars;
6041+
continue;
6042+
}
59496043
break;
59506044
}
59516045

@@ -5980,19 +6074,18 @@ while (TRUE)
59806074
bytes = bytes_end - 32;
59816075
}
59826076

5983-
consumed++;
5984-
if (--max_chars == 0)
5985-
return consumed;
59866077
chars++;
59876078
}
5988-
while (--repeat > 0);
6079+
while (--repeat > 0 && chars < chars_end);
59896080

6081+
repeat = 1;
59906082
switch (*cc)
59916083
{
59926084
case OP_CRSTAR:
59936085
case OP_CRMINSTAR:
59946086
case OP_CRPOSSTAR:
5995-
return consumed;
6087+
chars_end = chars;
6088+
break;
59966089

59976090
case OP_CRQUERY:
59986091
case OP_CRMINQUERY:
@@ -6004,12 +6097,11 @@ while (TRUE)
60046097
case OP_CRMINRANGE:
60056098
case OP_CRPOSRANGE:
60066099
if (GET2(cc, 1) != GET2(cc, 1 + IMM2_SIZE))
6007-
return consumed;
6100+
chars_end = chars;
60086101
cc += 1 + 2 * IMM2_SIZE;
60096102
break;
60106103
}
60116104

6012-
repeat = 1;
60136105
continue;
60146106
}
60156107

@@ -6025,7 +6117,10 @@ while (TRUE)
60256117
{
60266118
GETCHAR(chr, cc);
60276119
if ((int)PRIV(ord2utf)(char_othercase(common, chr), othercase) != len)
6028-
return consumed;
6120+
{
6121+
chars_end = chars;
6122+
continue;
6123+
}
60296124
}
60306125
else
60316126
#endif
@@ -6056,23 +6151,20 @@ while (TRUE)
60566151
do
60576152
{
60586153
len--;
6059-
consumed++;
60606154

60616155
chr = *cc;
60626156
add_prefix_char(*cc, chars, len == 0);
60636157

60646158
if (caseless)
60656159
add_prefix_char(*oc, chars, len == 0);
60666160

6067-
if (--max_chars == 0)
6068-
return consumed;
60696161
chars++;
60706162
cc++;
60716163
oc++;
60726164
}
6073-
while (len > 0);
6165+
while (len > 0 && chars < chars_end);
60746166

6075-
if (--repeat == 0)
6167+
if (--repeat == 0 || chars >= chars_end)
60766168
break;
60776169

60786170
len = len_save;
@@ -6081,7 +6173,7 @@ while (TRUE)
60816173

60826174
repeat = 1;
60836175
if (last)
6084-
return consumed;
6176+
chars_end = chars;
60856177
}
60866178
}
60876179

@@ -6251,16 +6343,14 @@ int i, max, from;
62516343
int range_right = -1, range_len;
62526344
sljit_u8 *update_table = NULL;
62536345
BOOL in_range;
6254-
sljit_u32 rec_count;
62556346

62566347
for (i = 0; i < MAX_N_CHARS; i++)
62576348
{
62586349
chars[i].count = 0;
62596350
chars[i].last_count = 0;
62606351
}
62616352

6262-
rec_count = 10000;
6263-
max = scan_prefix(common, common->start, chars, MAX_N_CHARS, &rec_count);
6353+
max = scan_prefix(common, common->start, chars);
62646354

62656355
if (max < 1)
62666356
return FALSE;

src/pcre2_jit_test.c

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -286,6 +286,7 @@ static struct regression_test_case regression_test_cases[] = {
286286
{ CMU, A, 0, 0, "(a|b)?\?d((?:e)?)", "ABABdx" },
287287
{ MU, A, 0, 0, "(a|b)?\?d((?:e)?)", "abcde" },
288288
{ MU, A, 0, 0, "((?:ab)?\?g|b(?:g(nn|d)?\?)?)?\?(?:n)?m", "abgnbgnnbgdnmm" },
289+
{ M, A, 0, 0, "(?:a?|a)b", "ba" },
289290

290291
/* Greedy and non-greedy + operators */
291292
{ MU, A, 0, 0, "(aa)+aa", "aaaaaaa" },

0 commit comments

Comments
 (0)