Skip to content

Commit b3f6381

Browse files
committed
Change xclass character min/max detection
The new code is useful for a future eclass implementation in jit.
1 parent 82d1d4b commit b3f6381

File tree

1 file changed

+211
-32
lines changed

1 file changed

+211
-32
lines changed

src/pcre2_jit_compile.c

Lines changed: 211 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -8104,6 +8104,187 @@ SLJIT_ASSERT(next_char <= (const uint8_t*)common->start);
81048104
ranges->range_count = range_count;
81058105
}
81068106

8107+
#if defined SUPPORT_UNICODE && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16)
8108+
8109+
/* Widens the interval [*min_ptr, *max_ptr] so that it covers the characters
described by the extended class data at cc. The data is either a sequence of
XCL_SINGLE / XCL_RANGE items terminated by XCL_END, or a character list whose
first code unit is >= XCL_LIST. The minimum is only ever lowered and the
maximum only ever raised, so the caller seeds both values before the call.
Asserts that common->utf is set. */
static void xclass_update_min_max(compiler_common *common, PCRE2_SPTR cc, sljit_u32 *min_ptr, sljit_u32 *max_ptr)
8110+
{
8111+
uint32_t type, list_ind, c;
8112+
sljit_u32 min = *min_ptr;
8113+
sljit_u32 max = *max_ptr;
8114+
uint32_t char_list_add;
8115+
const uint8_t *next_char;
8116+
/* NOTE(review): utf is not referenced directly in this function; presumably
it is consumed by the GETCHARINCTEST macro - confirm before removing. */
BOOL utf = TRUE;
8117+
8118+
/* This function is pointless without utf 8/16. */
8119+
SLJIT_ASSERT(common->utf);
8120+
if (*cc == XCL_SINGLE || *cc == XCL_RANGE)
8121+
{
8122+
/* Only a few ranges are present. */
8123+
do
8124+
{
8125+
type = *cc++;
8126+
SLJIT_ASSERT(type == XCL_SINGLE || type == XCL_RANGE);
8127+
/* First (or only) character of the item: candidate for the minimum. */
GETCHARINCTEST(c, cc);
8128+
8129+
if (c < min)
8130+
min = c;
8131+
8132+
if (type == XCL_RANGE)
8133+
{
8134+
/* A range carries a second character, which becomes the candidate for
the maximum. For XCL_SINGLE the same character is used for both. */
GETCHARINCTEST(c, cc);
8135+
}
8136+
8137+
if (c > max)
8138+
max = c;
8139+
}
8140+
while (*cc != XCL_END);
8141+
8142+
SLJIT_ASSERT(min <= MAX_UTF_CODE_POINT && max <= MAX_UTF_CODE_POINT && min <= max);
8143+
*min_ptr = min;
8144+
*max_ptr = max;
8145+
return;
8146+
}
8147+
8148+
SLJIT_ASSERT(cc[0] >= XCL_LIST);
8149+
/* Read the list header: two code units combined in 8 bit mode, a single
code unit otherwise. */
#if PCRE2_CODE_UNIT_WIDTH == 8
8150+
type = (uint32_t)(cc[0] << 8) | cc[1];
8151+
cc += 2;
8152+
#else
8153+
type = cc[0];
8154+
cc++;
8155+
#endif /* CODE_UNIT_WIDTH */
8156+
8157+
/* Align characters. */
8158+
/* The list items are addressed backwards from common->start, at twice the
value read by GET(cc, 0) in bytes. */
next_char = (const uint8_t*)common->start - (GET(cc, 0) << 1);
8159+
type &= XCL_TYPE_MASK;
8160+
8161+
SLJIT_ASSERT(type != 0);
8162+
8163+
/* Detect minimum. */
8164+
8165+
/* Skip unused ranges. */
8166+
list_ind = 0;
8167+
/* Each list descriptor occupies XCL_TYPE_BIT_LEN bits of type. Stop at the
first descriptor that begins with a range or has a non-zero item count. */
while ((type & (XCL_BEGIN_WITH_RANGE | XCL_ITEM_COUNT_MASK)) == 0)
8168+
{
8169+
type >>= XCL_TYPE_BIT_LEN;
8170+
list_ind++;
8171+
}
8172+
8173+
SLJIT_ASSERT(list_ind <= 2);
8174+
/* Select the value added to stored items and the first character of the
list selected by list_ind. */
switch (list_ind)
8175+
{
8176+
case 0:
8177+
char_list_add = XCL_CHAR_LIST_LOW_16_ADD;
8178+
c = XCL_CHAR_LIST_LOW_16_START;
8179+
break;
8180+
8181+
case 1:
8182+
char_list_add = XCL_CHAR_LIST_HIGH_16_ADD;
8183+
c = XCL_CHAR_LIST_HIGH_16_START;
8184+
break;
8185+
8186+
default:
8187+
char_list_add = XCL_CHAR_LIST_LOW_32_ADD;
8188+
c = XCL_CHAR_LIST_LOW_32_START;
8189+
break;
8190+
}
8191+
8192+
if ((type & XCL_BEGIN_WITH_RANGE) != 0)
8193+
{
8194+
/* The list begins with a range, so the start value of the list itself is
the candidate for the minimum. */
if (c < min)
8195+
min = c;
8196+
}
8197+
else
8198+
{
8199+
/* Otherwise the first stored item is the candidate. Items are 16 bit wide
for list_ind <= 1 and 32 bit wide for the last list. */
if ((type & XCL_ITEM_COUNT_MASK) == XCL_ITEM_COUNT_MASK)
8200+
{
8201+
/* When the item count field has all bits set, the first element is not a
character item (the skip loop below uses it as the number of elements that
follow), so the first character item is the second element. */
if (list_ind <= 1)
8202+
c = *(const uint16_t*)(next_char + 2);
8203+
else
8204+
c = *(const uint32_t*)(next_char + 4);
8205+
}
8206+
else
8207+
{
8208+
if (list_ind <= 1)
8209+
c = *(const uint16_t*)next_char;
8210+
else
8211+
c = *(const uint32_t*)next_char;
8212+
}
8213+
8214+
/* Discard the low XCL_CHAR_SHIFT bits of the item and add the offset of
the list to obtain the character value. */
c = char_list_add + (c >> XCL_CHAR_SHIFT);
8215+
if (c < min)
8216+
min = c;
8217+
}
8218+
8219+
/* Detect maximum. */
8220+
8221+
/* Skip intermediate ranges. */
8222+
/* Advance next_char past the items of every remaining list. When the loop
exits, type / list_ind describe the last non-zero descriptor and next_char
points just after the items of that list. */
while (TRUE)
8223+
{
8224+
if ((type & XCL_ITEM_COUNT_MASK) == XCL_ITEM_COUNT_MASK)
8225+
{
8226+
/* The first element holds a count: skip it together with the elements
it counts. */
if (list_ind <= 1)
8227+
{
8228+
c = *(const uint16_t*)next_char;
8229+
next_char += (c + 1) << 1;
8230+
}
8231+
else
8232+
{
8233+
c = *(const uint32_t*)next_char;
8234+
next_char += (c + 1) << 2;
8235+
}
8236+
}
8237+
else
8238+
next_char += (type & XCL_ITEM_COUNT_MASK) << (list_ind <= 1 ? 1 : 2);
8239+
8240+
if ((type >> XCL_TYPE_BIT_LEN) == 0)
8241+
break;
8242+
8243+
list_ind++;
8244+
type >>= XCL_TYPE_BIT_LEN;
8245+
}
8246+
8247+
SLJIT_ASSERT(list_ind <= 2 && type != 0);
8248+
/* Select the value added to stored items and the last character of the
final list; the latter is the maximum unless the final item overrides it. */
switch (list_ind)
8249+
{
8250+
case 0:
8251+
char_list_add = XCL_CHAR_LIST_LOW_16_ADD;
8252+
c = XCL_CHAR_LIST_LOW_16_END;
8253+
break;
8254+
8255+
case 1:
8256+
char_list_add = XCL_CHAR_LIST_HIGH_16_ADD;
8257+
c = XCL_CHAR_LIST_HIGH_16_END;
8258+
break;
8259+
8260+
default:
8261+
char_list_add = XCL_CHAR_LIST_LOW_32_ADD;
8262+
c = XCL_CHAR_LIST_LOW_32_END;
8263+
break;
8264+
}
8265+
8266+
if ((type & XCL_ITEM_COUNT_MASK) != 0)
8267+
{
8268+
/* Type is reused as temporary. */
8269+
/* Load the last item of the final list, which is stored directly before
next_char. */
if (list_ind <= 1)
8270+
type = *(const uint16_t*)(next_char - 2);
8271+
else
8272+
type = *(const uint32_t*)(next_char - 4);
8273+
8274+
/* If the XCL_CHAR_END bit is set on the last item, its character is the
candidate for the maximum; otherwise the list end value is kept. */
if (type & XCL_CHAR_END)
8275+
c = char_list_add + (type >> XCL_CHAR_SHIFT);
8276+
}
8277+
8278+
if (c > max)
8279+
max = c;
8280+
8281+
SLJIT_ASSERT(min <= MAX_UTF_CODE_POINT && max <= MAX_UTF_CODE_POINT && min <= max);
8282+
*min_ptr = min;
8283+
*max_ptr = max;
8284+
}
8285+
8286+
#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == [8|16] */
8287+
81078288
#ifdef SUPPORT_UNICODE
81088289
#define XCLASS_SAVE_CHAR 0x001
81098290
#define XCLASS_CHAR_SAVED 0x002
@@ -8126,7 +8307,7 @@ jump_list *found = NULL;
81268307
jump_list *check_result = NULL;
81278308
jump_list **list = (cc[0] & XCL_NOT) == 0 ? &found : backtracks;
81288309
sljit_uw c, charoffset;
8129-
sljit_u32 max = 256, min = READ_CHAR_MAX;
8310+
sljit_u32 max = READ_CHAR_MAX, min = 0;
81308311
struct sljit_jump *jump = NULL;
81318312
PCRE2_SPTR ccbegin;
81328313
sljit_u32 compares, invertcmp, depth;
@@ -8149,18 +8330,13 @@ ccbegin = cc;
81498330
compares = 0;
81508331

81518332
if (cc[-1] & XCL_MAP)
8152-
{
8153-
min = 0;
81548333
cc += 32 / sizeof(PCRE2_UCHAR);
8155-
}
81568334

81578335
#ifdef SUPPORT_UNICODE
81588336
while (*cc == XCL_PROP || *cc == XCL_NOTPROP)
81598337
{
81608338
compares++;
81618339
cc++;
8162-
max = READ_CHAR_MAX;
8163-
min = 0;
81648340

81658341
items = 0;
81668342

@@ -8256,22 +8432,16 @@ if (category_list == UCPCAT_ALL)
82568432
}
82578433
#endif
82588434

8259-
ranges.range_count = 0;
8260-
ranges.ranges = ranges.local_ranges;
8261-
ranges.stack = ranges.local_stack;
8262-
82638435
if (*cc != XCL_END)
82648436
{
8265-
xclass_compute_ranges(common, cc, &ranges);
8266-
8267-
if (ranges.stack == NULL)
8268-
return;
8269-
8270-
if (ranges.ranges[ranges.range_count - 1] > max)
8271-
max = ranges.ranges[ranges.range_count - 1];
8272-
if (ranges.ranges[0] < min)
8273-
min = ranges.ranges[0];
8274-
8437+
#if defined SUPPORT_UNICODE && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16)
8438+
if (common->utf && compares == 0)
8439+
{
8440+
max = 0;
8441+
min = (ccbegin[-1] & XCL_MAP) != 0 ? 0 : READ_CHAR_MAX;
8442+
xclass_update_min_max(common, cc, &min, &max);
8443+
}
8444+
#endif
82758445
compares++;
82768446
#ifdef SUPPORT_UNICODE
82778447
unicode_status |= XCLASS_SAVE_CHAR;
@@ -8282,8 +8452,6 @@ if (*cc != XCL_END)
82828452
if (compares == 0 && category_list == 0)
82838453
{
82848454
/* No characters are accepted, same as (*F) or dotall. */
8285-
SLJIT_ASSERT(ranges.stack == ranges.local_stack);
8286-
82878455
compile_char1_matchingpath(common, OP_ALLANY, cc, backtracks, FALSE);
82888456
if (list != backtracks)
82898457
add_jump(compiler, backtracks, JUMP(SLJIT_JUMP));
@@ -8324,11 +8492,6 @@ if ((cc[-1] & XCL_MAP) != 0)
83248492

83258493
cc += 32 / sizeof(PCRE2_UCHAR);
83268494
}
8327-
else
8328-
{
8329-
OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, min);
8330-
add_jump(compiler, (cc[-1] & XCL_NOT) == 0 ? backtracks : &found, CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, max - min));
8331-
}
83328495

83338496
#ifdef SUPPORT_UNICODE
83348497
if (unicode_status & XCLASS_NEEDS_UCD)
@@ -8690,19 +8853,35 @@ while (*cc == XCL_PROP || *cc == XCL_NOTPROP)
86908853
add_jump(compiler, compares > 0 ? list : backtracks, jump);
86918854
}
86928855

8693-
if (ranges.range_count == 0)
8856+
if (compares == 0)
86948857
{
8695-
SLJIT_ASSERT(compares == 0 && ranges.stack == ranges.local_stack);
8696-
86978858
if (found != NULL)
86988859
set_jumps(found, LABEL());
86998860
return;
87008861
}
8701-
#else
8702-
SLJIT_ASSERT(ranges.range_count > 0);
87038862
#endif /* SUPPORT_UNICODE */
87048863

87058864
SLJIT_ASSERT(compares == 1);
8865+
ranges.range_count = 0;
8866+
ranges.ranges = ranges.local_ranges;
8867+
ranges.stack = ranges.local_stack;
8868+
8869+
xclass_compute_ranges(common, cc, &ranges);
8870+
8871+
if (ranges.stack == NULL)
8872+
return;
8873+
8874+
#if (defined SLJIT_DEBUG && SLJIT_DEBUG) && \
8875+
defined SUPPORT_UNICODE && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16)
8876+
if (common->utf)
8877+
{
8878+
min = 0xffffffff;
8879+
max = 0;
8880+
xclass_update_min_max(common, cc, &min, &max);
8881+
SLJIT_ASSERT(ranges.ranges[0] == min && ranges.ranges[ranges.range_count - 1] == max);
8882+
}
8883+
#endif /* SLJIT_DEBUG && SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == [8|16] */
8884+
87068885
invertcmp = (list != backtracks);
87078886

87088887
if (ranges.range_count == 2)

0 commit comments

Comments
 (0)