Skip to content

Commit 88b1c47

Browse files
author
Zoltan Herczeg
committed
Use bitset for unicode category select in jit
1 parent 9667c91 commit 88b1c47

File tree

1 file changed

+40
-80
lines changed

1 file changed

+40
-80
lines changed

src/pcre2_jit_compile.c

Lines changed: 40 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -6691,6 +6691,15 @@ else
66916691
JUMPTO(SLJIT_JUMP, mainloop);
66926692
}
66936693

6694+
#ifdef SUPPORT_UNICODE
6695+
/* Builds a mask with exactly one bit set, at position "category". The
result is an int, so the position must stay below the sign bit of int. */
#define UCPCAT(category) (1 << (category))
6696+
/* Mask with the bits of two categories set (the union of two UCPCAT
masks). */
#define UCPCAT2(first, second) (UCPCAT(first) | UCPCAT(second))
6697+
/* Mask with the bits of three categories set: the two-category mask of
the first pair extended with the bit of the third. */
#define UCPCAT3(first, second, third) (UCPCAT2(first, second) | UCPCAT(third))
6698+
/* Mask with every bit from position "first" up to and including position
"last" set. Computed as 2^(last+1) - 2^first, which leaves exactly the
bits first..last set when first <= last. The arithmetic is done in int,
so (last) + 1 must stay below the sign bit of int. */
#define UCPCAT_RANGE(first, last) ((1 << ((last) + 1)) - (1 << (first)))
6699+
/* Mask covering every category value from ucp_Ll through ucp_Lu
inclusive. NOTE(review): presumably these are the letter (L*) categories;
this relies on them being numbered contiguously between ucp_Ll and
ucp_Lu - confirm against the ucp_* enum. */
#define UCPCAT_L UCPCAT_RANGE(ucp_Ll, ucp_Lu)
6700+
/* Mask covering every category value from ucp_Nd through ucp_No
inclusive. NOTE(review): presumably these are the number (N*) categories;
this relies on them being numbered contiguously between ucp_Nd and
ucp_No - confirm against the ucp_* enum. */
#define UCPCAT_N UCPCAT_RANGE(ucp_Nd, ucp_No)
6701+
#endif
6702+
66946703
static void check_wordboundary(compiler_common *common, BOOL ucp)
66956704
{
66966705
DEFINE_COMPILER;
@@ -6748,17 +6757,9 @@ else
67486757
if (ucp)
67496758
{
67506759
add_jump(compiler, &common->getucdtype, JUMP(SLJIT_FAST_CALL));
6751-
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, ucp_Mn);
6752-
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);
6753-
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, ucp_Pc);
6754-
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
6755-
OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ucp_Ll);
6756-
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, ucp_Lu - ucp_Ll);
6757-
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);
6758-
OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ucp_Nd - ucp_Ll);
6759-
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, ucp_No - ucp_Nd);
6760-
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);
6761-
OP1(SLJIT_MOV, TMP3, 0, TMP2, 0);
6760+
OP2(SLJIT_SHL, TMP2, 0, SLJIT_IMM, 1, TMP1, 0);
6761+
OP2U(SLJIT_AND | SLJIT_SET_Z, TMP2, 0, SLJIT_IMM, UCPCAT2(ucp_Mn, ucp_Pc) | UCPCAT_L | UCPCAT_N);
6762+
OP_FLAGS(SLJIT_MOV, TMP3, 0, SLJIT_NOT_ZERO);
67626763
}
67636764
else
67646765
#endif /* SUPPORT_UNICODE */
@@ -6795,16 +6796,9 @@ valid_utf = LABEL();
67956796
if (ucp)
67966797
{
67976798
add_jump(compiler, &common->getucdtype, JUMP(SLJIT_FAST_CALL));
6798-
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, ucp_Mn);
6799-
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);
6800-
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, ucp_Pc);
6801-
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
6802-
OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ucp_Ll);
6803-
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, ucp_Lu - ucp_Ll);
6804-
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);
6805-
OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ucp_Nd - ucp_Ll);
6806-
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, ucp_No - ucp_Nd);
6807-
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);
6799+
OP2(SLJIT_SHL, TMP2, 0, SLJIT_IMM, 1, TMP1, 0);
6800+
OP2U(SLJIT_AND | SLJIT_SET_Z, TMP2, 0, SLJIT_IMM, UCPCAT2(ucp_Mn, ucp_Pc) | UCPCAT_L | UCPCAT_N);
6801+
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_NOT_ZERO);
68086802
}
68096803
else
68106804
#endif /* SUPPORT_UNICODE */
@@ -7543,16 +7537,6 @@ return cc;
75437537

75447538
#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
75457539

7546-
#define SET_TYPE_OFFSET(value) \
7547-
if ((value) != typeoffset) \
7548-
{ \
7549-
if ((value) < typeoffset) \
7550-
OP2(SLJIT_ADD, typereg, 0, typereg, 0, SLJIT_IMM, typeoffset - (value)); \
7551-
else \
7552-
OP2(SLJIT_SUB, typereg, 0, typereg, 0, SLJIT_IMM, (value) - typeoffset); \
7553-
} \
7554-
typeoffset = (value);
7555-
75567540
#define SET_CHAR_OFFSET(value) \
75577541
if ((value) != charoffset) \
75587542
{ \
@@ -7577,7 +7561,6 @@ static PCRE2_SPTR compile_char1_matchingpath(compiler_common *common, PCRE2_UCHA
75777561
#define XCLASS_SCRIPT_EXTENSION_NOTPROP 0x080
75787562
#define XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR 0x100
75797563
#define XCLASS_SCRIPT_EXTENSION_RESTORE_LOCALS0 0x200
7580-
75817564
#endif /* SUPPORT_UNICODE */
75827565

75837566
static void compile_xclass_matchingpath(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks)
@@ -7597,7 +7580,6 @@ BOOL utf = common->utf;
75977580
sljit_u32 unicode_status = 0;
75987581
int typereg = TMP1;
75997582
const sljit_u32 *other_cases;
7600-
sljit_uw typeoffset;
76017583
#endif /* SUPPORT_UNICODE */
76027584

76037585
/* Scanning the necessary info. */
@@ -7672,6 +7654,7 @@ while (*cc != XCL_END)
76727654
case PT_LAMP:
76737655
case PT_GC:
76747656
case PT_PC:
7657+
case PT_WORD:
76757658
case PT_ALNUM:
76767659
unicode_status |= XCLASS_HAS_TYPE;
76777660
break;
@@ -7692,7 +7675,6 @@ while (*cc != XCL_END)
76927675

76937676
case PT_SPACE:
76947677
case PT_PXSPACE:
7695-
case PT_WORD:
76967678
case PT_PXGRAPH:
76977679
case PT_PXPRINT:
76987680
case PT_PXPUNCT:
@@ -8027,16 +8009,14 @@ if (unicode_status & XCLASS_NEEDS_UCD)
80278009
typereg = RETURN_ADDR;
80288010

80298011
OP1(SLJIT_MOV_U8, typereg, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype));
8012+
OP2(SLJIT_SHL, typereg, 0, SLJIT_IMM, 1, typereg, 0);
80308013
}
80318014
}
80328015
#endif /* SUPPORT_UNICODE */
80338016

80348017
/* Generating code. */
80358018
charoffset = 0;
80368019
numberofcmps = 0;
8037-
#ifdef SUPPORT_UNICODE
8038-
typeoffset = 0;
8039-
#endif /* SUPPORT_UNICODE */
80408020

80418021
while (*cc != XCL_END)
80428022
{
@@ -8109,23 +8089,18 @@ while (*cc != XCL_END)
81098089
break;
81108090

81118091
case PT_LAMP:
8112-
OP2U(SLJIT_SUB | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, ucp_Lu - typeoffset);
8113-
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);
8114-
OP2U(SLJIT_SUB | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, ucp_Ll - typeoffset);
8115-
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
8116-
OP2U(SLJIT_SUB | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, ucp_Lt - typeoffset);
8117-
OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_EQUAL);
8092+
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT3(ucp_Lu, ucp_Ll, ucp_Lt));
81188093
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
81198094
break;
81208095

81218096
case PT_GC:
8122-
c = PRIV(ucp_typerange)[(int)cc[1] * 2];
8123-
SET_TYPE_OFFSET(c);
8124-
jump = CMP(SLJIT_LESS_EQUAL ^ invertcmp, typereg, 0, SLJIT_IMM, PRIV(ucp_typerange)[(int)cc[1] * 2 + 1] - c);
8097+
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(PRIV(ucp_typerange)[(int)cc[1] * 2], PRIV(ucp_typerange)[(int)cc[1] * 2 + 1]));
8098+
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
81258099
break;
81268100

81278101
case PT_PC:
8128-
jump = CMP(SLJIT_EQUAL ^ invertcmp, typereg, 0, SLJIT_IMM, (int)cc[1] - typeoffset);
8102+
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT(cc[1]));
8103+
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
81298104
break;
81308105

81318106
case PT_SC:
@@ -8148,26 +8123,18 @@ while (*cc != XCL_END)
81488123
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, 0x180e - 0x9);
81498124
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
81508125

8151-
SET_TYPE_OFFSET(ucp_Zl);
8152-
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, typereg, 0, SLJIT_IMM, ucp_Zs - ucp_Zl);
8153-
OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_LESS_EQUAL);
8126+
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(ucp_Zl, ucp_Zs));
8127+
OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_NOT_ZERO);
81548128
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
81558129
break;
81568130

81578131
case PT_WORD:
8158-
OP2U(SLJIT_SUB | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, ucp_Mn - typeoffset);
8159-
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);
8160-
OP2U(SLJIT_SUB | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, ucp_Pc - typeoffset);
8161-
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
8162-
/* Fall through. */
8132+
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT2(ucp_Mn, ucp_Pc) | UCPCAT_L | UCPCAT_N);
8133+
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
8134+
break;
81638135

81648136
case PT_ALNUM:
8165-
SET_TYPE_OFFSET(ucp_Ll);
8166-
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, typereg, 0, SLJIT_IMM, ucp_Lu - ucp_Ll);
8167-
OP_FLAGS((*cc == PT_ALNUM) ? SLJIT_MOV : SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);
8168-
SET_TYPE_OFFSET(ucp_Nd);
8169-
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, typereg, 0, SLJIT_IMM, ucp_No - ucp_Nd);
8170-
OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_LESS_EQUAL);
8137+
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_L | UCPCAT_N);
81718138
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
81728139
break;
81738140

@@ -8242,12 +8209,11 @@ while (*cc != XCL_END)
82428209
break;
82438210

82448211
case PT_PXGRAPH:
8245-
/* C and Z groups are the farthest two groups. */
8246-
SET_TYPE_OFFSET(ucp_Ll);
8247-
OP2U(SLJIT_SUB | SLJIT_SET_GREATER, typereg, 0, SLJIT_IMM, ucp_So - ucp_Ll);
8248-
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_GREATER);
8212+
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(ucp_Cc, ucp_Cs) | UCPCAT_RANGE(ucp_Zl, ucp_Zs));
8213+
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_NOT_ZERO);
82498214

8250-
jump = CMP(SLJIT_NOT_EQUAL, typereg, 0, SLJIT_IMM, ucp_Cf - ucp_Ll);
8215+
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT(ucp_Cf));
8216+
jump = JUMP(SLJIT_ZERO);
82518217

82528218
/* In case of ucp_Cf, we overwrite the result. */
82538219
SET_CHAR_OFFSET(0x2066);
@@ -8265,15 +8231,11 @@ while (*cc != XCL_END)
82658231
break;
82668232

82678233
case PT_PXPRINT:
8268-
/* C and Z groups are the farthest two groups. */
8269-
SET_TYPE_OFFSET(ucp_Ll);
8270-
OP2U(SLJIT_SUB | SLJIT_SET_GREATER, typereg, 0, SLJIT_IMM, ucp_So - ucp_Ll);
8271-
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_GREATER);
8234+
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(ucp_Cc, ucp_Cs) | UCPCAT2(ucp_Zl, ucp_Zp));
8235+
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_NOT_ZERO);
82728236

8273-
OP2U(SLJIT_SUB | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, ucp_Zs - ucp_Ll);
8274-
OP_FLAGS(SLJIT_AND, TMP2, 0, SLJIT_NOT_EQUAL);
8275-
8276-
jump = CMP(SLJIT_NOT_EQUAL, typereg, 0, SLJIT_IMM, ucp_Cf - ucp_Ll);
8237+
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT(ucp_Cf));
8238+
jump = JUMP(SLJIT_ZERO);
82778239

82788240
/* In case of ucp_Cf, we overwrite the result. */
82798241
SET_CHAR_OFFSET(0x2066);
@@ -8288,17 +8250,15 @@ while (*cc != XCL_END)
82888250
break;
82898251

82908252
case PT_PXPUNCT:
8291-
SET_TYPE_OFFSET(ucp_Sc);
8292-
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, typereg, 0, SLJIT_IMM, ucp_So - ucp_Sc);
8293-
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);
8253+
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(ucp_Sc, ucp_So));
8254+
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_NOT_ZERO);
82948255

82958256
SET_CHAR_OFFSET(0);
82968257
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0x7f);
82978258
OP_FLAGS(SLJIT_AND, TMP2, 0, SLJIT_LESS_EQUAL);
82988259

8299-
SET_TYPE_OFFSET(ucp_Pc);
8300-
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, typereg, 0, SLJIT_IMM, ucp_Ps - ucp_Pc);
8301-
OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_LESS_EQUAL);
8260+
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(ucp_Pc, ucp_Ps));
8261+
OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_NOT_ZERO);
83028262
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
83038263
break;
83048264

0 commit comments

Comments
 (0)