Skip to content

Commit e2bc644

Browse files
authored
Optimize RegExp ASCII literal matching (#94)
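Add REOP_char8, an opcode that matches a single byte and is emitted for literal characters up to 0x7f. Each such literal now takes 2 bytes of bytecode instead of 3, which compresses bytecode for the ASCII common case by 33% and reduces the regexp_ascii benchmark running time by 4%. The regexp_utf16 benchmark is unaffected.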
1 parent e49da8e commit e2bc644

File tree

3 files changed

+47
-12
lines changed

3 files changed

+47
-12
lines changed

libregexp-opcode.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@
2525
#ifdef DEF
2626

2727
DEF(invalid, 1) /* never used */
28-
DEF(char, 3)
28+
DEF(char8, 2) /* 7 bits in fact */
29+
DEF(char16, 3)
2930
DEF(char32, 5)
3031
DEF(dot, 1)
3132
DEF(any, 1) /* same as dot but match any character including line terminator */

libregexp.c

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -315,15 +315,15 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
315315
}
316316
printf("%s", reopcode_info[opcode].name);
317317
switch(opcode) {
318-
case REOP_char:
318+
case REOP_char8:
319+
val = get_u8(buf + pos + 1);
320+
goto printchar;
321+
case REOP_char16:
319322
val = get_u16(buf + pos + 1);
320-
if (val >= ' ' && val <= 126)
321-
printf(" '%c'", val);
322-
else
323-
printf(" 0x%04x", val);
324-
break;
323+
goto printchar;
325324
case REOP_char32:
326325
val = get_u32(buf + pos + 1);
326+
printchar:
327327
if (val >= ' ' && val <= 126)
328328
printf(" '%c'", val);
329329
else
@@ -971,8 +971,9 @@ static int re_check_advance(const uint8_t *bc_buf, int bc_buf_len)
971971
val = get_u16(bc_buf + pos + 1);
972972
len += val * 8;
973973
goto simple_char;
974-
case REOP_char:
975974
case REOP_char32:
975+
case REOP_char16:
976+
case REOP_char8:
976977
case REOP_dot:
977978
case REOP_any:
978979
simple_char:
@@ -1050,8 +1051,9 @@ static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len)
10501051
val = get_u16(bc_buf + pos + 1);
10511052
len += val * 8;
10521053
goto simple_char;
1053-
case REOP_char:
10541054
case REOP_char32:
1055+
case REOP_char16:
1056+
case REOP_char8:
10551057
case REOP_dot:
10561058
case REOP_any:
10571059
simple_char:
@@ -1494,8 +1496,10 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
14941496
} else {
14951497
if (s->ignore_case)
14961498
c = lre_canonicalize(c, s->is_utf16);
1497-
if (c <= 0xffff)
1498-
re_emit_op_u16(s, REOP_char, c);
1499+
if (c <= 0x7f)
1500+
re_emit_op_u8(s, REOP_char8, c);
1501+
else if (c <= 0xffff)
1502+
re_emit_op_u16(s, REOP_char16, c);
14991503
else
15001504
re_emit_op_u32(s, REOP_char32, c);
15011505
}
@@ -2181,9 +2185,13 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
21812185
val = get_u32(pc);
21822186
pc += 4;
21832187
goto test_char;
2184-
case REOP_char:
2188+
case REOP_char16:
21852189
val = get_u16(pc);
21862190
pc += 2;
2191+
goto test_char;
2192+
case REOP_char8:
2193+
val = get_u8(pc);
2194+
pc += 1;
21872195
test_char:
21882196
if (cptr >= cbuf_end)
21892197
goto no_match;

tests/microbench.js

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -654,6 +654,30 @@ function math_min(n)
654654
return n * 1000;
655655
}
656656

657+
/* Benchmark: run a pure-ASCII literal regexp against an ASCII subject.
   Performs n rounds of 10000 exec() calls each and returns the total
   number of calls (n * 10000). */
function regexp_ascii(n)
{
    var subject = "the quick brown fox jumped over the lazy dog";
    var round = 0, iter, match;
    while (round < n) {
        /* the regexp literal stays inside the inner loop so that it is
           evaluated on every iteration */
        for (iter = 0; iter < 10000; iter++) {
            match = /the quick brown fox/.exec(subject);
        }
        /* store the last result in global_res (not declared in this
           function) once per round */
        global_res = match;
        round++;
    }
    return n * 10000;
}
668+
669+
/* Benchmark: run a literal regexp containing non-ASCII characters against
   a subject string that contains the same characters.
   Performs n rounds of 10000 exec() calls each and returns the total
   number of calls (n * 10000). */
function regexp_utf16(n)
{
    var i, j, r, s;
    s = "the quick brown ᶠᵒˣ jumped over the lazy ᵈᵒᵍ";
    for(j = 0; j < n; j++) {
        for(i = 0; i < 10000; i++) {
            /* The pattern spells out the same "ᶠᵒˣ" that follows "brown "
               in s, so it matches at index 0 just as the pattern in
               regexp_ascii matches its subject. A pattern with a lone "ˣ"
               after "brown " can never match this subject and would only
               measure the failure path. */
            r = /the quick brown ᶠᵒˣ/.exec(s);
        }
        /* store the last result in global_res (not declared in this
           function) once per round */
        global_res = r;
    }
    return n * 10000;
}
680+
657681
/* incremental string construction as local var */
658682
function string_build1(n)
659683
{
@@ -951,6 +975,8 @@ function main(argc, argv, g)
951975
array_for_in,
952976
array_for_of,
953977
math_min,
978+
regexp_ascii,
979+
regexp_utf16,
954980
string_build1,
955981
string_build2,
956982
//string_build3,

0 commit comments

Comments
 (0)