Skip to content

Commit f7d2169

Browse files
authored
Rename LRE_FLAG_UTF16 to LRE_FLAG_UNICODE (#186)
Prep work for the RegExp `v` flag proposal, a.k.a. UnicodeSets (https://github.com/tc39/proposal-regexp-v-flag).
1 parent 42b7086 commit f7d2169

File tree

3 files changed: 38 additions and 38 deletions.

libregexp.c

Lines changed: 33 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ typedef struct {
6969
const uint8_t *buf_end;
7070
const uint8_t *buf_start;
7171
int re_flags;
72-
BOOL is_utf16;
72+
BOOL is_unicode;
7373
BOOL ignore_case;
7474
BOOL dotall;
7575
int capture_count;
@@ -122,11 +122,11 @@ static int dbuf_insert(DynBuf *s, int pos, int len)
122122
}
123123

124124
/* canonicalize with the specific JS regexp rules */
125-
static uint32_t lre_canonicalize(uint32_t c, BOOL is_utf16)
125+
static uint32_t lre_canonicalize(uint32_t c, BOOL is_unicode)
126126
{
127127
uint32_t res[LRE_CC_RES_LEN_MAX];
128128
int len;
129-
if (is_utf16) {
129+
if (is_unicode) {
130130
if (likely(c < 128)) {
131131
if (c >= 'A' && c <= 'Z')
132132
c = c - 'A' + 'a';
@@ -751,10 +751,10 @@ static int get_class_atom(REParseState *s, CharRange *cr,
751751
if ((c >= 'a' && c <= 'z') ||
752752
(c >= 'A' && c <= 'Z') ||
753753
(((c >= '0' && c <= '9') || c == '_') &&
754-
inclass && !s->is_utf16)) { /* Annex B.1.4 */
754+
inclass && !s->is_unicode)) { /* Annex B.1.4 */
755755
c &= 0x1f;
756756
p++;
757-
} else if (s->is_utf16) {
757+
} else if (s->is_unicode) {
758758
goto invalid_escape;
759759
} else {
760760
/* otherwise return '\' and 'c' */
@@ -764,7 +764,7 @@ static int get_class_atom(REParseState *s, CharRange *cr,
764764
break;
765765
case 'p':
766766
case 'P':
767-
if (s->is_utf16) {
767+
if (s->is_unicode) {
768768
if (parse_unicode_property(s, cr, &p, (c == 'P')))
769769
return -1;
770770
c = CLASS_RANGE_BASE;
@@ -773,14 +773,14 @@ static int get_class_atom(REParseState *s, CharRange *cr,
773773
/* fall thru */
774774
default:
775775
p--;
776-
ret = lre_parse_escape(&p, s->is_utf16 * 2);
776+
ret = lre_parse_escape(&p, s->is_unicode * 2);
777777
if (ret >= 0) {
778778
c = ret;
779779
} else {
780780
if (ret == -2 && *p != '\0' && strchr("^$\\.*+?()[]{}|/", *p)) {
781781
/* always valid to escape these characters */
782782
goto normal_char;
783-
} else if (s->is_utf16) {
783+
} else if (s->is_unicode) {
784784
invalid_escape:
785785
return re_parse_error(s, "invalid escape sequence in regular expression");
786786
} else {
@@ -802,7 +802,7 @@ static int get_class_atom(REParseState *s, CharRange *cr,
802802
/* normal char */
803803
if (c >= 128) {
804804
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
805-
if ((unsigned)c > 0xffff && !s->is_utf16) {
805+
if ((unsigned)c > 0xffff && !s->is_unicode) {
806806
/* XXX: should handle non BMP-1 code points */
807807
return re_parse_error(s, "malformed unicode char");
808808
}
@@ -878,7 +878,7 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
878878
if (*p == '-' && p[1] != ']') {
879879
const uint8_t *p0 = p + 1;
880880
if (c1 >= CLASS_RANGE_BASE) {
881-
if (s->is_utf16) {
881+
if (s->is_unicode) {
882882
cr_free(cr1);
883883
goto invalid_class_range;
884884
}
@@ -890,7 +890,7 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
890890
goto fail;
891891
if (c2 >= CLASS_RANGE_BASE) {
892892
cr_free(cr1);
893-
if (s->is_utf16) {
893+
if (s->is_unicode) {
894894
goto invalid_class_range;
895895
}
896896
/* Annex B: match '-' character */
@@ -1248,7 +1248,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
12481248
re_emit_op(s, REOP_prev);
12491249
break;
12501250
case '{':
1251-
if (s->is_utf16) {
1251+
if (s->is_unicode) {
12521252
return re_parse_error(s, "syntax error");
12531253
} else if (!is_digit(p[1])) {
12541254
/* Annex B: we accept '{' not followed by digits as a
@@ -1300,7 +1300,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
13001300
lookahead:
13011301
/* Annex B allows lookahead to be used as an atom for
13021302
the quantifiers */
1303-
if (!s->is_utf16 && !is_backward_lookahead) {
1303+
if (!s->is_unicode && !is_backward_lookahead) {
13041304
last_atom_start = s->byte_code.size;
13051305
last_capture_count = s->capture_count;
13061306
}
@@ -1376,15 +1376,15 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
13761376
/* annex B: we tolerate invalid group names in non
13771377
unicode mode if there is no named capture
13781378
definition */
1379-
if (s->is_utf16 || re_has_named_captures(s))
1379+
if (s->is_unicode || re_has_named_captures(s))
13801380
return re_parse_error(s, "expecting group name");
13811381
else
13821382
goto parse_class_atom;
13831383
}
13841384
p1 += 3;
13851385
if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
13861386
&p1)) {
1387-
if (s->is_utf16 || re_has_named_captures(s))
1387+
if (s->is_unicode || re_has_named_captures(s))
13881388
return re_parse_error(s, "invalid group name");
13891389
else
13901390
goto parse_class_atom;
@@ -1395,7 +1395,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
13951395
after (inefficient, but hopefully not common */
13961396
c = re_parse_captures(s, &dummy_res, s->u.tmp_buf);
13971397
if (c < 0) {
1398-
if (s->is_utf16 || re_has_named_captures(s))
1398+
if (s->is_unicode || re_has_named_captures(s))
13991399
return re_parse_error(s, "group name not defined");
14001400
else
14011401
goto parse_class_atom;
@@ -1407,7 +1407,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
14071407
case '0':
14081408
p += 2;
14091409
c = 0;
1410-
if (s->is_utf16) {
1410+
if (s->is_unicode) {
14111411
if (is_digit(*p)) {
14121412
return re_parse_error(s, "invalid decimal escape in regular expression");
14131413
}
@@ -1429,7 +1429,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
14291429

14301430
c = parse_digits(&p, FALSE);
14311431
if (c < 0 || (c >= s->capture_count && c >= re_count_captures(s))) {
1432-
if (!s->is_utf16) {
1432+
if (!s->is_unicode) {
14331433
/* Annex B.1.4: accept legacy octal */
14341434
p = q;
14351435
if (*p <= '7') {
@@ -1471,7 +1471,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
14711471
break;
14721472
case ']':
14731473
case '}':
1474-
if (s->is_utf16)
1474+
if (s->is_unicode)
14751475
return re_parse_error(s, "syntax error");
14761476
goto parse_class_atom;
14771477
default:
@@ -1493,7 +1493,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
14931493
return -1;
14941494
} else {
14951495
if (s->ignore_case)
1496-
c = lre_canonicalize(c, s->is_utf16);
1496+
c = lre_canonicalize(c, s->is_unicode);
14971497
if (c <= 0x7f)
14981498
re_emit_op_u8(s, REOP_char8, c);
14991499
else if (c <= 0xffff)
@@ -1531,7 +1531,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
15311531
/* As an extension (see ES6 annex B), we accept '{' not
15321532
followed by digits as a normal atom */
15331533
if (!is_digit(p[1])) {
1534-
if (s->is_utf16)
1534+
if (s->is_unicode)
15351535
goto invalid_quant_count;
15361536
break;
15371537
}
@@ -1550,7 +1550,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
15501550
quant_max = INT32_MAX; /* infinity */
15511551
}
15521552
}
1553-
if (*p != '}' && !s->is_utf16) {
1553+
if (*p != '}' && !s->is_unicode) {
15541554
/* Annex B: normal atom if invalid '{' syntax */
15551555
p = p1;
15561556
break;
@@ -1839,7 +1839,7 @@ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
18391839
s->buf_end = s->buf_ptr + buf_len;
18401840
s->buf_start = s->buf_ptr;
18411841
s->re_flags = re_flags;
1842-
s->is_utf16 = ((re_flags & LRE_FLAG_UTF16) != 0);
1842+
s->is_unicode = ((re_flags & LRE_FLAG_UNICODE) != 0);
18431843
is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0);
18441844
s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0);
18451845
s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0);
@@ -2039,7 +2039,7 @@ typedef struct {
20392039
int stack_size_max;
20402040
BOOL multi_line;
20412041
BOOL ignore_case;
2042-
BOOL is_utf16;
2042+
BOOL is_unicode;
20432043
void *opaque; /* used for stack overflow check */
20442044

20452045
size_t state_size;
@@ -2189,7 +2189,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
21892189
goto no_match;
21902190
GET_CHAR(c, cptr, cbuf_end);
21912191
if (s->ignore_case) {
2192-
c = lre_canonicalize(c, s->is_utf16);
2192+
c = lre_canonicalize(c, s->is_unicode);
21932193
}
21942194
if (val != c)
21952195
goto no_match;
@@ -2346,8 +2346,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
23462346
GET_CHAR(c1, cptr1, cptr1_end);
23472347
GET_CHAR(c2, cptr, cbuf_end);
23482348
if (s->ignore_case) {
2349-
c1 = lre_canonicalize(c1, s->is_utf16);
2350-
c2 = lre_canonicalize(c2, s->is_utf16);
2349+
c1 = lre_canonicalize(c1, s->is_unicode);
2350+
c2 = lre_canonicalize(c2, s->is_unicode);
23512351
}
23522352
if (c1 != c2)
23532353
goto no_match;
@@ -2360,8 +2360,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
23602360
GET_PREV_CHAR(c1, cptr1, cptr1_start);
23612361
GET_PREV_CHAR(c2, cptr, s->cbuf);
23622362
if (s->ignore_case) {
2363-
c1 = lre_canonicalize(c1, s->is_utf16);
2364-
c2 = lre_canonicalize(c2, s->is_utf16);
2363+
c1 = lre_canonicalize(c1, s->is_unicode);
2364+
c2 = lre_canonicalize(c2, s->is_unicode);
23652365
}
23662366
if (c1 != c2)
23672367
goto no_match;
@@ -2380,7 +2380,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
23802380
goto no_match;
23812381
GET_CHAR(c, cptr, cbuf_end);
23822382
if (s->ignore_case) {
2383-
c = lre_canonicalize(c, s->is_utf16);
2383+
c = lre_canonicalize(c, s->is_unicode);
23842384
}
23852385
idx_min = 0;
23862386
low = get_u16(pc + 0 * 4);
@@ -2420,7 +2420,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
24202420
goto no_match;
24212421
GET_CHAR(c, cptr, cbuf_end);
24222422
if (s->ignore_case) {
2423-
c = lre_canonicalize(c, s->is_utf16);
2423+
c = lre_canonicalize(c, s->is_unicode);
24242424
}
24252425
idx_min = 0;
24262426
low = get_u32(pc + 0 * 8);
@@ -2512,13 +2512,13 @@ int lre_exec(uint8_t **capture,
25122512
re_flags = bc_buf[RE_HEADER_FLAGS];
25132513
s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0;
25142514
s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0;
2515-
s->is_utf16 = (re_flags & LRE_FLAG_UTF16) != 0;
2515+
s->is_unicode = (re_flags & LRE_FLAG_UNICODE) != 0;
25162516
s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT];
25172517
s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE];
25182518
s->cbuf = cbuf;
25192519
s->cbuf_end = cbuf + (clen << cbuf_type);
25202520
s->cbuf_type = cbuf_type;
2521-
if (s->cbuf_type == 1 && s->is_utf16)
2521+
if (s->cbuf_type == 1 && s->is_unicode)
25222522
s->cbuf_type = 2;
25232523
s->opaque = opaque;
25242524

libregexp.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
#define LRE_FLAG_IGNORECASE (1 << 1)
3535
#define LRE_FLAG_MULTILINE (1 << 2)
3636
#define LRE_FLAG_DOTALL (1 << 3)
37-
#define LRE_FLAG_UTF16 (1 << 4)
37+
#define LRE_FLAG_UNICODE (1 << 4)
3838
#define LRE_FLAG_STICKY (1 << 5)
3939
#define LRE_FLAG_INDICES (1 << 6) /* Unused by libregexp, just recorded. */
4040

quickjs.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40450,7 +40450,7 @@ static JSValue js_compile_regexp(JSContext *ctx, JSValueConst pattern,
4045040450
mask = LRE_FLAG_DOTALL;
4045140451
break;
4045240452
case 'u':
40453-
mask = LRE_FLAG_UTF16;
40453+
mask = LRE_FLAG_UNICODE;
4045440454
break;
4045540455
case 'y':
4045640456
mask = LRE_FLAG_STICKY;
@@ -40468,7 +40468,7 @@ static JSValue js_compile_regexp(JSContext *ctx, JSValueConst pattern,
4046840468
JS_FreeCString(ctx, str);
4046940469
}
4047040470

40471-
str = JS_ToCStringLen2(ctx, &len, pattern, !(re_flags & LRE_FLAG_UTF16));
40471+
str = JS_ToCStringLen2(ctx, &len, pattern, !(re_flags & LRE_FLAG_UNICODE));
4047240472
if (!str)
4047340473
return JS_EXCEPTION;
4047440474
re_bytecode_buf = lre_compile(&re_bytecode_len, error_msg,
@@ -41113,7 +41113,7 @@ static JSValue JS_RegExpDelete(JSContext *ctx, JSValueConst this_val, JSValueCon
4111341113
break;
4111441114
}
4111541115
if (end == start) {
41116-
if (!(re_flags & LRE_FLAG_UTF16) || (unsigned)end >= str->len || !str->is_wide_char) {
41116+
if (!(re_flags & LRE_FLAG_UNICODE) || (unsigned)end >= str->len || !str->is_wide_char) {
4111741117
end++;
4111841118
} else {
4111941119
string_getc(str, &end);
@@ -41873,7 +41873,7 @@ static const JSCFunctionListEntry js_regexp_proto_funcs[] = {
4187341873
JS_CGETSET_MAGIC_DEF("ignoreCase", js_regexp_get_flag, NULL, LRE_FLAG_IGNORECASE ),
4187441874
JS_CGETSET_MAGIC_DEF("multiline", js_regexp_get_flag, NULL, LRE_FLAG_MULTILINE ),
4187541875
JS_CGETSET_MAGIC_DEF("dotAll", js_regexp_get_flag, NULL, LRE_FLAG_DOTALL ),
41876-
JS_CGETSET_MAGIC_DEF("unicode", js_regexp_get_flag, NULL, LRE_FLAG_UTF16 ),
41876+
JS_CGETSET_MAGIC_DEF("unicode", js_regexp_get_flag, NULL, LRE_FLAG_UNICODE ),
4187741877
JS_CGETSET_MAGIC_DEF("sticky", js_regexp_get_flag, NULL, LRE_FLAG_STICKY ),
4187841878
JS_CGETSET_MAGIC_DEF("hasIndices", js_regexp_get_flag, NULL, LRE_FLAG_INDICES ),
4187941879
JS_CFUNC_DEF("exec", 1, js_regexp_exec ),

0 commit comments

Comments
 (0)