Skip to content

Commit bef2a12

Browse files
authored
DRY surrogate pair handling (#95)
1 parent d1960d1 commit bef2a12

File tree

3 files changed

+100
-93
lines changed

3 files changed

+100
-93
lines changed

cutils.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,21 @@ static inline void dbuf_set_error(DynBuf *s)
278278
int unicode_to_utf8(uint8_t *buf, unsigned int c);
279279
int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp);
280280

281+
/* Report whether `c` is a UTF-16 high (leading) surrogate code unit,
   i.e. lies in the range 0xD800..0xDBFF inclusive. Values above 0xFFFF
   are never surrogates. */
static inline BOOL is_hi_surrogate(uint32_t c)
{
    return c >= 0xD800 && c <= 0xDBFF;
}
285+
286+
/* Report whether `c` is a UTF-16 low (trailing) surrogate code unit,
   i.e. lies in the range 0xDC00..0xDFFF inclusive. Values above 0xFFFF
   are never surrogates. */
static inline BOOL is_lo_surrogate(uint32_t c)
{
    return c >= 0xDC00 && c <= 0xDFFF;
}
290+
291+
/* Combine a surrogate pair into the code point it encodes.
   Only the low 10 bits of each argument are used: the bits from `hi`
   form the upper half of the 20-bit offset, the bits from `lo` the lower
   half, and the offset is added to 0x10000. The arguments are not
   validated; callers are expected to have checked them. */
static inline uint32_t from_surrogate(uint32_t hi, uint32_t lo)
{
    uint32_t upper = (hi & 0x3FF) << 10;
    uint32_t lower = lo & 0x3FF;

    return 0x10000 + upper + lower;
}
295+
281296
static inline int from_hex(int c)
282297
{
283298
if (c >= '0' && c <= '9')

libregexp.c

Lines changed: 70 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -550,7 +550,7 @@ int lre_parse_escape(const uint8_t **pp, int allow_utf16)
550550
}
551551
c = (c << 4) | h;
552552
}
553-
if (c >= 0xd800 && c < 0xdc00 &&
553+
if (is_hi_surrogate(c) &&
554554
allow_utf16 == 2 && p[0] == '\\' && p[1] == 'u') {
555555
/* convert an escaped surrogate pair into a
556556
unicode char */
@@ -561,9 +561,9 @@ int lre_parse_escape(const uint8_t **pp, int allow_utf16)
561561
break;
562562
c1 = (c1 << 4) | h;
563563
}
564-
if (i == 4 && c1 >= 0xdc00 && c1 < 0xe000) {
564+
if (i == 4 && is_lo_surrogate(c1)) {
565565
p += 6;
566-
c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000;
566+
c = from_surrogate(c, c1);
567567
}
568568
}
569569
}
@@ -1092,10 +1092,10 @@ static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp)
10921092
break;
10931093
} else if (c >= 128) {
10941094
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
1095-
if (c >= 0xD800 && c <= 0xDBFF) {
1095+
if (is_hi_surrogate(c)) {
10961096
d = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1);
1097-
if (d >= 0xDC00 && d <= 0xDFFF) {
1098-
c = 0x10000 + 0x400 * (c - 0xD800) + (d - 0xDC00);
1097+
if (is_lo_surrogate(d)) {
1098+
c = from_surrogate(c, d);
10991099
p = p1;
11001100
}
11011101
}
@@ -1935,88 +1935,81 @@ static BOOL is_word_char(uint32_t c)
19351935
if (cbuf_type == 0) { \
19361936
c = *cptr++; \
19371937
} else { \
1938-
uint32_t __c1; \
1939-
c = *(uint16_t *)cptr; \
1940-
cptr += 2; \
1941-
if (c >= 0xd800 && c < 0xdc00 && \
1942-
cbuf_type == 2 && cptr < cbuf_end) { \
1943-
__c1 = *(uint16_t *)cptr; \
1944-
if (__c1 >= 0xdc00 && __c1 < 0xe000) { \
1945-
c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \
1946-
cptr += 2; \
1947-
} \
1948-
} \
1938+
const uint16_t *_p = (uint16_t *)cptr; \
1939+
const uint16_t *_end = (uint16_t *)cbuf_end; \
1940+
c = *_p++; \
1941+
if (is_hi_surrogate(c)) \
1942+
if (cbuf_type == 2) \
1943+
if (_p < _end) \
1944+
if (is_lo_surrogate(*_p)) \
1945+
c = from_surrogate(c, *_p++); \
1946+
cptr = (void *) _p; \
19491947
} \
19501948
} while (0)
19511949

1952-
#define PEEK_CHAR(c, cptr, cbuf_end) \
1953-
do { \
1954-
if (cbuf_type == 0) { \
1955-
c = cptr[0]; \
1956-
} else { \
1957-
uint32_t __c1; \
1958-
c = ((uint16_t *)cptr)[0]; \
1959-
if (c >= 0xd800 && c < 0xdc00 && \
1960-
cbuf_type == 2 && (cptr + 2) < cbuf_end) { \
1961-
__c1 = ((uint16_t *)cptr)[1]; \
1962-
if (__c1 >= 0xdc00 && __c1 < 0xe000) { \
1963-
c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \
1964-
} \
1965-
} \
1966-
} \
1950+
/* Read the character at `cptr` into `c` without moving `cptr`.
   Relies on a variable `cbuf_type` being in scope at the expansion site:
     0     -> the buffer is read one byte per character;
     other -> the buffer is read as 16-bit code units;
     2     -> additionally, a high surrogate followed (before `cbuf_end`)
              by a low surrogate is combined into a single code point.
   An unpaired surrogate is returned unchanged. */
#define PEEK_CHAR(c, cptr, cbuf_end)                                    \
    do {                                                                \
        if (cbuf_type != 0) {                                           \
            const uint16_t *_cur = (const uint16_t *)(cptr);            \
            const uint16_t *_lim = (const uint16_t *)(cbuf_end);        \
            c = _cur[0];                                                \
            if (is_hi_surrogate(c) && cbuf_type == 2 &&                 \
                _cur + 1 < _lim && is_lo_surrogate(_cur[1])) {          \
                c = from_surrogate(c, _cur[1]);                         \
            }                                                           \
        } else {                                                        \
            c = (cptr)[0];                                              \
        }                                                               \
    } while (0)
19681965

1969-
#define PEEK_PREV_CHAR(c, cptr, cbuf_start) \
1970-
do { \
1971-
if (cbuf_type == 0) { \
1972-
c = cptr[-1]; \
1973-
} else { \
1974-
uint32_t __c1; \
1975-
c = ((uint16_t *)cptr)[-1]; \
1976-
if (c >= 0xdc00 && c < 0xe000 && \
1977-
cbuf_type == 2 && (cptr - 4) >= cbuf_start) { \
1978-
__c1 = ((uint16_t *)cptr)[-2]; \
1979-
if (__c1 >= 0xd800 && __c1 < 0xdc00 ) { \
1980-
c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \
1981-
} \
1982-
} \
1966+
/* Read the character that ends just before `cptr` into `c` without
   moving `cptr`. Relies on a variable `cbuf_type` being in scope at the
   expansion site:
     0     -> the buffer is read one byte per character;
     other -> the buffer is read as 16-bit code units;
     2     -> additionally, a low surrogate preceded (not before
              `cbuf_start`) by a high surrogate is combined into a single
              code point.
   An unpaired surrogate is returned unchanged. */
#define PEEK_PREV_CHAR(c, cptr, cbuf_start)                             \
    do {                                                                \
        if (cbuf_type != 0) {                                           \
            const uint16_t *_last = (const uint16_t *)(cptr) - 1;       \
            const uint16_t *_first = (const uint16_t *)(cbuf_start);    \
            c = _last[0];                                               \
            if (is_lo_surrogate(c) && cbuf_type == 2 &&                 \
                _last > _first && is_hi_surrogate(_last[-1])) {         \
                c = from_surrogate(_last[-1], c);                       \
            }                                                           \
        } else {                                                        \
            c = (cptr)[-1];                                             \
        }                                                               \
    } while (0)
19851981

1986-
#define GET_PREV_CHAR(c, cptr, cbuf_start) \
1987-
do { \
1988-
if (cbuf_type == 0) { \
1989-
cptr--; \
1990-
c = cptr[0]; \
1991-
} else { \
1992-
uint32_t __c1; \
1993-
cptr -= 2; \
1994-
c = ((uint16_t *)cptr)[0]; \
1995-
if (c >= 0xdc00 && c < 0xe000 && \
1996-
cbuf_type == 2 && cptr > cbuf_start) { \
1997-
__c1 = ((uint16_t *)cptr)[-1]; \
1998-
if (__c1 >= 0xd800 && __c1 < 0xdc00 ) { \
1999-
cptr -= 2; \
2000-
c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \
2001-
} \
2002-
} \
1982+
/* Step `cptr` back over one character and store that character in `c`.
   Relies on a variable `cbuf_type` being in scope at the expansion site:
     0     -> the buffer is read one byte per character;
     other -> the buffer is read as 16-bit code units;
     2     -> additionally, a low surrogate preceded (not before
              `cbuf_start`) by a high surrogate is combined into a single
              code point and both code units are consumed.
   The unit before a low surrogate is only inspected, never consumed,
   unless it really is a high surrogate: an unpaired low surrogate moves
   `cptr` back by exactly one code unit. Decrementing inside the test
   (`*--_p`) would skip the preceding character in that case. */
#define GET_PREV_CHAR(c, cptr, cbuf_start)                              \
    do {                                                                \
        if (cbuf_type == 0) {                                           \
            cptr--;                                                     \
            c = cptr[0];                                                \
        } else {                                                        \
            const uint16_t *_p = (const uint16_t *)cptr - 1;            \
            const uint16_t *_start = (const uint16_t *)cbuf_start;      \
            c = *_p;                                                    \
            if (is_lo_surrogate(c) && cbuf_type == 2) {                 \
                if (_p > _start && is_hi_surrogate(_p[-1])) {           \
                    _p--;                                               \
                    c = from_surrogate(*_p, c);                         \
                }                                                       \
            }                                                           \
            cptr = (void *)_p;                                          \
        }                                                               \
    } while (0)
20051999

2006-
#define PREV_CHAR(cptr, cbuf_start) \
2007-
do { \
2008-
if (cbuf_type == 0) { \
2009-
cptr--; \
2010-
} else { \
2011-
cptr -= 2; \
2012-
if (cbuf_type == 2) { \
2013-
c = ((uint16_t *)cptr)[0]; \
2014-
if (c >= 0xdc00 && c < 0xe000 && cptr > cbuf_start) { \
2015-
c = ((uint16_t *)cptr)[-1]; \
2016-
if (c >= 0xd800 && c < 0xdc00) \
2017-
cptr -= 2; \
2018-
} \
2019-
} \
2000+
/* Step `cptr` back over one character without reading its value.
   Relies on a variable `cbuf_type` being in scope at the expansion site:
     0     -> move back one byte;
     other -> move back one 16-bit code unit;
     2     -> move back two code units when the unit before `cptr` is a
              low surrogate and the one before it (not before
              `cbuf_start`) is a high surrogate. */
#define PREV_CHAR(cptr, cbuf_start)                                     \
    do {                                                                \
        if (cbuf_type != 0) {                                           \
            const uint16_t *_unit = (const uint16_t *)cptr - 1;         \
            const uint16_t *_first = (const uint16_t *)cbuf_start;      \
            if (is_lo_surrogate(*_unit) && cbuf_type == 2 &&            \
                _unit > _first && is_hi_surrogate(_unit[-1])) {         \
                _unit--;                                                \
            }                                                           \
            cptr = (void *)_unit;                                       \
        } else {                                                        \
            cptr--;                                                     \
        }                                                               \
    } while (0)
20222015

quickjs.c

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3501,10 +3501,10 @@ static int string_getc(const JSString *p, int *pidx)
35013501
idx = *pidx;
35023502
if (p->is_wide_char) {
35033503
c = p->u.str16[idx++];
3504-
if (c >= 0xd800 && c < 0xdc00 && idx < p->len) {
3504+
if (is_hi_surrogate(c) && idx < p->len) {
35053505
c1 = p->u.str16[idx];
3506-
if (c1 >= 0xdc00 && c1 < 0xe000) {
3507-
c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000;
3506+
if (is_lo_surrogate(c1)) {
3507+
c = from_surrogate(c, c1);
35083508
idx++;
35093509
}
35103510
}
@@ -3842,13 +3842,12 @@ const char *JS_ToCStringLen2(JSContext *ctx, size_t *plen, JSValueConst val1, BO
38423842
if (c < 0x80) {
38433843
*q++ = c;
38443844
} else {
3845-
if (c >= 0xd800 && c < 0xdc00) {
3845+
if (is_hi_surrogate(c)) {
38463846
if (pos < len && !cesu8) {
38473847
c1 = src[pos];
3848-
if (c1 >= 0xdc00 && c1 < 0xe000) {
3848+
if (is_lo_surrogate(c1)) {
38493849
pos++;
3850-
/* surrogate pair */
3851-
c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000;
3850+
c = from_surrogate(c, c1);
38523851
} else {
38533852
/* Keep unmatched surrogate code points */
38543853
/* c = 0xfffd; */ /* error */
@@ -11087,7 +11086,7 @@ static JSValue JS_ToQuotedString(JSContext *ctx, JSValueConst val1)
1108711086
goto fail;
1108811087
break;
1108911088
default:
11090-
if (c < 32 || (c >= 0xd800 && c < 0xe000)) {
11089+
if (c < 32 || is_hi_surrogate(c) || is_lo_surrogate(c)) {
1109111090
snprintf(buf, sizeof(buf), "\\u%04x", c);
1109211091
if (string_buffer_puts8(b, buf))
1109311092
goto fail;
@@ -39098,10 +39097,10 @@ static int string_prevc(JSString *p, int *pidx)
3909839097
idx--;
3909939098
if (p->is_wide_char) {
3910039099
c = p->u.str16[idx];
39101-
if (c >= 0xdc00 && c < 0xe000 && idx > 0) {
39100+
if (is_lo_surrogate(c) && idx > 0) {
3910239101
c1 = p->u.str16[idx - 1];
39103-
if (c1 >= 0xd800 && c1 <= 0xdc00) {
39104-
c = (((c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000;
39102+
if (is_hi_surrogate(c1)) {
39103+
c = from_surrogate(c1, c);
3910539104
idx--;
3910639105
}
3910739106
}
@@ -45453,7 +45452,7 @@ static JSValue js_global_decodeURI(JSContext *ctx, JSValueConst this_val,
4545345452
c = (c << 6) | (c1 & 0x3f);
4545445453
}
4545545454
if (c < c_min || c > 0x10FFFF ||
45456-
(c >= 0xd800 && c < 0xe000)) {
45455+
is_hi_surrogate(c) || is_lo_surrogate(c)) {
4545745456
js_throw_URIError(ctx, "malformed UTF-8");
4545845457
goto fail;
4545945458
}
@@ -45528,21 +45527,21 @@ static JSValue js_global_encodeURI(JSContext *ctx, JSValueConst this_val,
4552845527
if (isURIUnescaped(c, isComponent)) {
4552945528
string_buffer_putc16(b, c);
4553045529
} else {
45531-
if (c >= 0xdc00 && c <= 0xdfff) {
45530+
if (is_lo_surrogate(c)) {
4553245531
js_throw_URIError(ctx, "invalid character");
4553345532
goto fail;
45534-
} else if (c >= 0xd800 && c <= 0xdbff) {
45533+
} else if (is_hi_surrogate(c)) {
4553545534
if (k >= p->len) {
4553645535
js_throw_URIError(ctx, "expecting surrogate pair");
4553745536
goto fail;
4553845537
}
4553945538
c1 = string_get(p, k);
4554045539
k++;
45541-
if (c1 < 0xdc00 || c1 > 0xdfff) {
45540+
if (!is_lo_surrogate(c1)) {
4554245541
js_throw_URIError(ctx, "expecting surrogate pair");
4554345542
goto fail;
4554445543
}
45545-
c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000;
45544+
c = from_surrogate(c, c1);
4554645545
}
4554745546
if (c < 0x80) {
4554845547
encodeURI_hex(b, c);

0 commit comments

Comments
 (0)