Skip to content

Commit df8f938

Browse files
byroot and hsbt authored and committed
[ruby/json] json_string_unescape: use memchr to search for backslashes
ruby/json@5e6cfcf724
1 parent ba8f22c commit df8f938

File tree

1 file changed

+71
-80
lines changed

1 file changed

+71
-80
lines changed

ext/json/parser/parser.c

Lines changed: 71 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -592,96 +592,87 @@ static VALUE json_string_unescape(JSON_ParserState *state, const char *string, c
592592
}
593593
}
594594

595-
pe = memchr(p, '\\', bufferSize);
596-
if (RB_UNLIKELY(pe == NULL)) {
597-
return build_string(string, stringEnd, intern, symbolize);
598-
}
599-
600595
VALUE result = rb_str_buf_new(bufferSize);
601596
rb_enc_associate_index(result, utf8_encindex);
602597
buffer = RSTRING_PTR(result);
603598
bufferStart = buffer;
604599

605-
while (pe < stringEnd) {
606-
if (*pe == '\\') {
607-
unescape = (char *) "?";
608-
unescape_len = 1;
609-
if (pe > p) {
610-
MEMCPY(buffer, p, char, pe - p);
611-
buffer += pe - p;
612-
}
613-
switch (*++pe) {
614-
case 'n':
615-
unescape = (char *) "\n";
616-
break;
617-
case 'r':
618-
unescape = (char *) "\r";
619-
break;
620-
case 't':
621-
unescape = (char *) "\t";
622-
break;
623-
case '"':
624-
unescape = (char *) "\"";
625-
break;
626-
case '\\':
627-
unescape = (char *) "\\";
628-
break;
629-
case 'b':
630-
unescape = (char *) "\b";
631-
break;
632-
case 'f':
633-
unescape = (char *) "\f";
634-
break;
635-
case 'u':
636-
if (pe > stringEnd - 4) {
637-
raise_parse_error("incomplete unicode character escape sequence at '%s'", p);
638-
} else {
639-
uint32_t ch = unescape_unicode((unsigned char *) ++pe);
640-
pe += 3;
641-
/* To handle values above U+FFFF, we take a sequence of
642-
* \uXXXX escapes in the U+D800..U+DBFF then
643-
* U+DC00..U+DFFF ranges, take the low 10 bits from each
644-
* to make a 20-bit number, then add 0x10000 to get the
645-
* final codepoint.
646-
*
647-
* See Unicode 15: 3.8 "Surrogates", 5.3 "Handling
648-
* Surrogate Pairs in UTF-16", and 23.6 "Surrogates
649-
* Area".
650-
*/
651-
if ((ch & 0xFC00) == 0xD800) {
652-
pe++;
653-
if (pe > stringEnd - 6) {
654-
raise_parse_error("incomplete surrogate pair at '%s'", p);
655-
}
656-
if (pe[0] == '\\' && pe[1] == 'u') {
657-
uint32_t sur = unescape_unicode((unsigned char *) pe + 2);
658-
ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16)
659-
| (sur & 0x3FF));
660-
pe += 5;
661-
} else {
662-
unescape = (char *) "?";
663-
break;
664-
}
600+
while ((pe = memchr(pe, '\\', stringEnd - pe))) {
601+
unescape = (char *) "?";
602+
unescape_len = 1;
603+
if (pe > p) {
604+
MEMCPY(buffer, p, char, pe - p);
605+
buffer += pe - p;
606+
}
607+
switch (*++pe) {
608+
case 'n':
609+
unescape = (char *) "\n";
610+
break;
611+
case 'r':
612+
unescape = (char *) "\r";
613+
break;
614+
case 't':
615+
unescape = (char *) "\t";
616+
break;
617+
case '"':
618+
unescape = (char *) "\"";
619+
break;
620+
case '\\':
621+
unescape = (char *) "\\";
622+
break;
623+
case 'b':
624+
unescape = (char *) "\b";
625+
break;
626+
case 'f':
627+
unescape = (char *) "\f";
628+
break;
629+
case 'u':
630+
if (pe > stringEnd - 4) {
631+
raise_parse_error("incomplete unicode character escape sequence at '%s'", p);
632+
} else {
633+
uint32_t ch = unescape_unicode((unsigned char *) ++pe);
634+
pe += 3;
635+
/* To handle values above U+FFFF, we take a sequence of
636+
* \uXXXX escapes in the U+D800..U+DBFF then
637+
* U+DC00..U+DFFF ranges, take the low 10 bits from each
638+
* to make a 20-bit number, then add 0x10000 to get the
639+
* final codepoint.
640+
*
641+
* See Unicode 15: 3.8 "Surrogates", 5.3 "Handling
642+
* Surrogate Pairs in UTF-16", and 23.6 "Surrogates
643+
* Area".
644+
*/
645+
if ((ch & 0xFC00) == 0xD800) {
646+
pe++;
647+
if (pe > stringEnd - 6) {
648+
raise_parse_error("incomplete surrogate pair at '%s'", p);
649+
}
650+
if (pe[0] == '\\' && pe[1] == 'u') {
651+
uint32_t sur = unescape_unicode((unsigned char *) pe + 2);
652+
ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16)
653+
| (sur & 0x3FF));
654+
pe += 5;
655+
} else {
656+
unescape = (char *) "?";
657+
break;
665658
}
666-
unescape_len = convert_UTF32_to_UTF8(buf, ch);
667-
unescape = buf;
668659
}
669-
break;
670-
default:
671-
p = pe;
672-
continue;
673-
}
674-
MEMCPY(buffer, unescape, char, unescape_len);
675-
buffer += unescape_len;
676-
p = ++pe;
677-
} else {
678-
pe++;
660+
unescape_len = convert_UTF32_to_UTF8(buf, ch);
661+
unescape = buf;
662+
}
663+
break;
664+
default:
665+
p = pe;
666+
continue;
679667
}
668+
MEMCPY(buffer, unescape, char, unescape_len);
669+
buffer += unescape_len;
670+
p = ++pe;
680671
}
681672

682-
if (pe > p) {
683-
MEMCPY(buffer, p, char, pe - p);
684-
buffer += pe - p;
673+
if (stringEnd > p) {
674+
MEMCPY(buffer, p, char, stringEnd - p);
675+
buffer += stringEnd - p;
685676
}
686677
rb_str_set_len(result, buffer - bufferStart);
687678

0 commit comments

Comments
 (0)