Skip to content

Commit 4f4551f

Browse files
committed
parser.c: Record escape positions while parsing
We can then pass them to the decoder, which saves it from having to scan the string again.

```
== Parsing activitypub.json (58160 bytes)
ruby 3.4.6 (2025-09-16 revision dbd83256b1) +YJIT +PRISM [arm64-darwin24]
Warming up --------------------------------------
               after     1.275k i/100ms
Calculating -------------------------------------
               after     12.774k (± 0.8%) i/s   (78.29 μs/i) -     65.025k in   5.090834s

Comparison:
              before:    12314.3 i/s
               after:    12773.8 i/s - 1.04x faster

== Parsing twitter.json (567916 bytes)
ruby 3.4.6 (2025-09-16 revision dbd83256b1) +YJIT +PRISM [arm64-darwin24]
Warming up --------------------------------------
               after   143.000 i/100ms
Calculating -------------------------------------
               after      1.441k (± 0.2%) i/s  (693.86 μs/i) -      7.293k in   5.060345s

Comparison:
              before:     1430.1 i/s
               after:     1441.2 i/s - 1.01x faster

== Parsing citm_catalog.json (1727030 bytes)
ruby 3.4.6 (2025-09-16 revision dbd83256b1) +YJIT +PRISM [arm64-darwin24]
Warming up --------------------------------------
               after    69.000 i/100ms
Calculating -------------------------------------
               after    695.919 (± 0.4%) i/s    (1.44 ms/i) -      3.519k in   5.056691s

Comparison:
              before:      687.8 i/s
               after:      695.9 i/s - 1.01x faster
```
1 parent f7f8f55 commit 4f4551f

File tree

1 file changed

+63
-25
lines changed

1 file changed

+63
-25
lines changed

ext/json/ext/parser/parser.c

Lines changed: 63 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -616,8 +616,10 @@ static inline bool json_string_cacheable_p(const char *string, size_t length)
616616
return length <= JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH && rb_isalpha(string[0]);
617617
}
618618

619-
static inline VALUE json_string_fastpath(JSON_ParserState *state, const char *string, const char *stringEnd, bool is_name, bool intern, bool symbolize)
619+
static inline VALUE json_string_fastpath(JSON_ParserState *state, JSON_ParserConfig *config, const char *string, const char *stringEnd, bool is_name)
620620
{
621+
bool intern = is_name || config->freeze;
622+
bool symbolize = is_name && config->symbolize_names;
621623
size_t bufferSize = stringEnd - string;
622624

623625
if (is_name && state->in_array && RB_LIKELY(json_string_cacheable_p(string, bufferSize))) {
@@ -636,8 +638,33 @@ static inline VALUE json_string_fastpath(JSON_ParserState *state, const char *st
636638
return build_string(string, stringEnd, intern, symbolize);
637639
}
638640

639-
static VALUE json_string_unescape(JSON_ParserState *state, const char *string, const char *stringEnd, bool is_name, bool intern, bool symbolize)
641+
#define JSON_MAX_UNESCAPE_POSITIONS 16
642+
typedef struct _json_unescape_positions {
643+
long size;
644+
const char **positions;
645+
bool has_more;
646+
} JSON_UnescapePositions;
647+
648+
static inline const char *json_next_backslash(const char *pe, const char *stringEnd, JSON_UnescapePositions *positions)
649+
{
650+
while (positions->size) {
651+
positions->size--;
652+
const char *next_position = positions->positions[0];
653+
positions->positions++;
654+
return next_position;
655+
}
656+
657+
if (positions->has_more) {
658+
return memchr(pe, '\\', stringEnd - pe);
659+
}
660+
661+
return NULL;
662+
}
663+
664+
static NOINLINE() VALUE json_string_unescape(JSON_ParserState *state, JSON_ParserConfig *config, const char *string, const char *stringEnd, bool is_name, JSON_UnescapePositions *positions)
640665
{
666+
bool intern = is_name || config->freeze;
667+
bool symbolize = is_name && config->symbolize_names;
641668
size_t bufferSize = stringEnd - string;
642669
const char *p = string, *pe = string, *bufferStart;
643670
char *buffer;
@@ -649,7 +676,7 @@ static VALUE json_string_unescape(JSON_ParserState *state, const char *string, c
649676

650677
#define APPEND_CHAR(chr) *buffer++ = chr; p = ++pe;
651678

652-
while (pe < stringEnd && (pe = memchr(pe, '\\', stringEnd - pe))) {
679+
while (pe < stringEnd && (pe = json_next_backslash(pe, stringEnd, positions))) {
653680
if (pe > p) {
654681
MEMCPY(buffer, p, char, pe - p);
655682
buffer += pe - p;
@@ -893,20 +920,6 @@ static inline VALUE json_decode_object(JSON_ParserState *state, JSON_ParserConfi
893920
return object;
894921
}
895922

896-
static inline VALUE json_decode_string(JSON_ParserState *state, JSON_ParserConfig *config, const char *start, const char *end, bool escaped, bool is_name)
897-
{
898-
VALUE string;
899-
bool intern = is_name || config->freeze;
900-
bool symbolize = is_name && config->symbolize_names;
901-
if (escaped) {
902-
string = json_string_unescape(state, start, end, is_name, intern, symbolize);
903-
} else {
904-
string = json_string_fastpath(state, start, end, is_name, intern, symbolize);
905-
}
906-
907-
return string;
908-
}
909-
910923
static inline VALUE json_push_value(JSON_ParserState *state, JSON_ParserConfig *config, VALUE value)
911924
{
912925
if (RB_UNLIKELY(config->on_load_proc)) {
@@ -964,22 +977,30 @@ static ALWAYS_INLINE() bool string_scan(JSON_ParserState *state)
964977
return false;
965978
}
966979

967-
static inline VALUE json_parse_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name)
980+
static VALUE json_parse_escaped_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name, const char *start)
968981
{
969-
state->cursor++;
970-
const char *start = state->cursor;
971-
bool escaped = false;
982+
const char *backslashes[JSON_MAX_UNESCAPE_POSITIONS];
983+
JSON_UnescapePositions positions = {
984+
.size = 0,
985+
.positions = backslashes,
986+
.has_more = false,
987+
};
972988

973-
while (RB_UNLIKELY(string_scan(state))) {
989+
do {
974990
switch (*state->cursor) {
975991
case '"': {
976-
VALUE string = json_decode_string(state, config, start, state->cursor, escaped, is_name);
992+
VALUE string = json_string_unescape(state, config, start, state->cursor, is_name, &positions);
977993
state->cursor++;
978994
return json_push_value(state, config, string);
979995
}
980996
case '\\': {
997+
if (RB_LIKELY(positions.size < JSON_MAX_UNESCAPE_POSITIONS)) {
998+
backslashes[positions.size] = state->cursor;
999+
positions.size++;
1000+
} else {
1001+
positions.has_more = true;
1002+
}
9811003
state->cursor++;
982-
escaped = true;
9831004
break;
9841005
}
9851006
default:
@@ -988,12 +1009,29 @@ static inline VALUE json_parse_string(JSON_ParserState *state, JSON_ParserConfig
9881009
}
9891010

9901011
state->cursor++;
991-
}
1012+
} while (string_scan(state));
9921013

9931014
raise_parse_error("unexpected end of input, expected closing \"", state);
9941015
return Qfalse;
9951016
}
9961017

1018+
static ALWAYS_INLINE() VALUE json_parse_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name)
1019+
{
1020+
state->cursor++;
1021+
const char *start = state->cursor;
1022+
1023+
if (RB_UNLIKELY(!string_scan(state))) {
1024+
raise_parse_error("unexpected end of input, expected closing \"", state);
1025+
}
1026+
1027+
if (RB_LIKELY(*state->cursor == '"')) {
1028+
VALUE string = json_string_fastpath(state, config, start, state->cursor, is_name);
1029+
state->cursor++;
1030+
return json_push_value(state, config, string);
1031+
}
1032+
return json_parse_escaped_string(state, config, is_name, start);
1033+
}
1034+
9971035
#if JSON_CPU_LITTLE_ENDIAN_64BITS
9981036
// From: https://lemire.me/blog/2022/01/21/swar-explained-parsing-eight-digits/
9991037
// Additional References:

0 commit comments

Comments
 (0)