Skip to content

Commit 97713ac

Browse files
byroot authored and hsbt committed
[ruby/json] convert_UTF8_to_JSON: repurpose the escape tables into size tables
Since we're looking up the table anyway, we might as well store the UTF-8 char length in it. For single-byte characters that don't need escaping we store `0`.

This helps on strings with lots of multi-byte characters:

Before:

```
== Encoding mostly utf8 (20004001 bytes)
ruby 3.3.4 (2024-07-09 revision ruby/json@be1089c8ec) +YJIT [arm64-darwin23]
Warming up --------------------------------------
                json     6.000 i/100ms
                  oj    10.000 i/100ms
           rapidjson     2.000 i/100ms
Calculating -------------------------------------
                json     67.978 (± 1.5%) i/s   (14.71 ms/i) -    342.000 in   5.033062s
                  oj    100.876 (± 2.0%) i/s    (9.91 ms/i) -    510.000 in   5.058080s
           rapidjson     26.389 (± 7.6%) i/s   (37.89 ms/i) -    132.000 in   5.027681s

Comparison:
                json:       68.0 i/s
                  oj:      100.9 i/s - 1.48x  faster
           rapidjson:       26.4 i/s - 2.58x  slower
```

After:

```
== Encoding mostly utf8 (20004001 bytes)
ruby 3.3.4 (2024-07-09 revision ruby/json@be1089c8ec) +YJIT [arm64-darwin23]
Warming up --------------------------------------
                json     7.000 i/100ms
                  oj    10.000 i/100ms
           rapidjson     2.000 i/100ms
Calculating -------------------------------------
                json     75.187 (± 2.7%) i/s   (13.30 ms/i) -    378.000 in   5.030111s
                  oj     95.196 (± 2.1%) i/s   (10.50 ms/i) -    480.000 in   5.043565s
           rapidjson     25.969 (± 3.9%) i/s   (38.51 ms/i) -    130.000 in   5.011471s

Comparison:
                json:       75.2 i/s
                  oj:       95.2 i/s - 1.27x  faster
           rapidjson:       26.0 i/s - 2.90x  slower
```

ruby/json@51e2631d1f
1 parent 9f300d0 commit 97713ac

File tree

2 files changed

+99
-64
lines changed

2 files changed

+99
-64
lines changed

benchmark/encoder.rb

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,9 @@ def benchmark_encoding(benchmark_name, ruby_obj, check_expected: true, except: [
5959
benchmark_encoding "small nested array", [[1,2,3,4,5]]*10
6060
benchmark_encoding "small hash", { "username" => "jhawthorn", "id" => 123, "event" => "wrote json serializer" }
6161

62+
# On this one we're a bit slower (~25%).
63+
benchmark_encoding "mostly utf8", ([("€" * 3333)] * 2000), except: %i(json_state)
64+
6265
# On these three benchmarks we perform well. Either on par or very closely faster/slower
6366
benchmark_encoding "mixed utf8", ([("a" * 5000) + "€" + ("a" * 5000)] * 2000), except: %i(json_state)
6467
benchmark_encoding "twitter.json", JSON.load_file("#{__dir__}/data/twitter.json"), except: %i(json_state)

ext/json/generator/generator.c

Lines changed: 96 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -25,65 +25,69 @@ static ID i_to_s, i_to_json, i_new, i_pack, i_unpack, i_create_id, i_extend;
2525
* Everything else (should be UTF-8) is just passed through and
2626
* appended to the result.
2727
*/
28-
static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const bool escape_table[256], bool out_script_safe)
28+
static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256], bool out_script_safe)
2929
{
3030
const char *hexdig = "0123456789abcdef";
3131
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
3232

3333
const char *ptr = RSTRING_PTR(str);
3434
unsigned long len = RSTRING_LEN(str);
3535

36-
unsigned long beg = 0, pos;
36+
unsigned long beg = 0, pos = 0;
3737

38-
for (pos = 0; pos < len;) {
38+
#define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos;
39+
40+
while (pos < len) {
3941
unsigned char ch = ptr[pos];
42+
unsigned char ch_len = escape_table[ch];
4043
/* JSON encoding */
41-
if (escape_table[ch]) {
42-
#define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos;
43-
switch (ch) {
44-
case '"': FLUSH_POS(1); fbuffer_append(out_buffer, "\\\"", 2); break;
45-
case '\\': FLUSH_POS(1); fbuffer_append(out_buffer, "\\\\", 2); break;
46-
case '/': FLUSH_POS(1); fbuffer_append(out_buffer, "\\/", 2); break;
47-
case '\b': FLUSH_POS(1); fbuffer_append(out_buffer, "\\b", 2); break;
48-
case '\f': FLUSH_POS(1); fbuffer_append(out_buffer, "\\f", 2); break;
49-
case '\n': FLUSH_POS(1); fbuffer_append(out_buffer, "\\n", 2); break;
50-
case '\r': FLUSH_POS(1); fbuffer_append(out_buffer, "\\r", 2); break;
51-
case '\t': FLUSH_POS(1); fbuffer_append(out_buffer, "\\t", 2); break;
52-
default: {
53-
if ((ch & 0x80) == 0x00) { /* leading 1 bit is 0b0 */
54-
FLUSH_POS(1);
55-
scratch[2] = hexdig[ch >> 12];
56-
scratch[3] = hexdig[(ch >> 8) & 0xf];
57-
scratch[4] = hexdig[(ch >> 4) & 0xf];
58-
scratch[5] = hexdig[ch & 0xf];
59-
fbuffer_append(out_buffer, scratch, 6);
60-
} else if ((ch & 0xE0) == 0xC0) { /* leading 3 bits are 0b110 */
61-
pos += 2;
62-
} else if ((ch & 0xF0) == 0xE0) { /* leading 4 bits are 0b1110 */
63-
unsigned char b2 = ptr[pos + 1];
44+
45+
if (RB_UNLIKELY(ch_len)) {
46+
switch (ch_len) {
47+
case 0:
48+
pos++;
49+
break;
50+
case 1: {
51+
FLUSH_POS(1);
52+
switch (ch) {
53+
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
54+
case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
55+
case '/': fbuffer_append(out_buffer, "\\/", 2); break;
56+
case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
57+
case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
58+
case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
59+
case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
60+
case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
61+
default: {
62+
scratch[2] = hexdig[ch >> 12];
63+
scratch[3] = hexdig[(ch >> 8) & 0xf];
64+
scratch[4] = hexdig[(ch >> 4) & 0xf];
65+
scratch[5] = hexdig[ch & 0xf];
66+
fbuffer_append(out_buffer, scratch, 6);
67+
break;
68+
}
69+
}
70+
break;
71+
}
72+
case 3: {
73+
unsigned char b2 = ptr[pos + 1];
74+
if (out_script_safe && b2 == 0x80) {
6475
unsigned char b3 = ptr[pos + 2];
65-
if (out_script_safe && (b2 == 0x80)) {
66-
if (b3 == 0xA8) {
67-
FLUSH_POS(3);
68-
fprintf(stderr, "escape: \\u2028 pos = %ld\n", pos);
69-
fbuffer_append(out_buffer, "\\u2028", 6);
70-
} else if (b3 == 0xA9) {
71-
FLUSH_POS(3);
72-
fprintf(stderr, "escape: \\u2029 pos = %ld\n", pos);
73-
fbuffer_append(out_buffer, "\\u2029", 6);
74-
} else {
75-
pos += 3;
76-
}
77-
} else {
78-
pos += 3;
76+
if (b3 == 0xA8) {
77+
FLUSH_POS(3);
78+
fbuffer_append(out_buffer, "\\u2028", 6);
79+
break;
80+
} else if (b3 == 0xA9) {
81+
FLUSH_POS(3);
82+
fbuffer_append(out_buffer, "\\u2029", 6);
83+
break;
7984
}
80-
} else if ((ch & 0xF8) == 0xF0) { /* leading 5 bits are 0b11110 */
81-
pos += 4;
82-
} else {
83-
// This should be unreachable
84-
rb_raise(rb_path2class("JSON::GeneratorError"), "source sequence is illegal/malformed utf-8");
8585
}
86+
// fallthrough
8687
}
88+
default:
89+
pos += ch_len;
90+
break;
8791
}
8892
} else {
8993
pos++;
@@ -98,29 +102,57 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const bool esca
98102
RB_GC_GUARD(str);
99103
}
100104

101-
static const bool escape_table[256] = {
102-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
103-
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* '"' */
104-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, /* '\\' */
105-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
106-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
107-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
108-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
109-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
105+
static const char escape_table[256] = {
106+
// ASCII Control Characters
107+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
108+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
109+
// ASCII Characters
110+
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, // '"'
111+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
112+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
113+
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, // '\\'
114+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
115+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
116+
// Continuation byte
117+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
118+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
119+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
120+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
121+
// First byte of a 2-byte code point
122+
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
123+
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
124+
// First byte of a 3-byte code point
125+
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
126+
// First byte of a 4+ byte code point
127+
4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1,
110128
};
111129

112-
static const bool script_safe_escape_table[256] = {
113-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
114-
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* '"' and '/' */
115-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, /* '\\' */
116-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
117-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
118-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
119-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
120-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
130+
static const char script_safe_escape_table[256] = {
131+
// ASCII Control Characters
132+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
133+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
134+
// ASCII Characters
135+
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1, // '"' and '/'
136+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
137+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
138+
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, // '\\'
139+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
140+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
141+
// Continuation byte
142+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
143+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
144+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
145+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
146+
// First byte of a 2-byte code point
147+
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
148+
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
149+
// First byte of a 3-byte code point
150+
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
151+
// First byte of a 4+ byte code point
152+
4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1,
121153
};
122154

123-
static void convert_ASCII_to_JSON(FBuffer *out_buffer, VALUE str, const bool escape_table[256])
155+
static void convert_ASCII_to_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256])
124156
{
125157
const char *hexdig = "0123456789abcdef";
126158
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };

0 commit comments

Comments
 (0)