Skip to content

Commit 67246de

Browse files
committed
Improve lookup tables for string escaping.
Introduce a simplified table for the most common case, which is `script_safe: false, ascii_only: false`.

On the `script_safe` table, now only `0xE2` does a multi-byte check.

Merge back `convert_ASCII_to_JSON`, as it no longer helps much with the simplified escape table.

```
== Encoding mixed utf8 (5003001 bytes)
ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +YJIT +PRISM [arm64-darwin23]
Warming up --------------------------------------
after 38.000 i/100ms
Calculating -------------------------------------
after 398.220 (± 3.0%) i/s (2.51 ms/i) - 2.014k in 5.061659s

Comparison:
before: 381.8 i/s
after: 398.2 i/s - same-ish: difference falls within error

== Encoding mostly utf8 (5001001 bytes)
ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +YJIT +PRISM [arm64-darwin23]
Warming up --------------------------------------
after 39.000 i/100ms
Calculating -------------------------------------
after 393.337 (± 2.5%) i/s (2.54 ms/i) - 1.989k in 5.059397s

Comparison:
before: 304.3 i/s
after: 393.3 i/s - 1.29x faster

== Encoding twitter.json (466906 bytes)
ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +YJIT +PRISM [arm64-darwin23]
Warming up --------------------------------------
after 244.000 i/100ms
Calculating -------------------------------------
after 2.436k (± 0.9%) i/s (410.43 μs/i) - 12.200k in 5.007702s

Comparison:
before: 2125.9 i/s
after: 2436.5 i/s - 1.15x faster
```
1 parent f745ec1 commit 67246de

File tree

2 files changed

+81
-111
lines changed

2 files changed

+81
-111
lines changed

benchmark/encoder.rb

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,12 +68,10 @@ def benchmark_encoding(benchmark_name, ruby_obj, check_expected: true, except: [
6868
benchmark_encoding "mixed utf8", ([("a" * 5000) + "€" + ("a" * 5000)] * 500)
6969
benchmark_encoding "mostly utf8", ([("€" * 3333)] * 500)
7070

71-
# On these benchmarks we perform well, we're on par or better.
71+
# On these benchmarks we perform well, we're on par or a bit better.
7272
benchmark_encoding "integers", (1_000_000..1_001_000).to_a, except: %i(json_state)
7373
benchmark_encoding "activitypub.json", JSON.load_file("#{__dir__}/data/activitypub.json")
7474
benchmark_encoding "citm_catalog.json", JSON.load_file("#{__dir__}/data/citm_catalog.json")
75-
76-
# On twitter.json we're still about 6% slower, this is worth investigating.
7775
benchmark_encoding "twitter.json", JSON.load_file("#{__dir__}/data/twitter.json")
7876

7977
# This benchmark spent the overwhelming majority of its time in `ruby_dtoa`. We rely on Ruby's implementation

ext/json/ext/generator/generator.c

Lines changed: 80 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,74 @@ static void raise_generator_error(VALUE invalid_object, const char *fmt, ...)
9696
raise_generator_error_str(invalid_object, str);
9797
}
9898

99+
// 0 - single byte char that doesn't need to be escaped.
100+
// (x | 8) - char that needs to be escaped.
101+
static const unsigned char CHAR_LENGTH_MASK = 7;
102+
static const unsigned char ESCAPE_MASK = 8;
103+
104+
static const unsigned char escape_table[256] = {
105+
// ASCII Control Characters
106+
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
107+
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
108+
// ASCII Characters
109+
0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // '"'
110+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
111+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
112+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
113+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
114+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
115+
};
116+
117+
static const unsigned char ascii_only_escape_table[256] = {
118+
// ASCII Control Characters
119+
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
120+
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
121+
// ASCII Characters
122+
0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // '"'
123+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
124+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
125+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
126+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
127+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
128+
// Continuation byte
129+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
130+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
131+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
132+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
133+
// First byte of a 2-byte code point
134+
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
135+
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
136+
// First byte of a 3-byte code point
137+
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
138+
//First byte of a 4+ byte code point
139+
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
140+
};
141+
142+
static const unsigned char script_safe_escape_table[256] = {
143+
// ASCII Control Characters
144+
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
145+
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
146+
// ASCII Characters
147+
0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, // '"' and '/'
148+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
149+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
150+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
151+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
152+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
153+
// Continuation byte
154+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
155+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
156+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
157+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
158+
// First byte of a 2-byte code point
159+
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
160+
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
161+
// First byte of a 3-byte code point
162+
3, 3,11, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE2 is the start of \u2028 and \u2029
163+
//First byte of a 4+ byte code point
164+
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
165+
};
166+
99167
/* Converts in_string to a JSON string (without the wrapping '"'
100168
* characters) in FBuffer out_buffer.
101169
*
@@ -106,13 +174,13 @@ static void raise_generator_error(VALUE invalid_object, const char *fmt, ...)
106174
*
107175
* - If out_ascii_only: non-ASCII characters (>0x7F)
108176
*
109-
* - If out_script_safe: forwardslash, line separator (U+2028), and
177+
* - If script_safe: forwardslash (/), line separator (U+2028), and
110178
* paragraph separator (U+2029)
111179
*
112180
* Everything else (should be UTF-8) is just passed through and
113181
* appended to the result.
114182
*/
115-
static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256], bool out_script_safe)
183+
static inline void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256])
116184
{
117185
const char *hexdig = "0123456789abcdef";
118186
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
@@ -131,7 +199,7 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca
131199

132200
if (RB_UNLIKELY(ch_len)) {
133201
switch (ch_len) {
134-
case 1: {
202+
case 9: {
135203
FLUSH_POS(1);
136204
switch (ch) {
137205
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
@@ -153,9 +221,9 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca
153221
}
154222
break;
155223
}
156-
case 3: {
224+
case 11: {
157225
unsigned char b2 = ptr[pos + 1];
158-
if (RB_UNLIKELY(out_script_safe && ch == 0xE2 && b2 == 0x80)) {
226+
if (RB_UNLIKELY(b2 == 0x80)) {
159227
unsigned char b3 = ptr[pos + 2];
160228
if (b3 == 0xA8) {
161229
FLUSH_POS(3);
@@ -167,6 +235,7 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca
167235
break;
168236
}
169237
}
238+
ch_len = 3;
170239
// fallthrough
171240
}
172241
default:
@@ -186,104 +255,7 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca
186255
RB_GC_GUARD(str);
187256
}
188257

189-
static const char escape_table[256] = {
190-
// ASCII Control Characters
191-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
192-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
193-
// ASCII Characters
194-
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, // '"'
195-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
196-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
197-
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, // '\\'
198-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
199-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
200-
// Continuation byte
201-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
202-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
203-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
204-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
205-
// First byte of a 2-byte code point
206-
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
207-
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
208-
// First byte of a 4-byte code point
209-
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
210-
//First byte of a 4+byte code point
211-
4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1,
212-
};
213-
214-
static const char script_safe_escape_table[256] = {
215-
// ASCII Control Characters
216-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
217-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
218-
// ASCII Characters
219-
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1, // '"' and '/'
220-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
221-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
222-
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, // '\\'
223-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
224-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
225-
// Continuation byte
226-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
227-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
228-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
229-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
230-
// First byte of a 2-byte code point
231-
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
232-
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
233-
// First byte of a 4-byte code point
234-
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
235-
//First byte of a 4+byte code point
236-
4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1,
237-
};
238-
239-
static void convert_ASCII_to_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256])
240-
{
241-
const char *hexdig = "0123456789abcdef";
242-
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
243-
244-
const char *ptr = RSTRING_PTR(str);
245-
unsigned long len = RSTRING_LEN(str);
246-
247-
unsigned long beg = 0, pos;
248-
249-
for (pos = 0; pos < len;) {
250-
unsigned char ch = ptr[pos];
251-
/* JSON encoding */
252-
if (escape_table[ch]) {
253-
if (pos > beg) {
254-
fbuffer_append(out_buffer, &ptr[beg], pos - beg);
255-
}
256-
257-
beg = pos + 1;
258-
switch (ch) {
259-
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
260-
case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
261-
case '/': fbuffer_append(out_buffer, "\\/", 2); break;
262-
case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
263-
case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
264-
case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
265-
case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
266-
case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
267-
default:
268-
scratch[2] = '0';
269-
scratch[3] = '0';
270-
scratch[4] = hexdig[(ch >> 4) & 0xf];
271-
scratch[5] = hexdig[ch & 0xf];
272-
fbuffer_append(out_buffer, scratch, 6);
273-
}
274-
}
275-
276-
pos++;
277-
}
278-
279-
if (beg < len) {
280-
fbuffer_append(out_buffer, &ptr[beg], len - beg);
281-
}
282-
283-
RB_GC_GUARD(str);
284-
}
285-
286-
static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256], bool out_script_safe)
258+
static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256])
287259
{
288260
const char *hexdig = "0123456789abcdef";
289261
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
@@ -301,7 +273,7 @@ static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, cons
301273

302274
if (RB_UNLIKELY(ch_len)) {
303275
switch (ch_len) {
304-
case 1: {
276+
case 1 | ESCAPE_MASK: {
305277
FLUSH_POS(1);
306278
switch (ch) {
307279
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
@@ -325,6 +297,8 @@ static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, cons
325297
}
326298
default: {
327299
uint32_t wchar = 0;
300+
ch_len = ch_len & CHAR_LENGTH_MASK;
301+
328302
switch(ch_len) {
329303
case 2:
330304
wchar = ptr[pos] & 0x1F;
@@ -935,13 +909,11 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat
935909

936910
switch(rb_enc_str_coderange(obj)) {
937911
case ENC_CODERANGE_7BIT:
938-
convert_ASCII_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table);
939-
break;
940912
case ENC_CODERANGE_VALID:
941913
if (RB_UNLIKELY(state->ascii_only)) {
942-
convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table, state->script_safe);
914+
convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : ascii_only_escape_table);
943915
} else {
944-
convert_UTF8_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table, state->script_safe);
916+
convert_UTF8_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table);
945917
}
946918
break;
947919
default:

0 commit comments

Comments
 (0)