Skip to content

Commit 12965b9

Browse files
authored
Merge pull request #724 from byroot/lookup-3
Improve lookup tables for string escaping.
2 parents f745ec1 + dc7d766 commit 12965b9

File tree

2 files changed

+80
-111
lines changed

2 files changed

+80
-111
lines changed

benchmark/encoder.rb

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,12 +68,10 @@ def benchmark_encoding(benchmark_name, ruby_obj, check_expected: true, except: [
6868
benchmark_encoding "mixed utf8", ([("a" * 5000) + "€" + ("a" * 5000)] * 500)
6969
benchmark_encoding "mostly utf8", ([("€" * 3333)] * 500)
7070

71-
# On these benchmarks we perform well, we're on par or better.
71+
# On these benchmarks we perform well: we're on par or a bit better.
7272
benchmark_encoding "integers", (1_000_000..1_001_000).to_a, except: %i(json_state)
7373
benchmark_encoding "activitypub.json", JSON.load_file("#{__dir__}/data/activitypub.json")
7474
benchmark_encoding "citm_catalog.json", JSON.load_file("#{__dir__}/data/citm_catalog.json")
75-
76-
# On twitter.json we're still about 6% slower, this is worth investigating.
7775
benchmark_encoding "twitter.json", JSON.load_file("#{__dir__}/data/twitter.json")
7876

7977
# This benchmark spent the overwhelming majority of its time in `ruby_dtoa`. We rely on Ruby's implementation

ext/json/ext/generator/generator.c

Lines changed: 79 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,73 @@ static void raise_generator_error(VALUE invalid_object, const char *fmt, ...)
9696
raise_generator_error_str(invalid_object, str);
9797
}
9898

99+
// 0 - single-byte char that doesn't need to be escaped.
100+
// (x | 8) - char that needs to be escaped; x (the low 3 bits, see CHAR_LENGTH_MASK) is its length in bytes.
101+
static const unsigned char CHAR_LENGTH_MASK = 7;
102+
103+
static const unsigned char escape_table[256] = {
104+
// ASCII Control Characters
105+
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
106+
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
107+
// ASCII Characters
108+
0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // '"'
109+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
110+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
111+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
112+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
113+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
114+
};
115+
116+
static const unsigned char ascii_only_escape_table[256] = {
117+
// ASCII Control Characters
118+
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
119+
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
120+
// ASCII Characters
121+
0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // '"'
122+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
123+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
124+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
125+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
126+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
127+
// Continuation byte
128+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
129+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
130+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
131+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
132+
// First byte of a 2-byte code point
133+
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
134+
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
135+
// First byte of a 3-byte code point
136+
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
137+
// First byte of a 4+ byte code point
138+
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
139+
};
140+
141+
static const unsigned char script_safe_escape_table[256] = {
142+
// ASCII Control Characters
143+
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
144+
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
145+
// ASCII Characters
146+
0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, // '"' and '/'
147+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
148+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
149+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
150+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
151+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
152+
// Continuation byte
153+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
154+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
155+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
156+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
157+
// First byte of a 2-byte code point
158+
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
159+
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
160+
// First byte of a 3-byte code point
161+
3, 3,11, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE2 is the start of \u2028 and \u2029
162+
// First byte of a 4+ byte code point
163+
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
164+
};
165+
99166
/* Converts in_string to a JSON string (without the wrapping '"'
100167
* characters) in FBuffer out_buffer.
101168
*
@@ -106,13 +173,13 @@ static void raise_generator_error(VALUE invalid_object, const char *fmt, ...)
106173
*
107174
* - If out_ascii_only: non-ASCII characters (>0x7F)
108175
*
109-
* - If out_script_safe: forwardslash, line separator (U+2028), and
176+
* - If script_safe: forward slash (/), line separator (U+2028), and
110177
* paragraph separator (U+2029)
111178
*
112179
* Everything else (should be UTF-8) is just passed through and
113180
* appended to the result.
114181
*/
115-
static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256], bool out_script_safe)
182+
static inline void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256])
116183
{
117184
const char *hexdig = "0123456789abcdef";
118185
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
@@ -131,7 +198,7 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca
131198

132199
if (RB_UNLIKELY(ch_len)) {
133200
switch (ch_len) {
134-
case 1: {
201+
case 9: {
135202
FLUSH_POS(1);
136203
switch (ch) {
137204
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
@@ -153,9 +220,9 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca
153220
}
154221
break;
155222
}
156-
case 3: {
223+
case 11: {
157224
unsigned char b2 = ptr[pos + 1];
158-
if (RB_UNLIKELY(out_script_safe && ch == 0xE2 && b2 == 0x80)) {
225+
if (RB_UNLIKELY(b2 == 0x80)) {
159226
unsigned char b3 = ptr[pos + 2];
160227
if (b3 == 0xA8) {
161228
FLUSH_POS(3);
@@ -167,6 +234,7 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca
167234
break;
168235
}
169236
}
237+
ch_len = 3;
170238
// fallthrough
171239
}
172240
default:
@@ -186,104 +254,7 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca
186254
RB_GC_GUARD(str);
187255
}
188256

189-
static const char escape_table[256] = {
190-
// ASCII Control Characters
191-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
192-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
193-
// ASCII Characters
194-
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, // '"'
195-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
196-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
197-
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, // '\\'
198-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
199-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
200-
// Continuation byte
201-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
202-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
203-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
204-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
205-
// First byte of a 2-byte code point
206-
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
207-
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
208-
// First byte of a 4-byte code point
209-
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
210-
//First byte of a 4+byte code point
211-
4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1,
212-
};
213-
214-
static const char script_safe_escape_table[256] = {
215-
// ASCII Control Characters
216-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
217-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
218-
// ASCII Characters
219-
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1, // '"' and '/'
220-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
221-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
222-
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, // '\\'
223-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
224-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
225-
// Continuation byte
226-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
227-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
228-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
229-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
230-
// First byte of a 2-byte code point
231-
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
232-
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
233-
// First byte of a 4-byte code point
234-
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
235-
//First byte of a 4+byte code point
236-
4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1,
237-
};
238-
239-
static void convert_ASCII_to_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256])
240-
{
241-
const char *hexdig = "0123456789abcdef";
242-
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
243-
244-
const char *ptr = RSTRING_PTR(str);
245-
unsigned long len = RSTRING_LEN(str);
246-
247-
unsigned long beg = 0, pos;
248-
249-
for (pos = 0; pos < len;) {
250-
unsigned char ch = ptr[pos];
251-
/* JSON encoding */
252-
if (escape_table[ch]) {
253-
if (pos > beg) {
254-
fbuffer_append(out_buffer, &ptr[beg], pos - beg);
255-
}
256-
257-
beg = pos + 1;
258-
switch (ch) {
259-
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
260-
case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
261-
case '/': fbuffer_append(out_buffer, "\\/", 2); break;
262-
case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
263-
case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
264-
case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
265-
case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
266-
case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
267-
default:
268-
scratch[2] = '0';
269-
scratch[3] = '0';
270-
scratch[4] = hexdig[(ch >> 4) & 0xf];
271-
scratch[5] = hexdig[ch & 0xf];
272-
fbuffer_append(out_buffer, scratch, 6);
273-
}
274-
}
275-
276-
pos++;
277-
}
278-
279-
if (beg < len) {
280-
fbuffer_append(out_buffer, &ptr[beg], len - beg);
281-
}
282-
283-
RB_GC_GUARD(str);
284-
}
285-
286-
static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256], bool out_script_safe)
257+
static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256])
287258
{
288259
const char *hexdig = "0123456789abcdef";
289260
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
@@ -301,7 +272,7 @@ static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, cons
301272

302273
if (RB_UNLIKELY(ch_len)) {
303274
switch (ch_len) {
304-
case 1: {
275+
case 9: {
305276
FLUSH_POS(1);
306277
switch (ch) {
307278
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
@@ -325,6 +296,8 @@ static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, cons
325296
}
326297
default: {
327298
uint32_t wchar = 0;
299+
ch_len = ch_len & CHAR_LENGTH_MASK;
300+
328301
switch(ch_len) {
329302
case 2:
330303
wchar = ptr[pos] & 0x1F;
@@ -935,13 +908,11 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat
935908

936909
switch(rb_enc_str_coderange(obj)) {
937910
case ENC_CODERANGE_7BIT:
938-
convert_ASCII_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table);
939-
break;
940911
case ENC_CODERANGE_VALID:
941912
if (RB_UNLIKELY(state->ascii_only)) {
942-
convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table, state->script_safe);
913+
convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : ascii_only_escape_table);
943914
} else {
944-
convert_UTF8_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table, state->script_safe);
915+
convert_UTF8_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table);
945916
}
946917
break;
947918
default:

0 commit comments

Comments
 (0)