Skip to content

Commit 98c56de

Browse files
byroot authored and hsbt committed
[ruby/json] Refactor further to expose the simpler escape search possible
ruby/json@e03515ac8b
1 parent 98e1c28 commit 98c56de

File tree

1 file changed

+143
-97
lines changed

1 file changed

+143
-97
lines changed

ext/json/generator/generator.c

Lines changed: 143 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -103,20 +103,20 @@ static void raise_generator_error(VALUE invalid_object, const char *fmt, ...)
103103
static const unsigned char CHAR_LENGTH_MASK = 7;
104104
static const unsigned char ESCAPE_MASK = 8;
105105

106-
static const unsigned char escape_table[256] = {
107-
// ASCII Control Characters
108-
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
109-
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
110-
// ASCII Characters
111-
0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // '"'
112-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
113-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
114-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
115-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
116-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
117-
};
106+
typedef struct _search_state {
107+
const char *ptr;
108+
const char *end;
109+
const char *cursor;
110+
FBuffer *buffer;
111+
} search_state;
118112

119-
static const unsigned char ascii_only_escape_table[256] = {
113+
static inline void search_flush(search_state *search)
114+
{
115+
fbuffer_append(search->buffer, search->cursor, search->ptr - search->cursor);
116+
search->cursor = search->ptr;
117+
}
118+
119+
static const unsigned char escape_table_basic[256] = {
120120
// ASCII Control Characters
121121
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
122122
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
@@ -127,20 +127,105 @@ static const unsigned char ascii_only_escape_table[256] = {
127127
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
128128
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
129129
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
130-
// Continuation byte
131-
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
132-
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
133-
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
134-
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
135-
// First byte of a 2-byte code point
136-
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
137-
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
138-
// First byte of a 3-byte code point
139-
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
140-
//First byte of a 4+ byte code point
141-
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
142130
};
143131

132+
static inline unsigned char search_escape_basic(search_state *search)
133+
{
134+
while (search->ptr < search->end) {
135+
if (RB_UNLIKELY(escape_table_basic[(const unsigned char)*search->ptr])) {
136+
search_flush(search);
137+
return 1;
138+
} else {
139+
search->ptr++;
140+
}
141+
}
142+
search_flush(search);
143+
return 0;
144+
}
145+
146+
static inline void escape_UTF8_char_basic(search_state *search) {
147+
const unsigned char ch = (unsigned char)*search->ptr;
148+
switch (ch) {
149+
case '"': fbuffer_append(search->buffer, "\\\"", 2); break;
150+
case '\\': fbuffer_append(search->buffer, "\\\\", 2); break;
151+
case '/': fbuffer_append(search->buffer, "\\/", 2); break;
152+
case '\b': fbuffer_append(search->buffer, "\\b", 2); break;
153+
case '\f': fbuffer_append(search->buffer, "\\f", 2); break;
154+
case '\n': fbuffer_append(search->buffer, "\\n", 2); break;
155+
case '\r': fbuffer_append(search->buffer, "\\r", 2); break;
156+
case '\t': fbuffer_append(search->buffer, "\\t", 2); break;
157+
default: {
158+
const char *hexdig = "0123456789abcdef";
159+
char scratch[6] = { '\\', 'u', '0', '0', 0, 0 };
160+
scratch[4] = hexdig[(ch >> 4) & 0xf];
161+
scratch[5] = hexdig[ch & 0xf];
162+
fbuffer_append(search->buffer, scratch, 6);
163+
break;
164+
}
165+
}
166+
search->ptr++;
167+
search->cursor = search->ptr;
168+
}
169+
170+
/* Converts in_string to a JSON string (without the wrapping '"'
171+
* characters) in FBuffer out_buffer.
172+
*
173+
* Characters are JSON-escaped according to:
174+
*
175+
* - Always: ASCII control characters (0x00-0x1F), dquote, and
176+
* backslash.
177+
*
178+
* - If out_ascii_only: non-ASCII characters (>0x7F)
179+
*
180+
* - If script_safe: forwardslash (/), line separator (U+2028), and
181+
* paragraph separator (U+2029)
182+
*
183+
* Everything else (should be UTF-8) is just passed through and
184+
* appended to the result.
185+
*/
186+
static inline void convert_UTF8_to_JSON(search_state *search)
187+
{
188+
while (search_escape_basic(search)) {
189+
escape_UTF8_char_basic(search);
190+
}
191+
}
192+
193+
static inline void escape_UTF8_char(search_state *search, unsigned char ch_len) {
194+
const unsigned char ch = (unsigned char)*search->ptr;
195+
switch (ch_len) {
196+
case 1: {
197+
switch (ch) {
198+
case '"': fbuffer_append(search->buffer, "\\\"", 2); break;
199+
case '\\': fbuffer_append(search->buffer, "\\\\", 2); break;
200+
case '/': fbuffer_append(search->buffer, "\\/", 2); break;
201+
case '\b': fbuffer_append(search->buffer, "\\b", 2); break;
202+
case '\f': fbuffer_append(search->buffer, "\\f", 2); break;
203+
case '\n': fbuffer_append(search->buffer, "\\n", 2); break;
204+
case '\r': fbuffer_append(search->buffer, "\\r", 2); break;
205+
case '\t': fbuffer_append(search->buffer, "\\t", 2); break;
206+
default: {
207+
const char *hexdig = "0123456789abcdef";
208+
char scratch[6] = { '\\', 'u', '0', '0', 0, 0 };
209+
scratch[4] = hexdig[(ch >> 4) & 0xf];
210+
scratch[5] = hexdig[ch & 0xf];
211+
fbuffer_append(search->buffer, scratch, 6);
212+
break;
213+
}
214+
}
215+
break;
216+
}
217+
case 3: {
218+
if (search->ptr[2] & 1) {
219+
fbuffer_append(search->buffer, "\\u2029", 6);
220+
} else {
221+
fbuffer_append(search->buffer, "\\u2028", 6);
222+
}
223+
break;
224+
}
225+
}
226+
search->cursor = (search->ptr += ch_len);
227+
}
228+
144229
static const unsigned char script_safe_escape_table[256] = {
145230
// ASCII Control Characters
146231
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
@@ -166,25 +251,11 @@ static const unsigned char script_safe_escape_table[256] = {
166251
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
167252
};
168253

169-
170-
typedef struct _search_state {
171-
const char *ptr;
172-
const char *end;
173-
const char *cursor;
174-
FBuffer *buffer;
175-
} search_state;
176-
177-
static inline void search_flush(search_state *search)
178-
{
179-
fbuffer_append(search->buffer, search->cursor, search->ptr - search->cursor);
180-
search->cursor = search->ptr;
181-
}
182-
183-
static inline unsigned char search_escape(search_state *search, const unsigned char escape_table[256])
254+
static inline unsigned char search_script_safe_escape(search_state *search)
184255
{
185256
while (search->ptr < search->end) {
186257
unsigned char ch = (unsigned char)*search->ptr;
187-
unsigned char ch_len = escape_table[ch];
258+
unsigned char ch_len = script_safe_escape_table[ch];
188259

189260
if (RB_UNLIKELY(ch_len)) {
190261
if (ch_len & ESCAPE_MASK) {
@@ -208,66 +279,39 @@ static inline unsigned char search_escape(search_state *search, const unsigned c
208279
return 0;
209280
}
210281

211-
static inline void fast_escape_UTF8_char(search_state *search, unsigned char ch_len) {
212-
const unsigned char ch = (unsigned char)*search->ptr;
213-
switch (ch_len) {
214-
case 1: {
215-
switch (ch) {
216-
case '"': fbuffer_append(search->buffer, "\\\"", 2); break;
217-
case '\\': fbuffer_append(search->buffer, "\\\\", 2); break;
218-
case '/': fbuffer_append(search->buffer, "\\/", 2); break;
219-
case '\b': fbuffer_append(search->buffer, "\\b", 2); break;
220-
case '\f': fbuffer_append(search->buffer, "\\f", 2); break;
221-
case '\n': fbuffer_append(search->buffer, "\\n", 2); break;
222-
case '\r': fbuffer_append(search->buffer, "\\r", 2); break;
223-
case '\t': fbuffer_append(search->buffer, "\\t", 2); break;
224-
default: {
225-
const char *hexdig = "0123456789abcdef";
226-
char scratch[6] = { '\\', 'u', '0', '0', 0, 0 };
227-
scratch[4] = hexdig[(ch >> 4) & 0xf];
228-
scratch[5] = hexdig[ch & 0xf];
229-
fbuffer_append(search->buffer, scratch, 6);
230-
break;
231-
}
232-
}
233-
break;
234-
}
235-
case 3: {
236-
if (search->ptr[2] & 1) {
237-
fbuffer_append(search->buffer, "\\u2029", 6);
238-
} else {
239-
fbuffer_append(search->buffer, "\\u2028", 6);
240-
}
241-
break;
242-
}
243-
}
244-
search->cursor = (search->ptr += ch_len);
245-
}
246-
247-
/* Converts in_string to a JSON string (without the wrapping '"'
248-
* characters) in FBuffer out_buffer.
249-
*
250-
* Characters are JSON-escaped according to:
251-
*
252-
* - Always: ASCII control characters (0x00-0x1F), dquote, and
253-
* backslash.
254-
*
255-
* - If out_ascii_only: non-ASCII characters (>0x7F)
256-
*
257-
* - If script_safe: forwardslash (/), line separator (U+2028), and
258-
* paragraph separator (U+2029)
259-
*
260-
* Everything else (should be UTF-8) is just passed through and
261-
* appended to the result.
262-
*/
263-
static inline void convert_UTF8_to_JSON(search_state *search, const unsigned char escape_table[256])
282+
static void convert_UTF8_to_script_safe_JSON(search_state *search)
264283
{
265284
unsigned char ch_len;
266-
while ((ch_len = search_escape(search, escape_table))) {
267-
fast_escape_UTF8_char(search, ch_len);
285+
while ((ch_len = search_script_safe_escape(search))) {
286+
escape_UTF8_char(search, ch_len);
268287
}
269288
}
270289

290+
static const unsigned char ascii_only_escape_table[256] = {
291+
// ASCII Control Characters
292+
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
293+
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
294+
// ASCII Characters
295+
0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // '"'
296+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
297+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
298+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
299+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
300+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
301+
// Continuation byte
302+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
303+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
304+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
305+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
306+
// First byte of a 2-byte code point
307+
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
308+
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
309+
// First byte of a 3-byte code point
310+
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
311+
//First byte of a 4+ byte code point
312+
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
313+
};
314+
271315
static inline unsigned char search_ascii_only_escape(search_state *search, const unsigned char escape_table[256])
272316
{
273317
while (search->ptr < search->end) {
@@ -934,8 +978,10 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat
934978
case ENC_CODERANGE_VALID:
935979
if (RB_UNLIKELY(state->ascii_only)) {
936980
convert_UTF8_to_ASCII_only_JSON(&search, state->script_safe ? script_safe_escape_table : ascii_only_escape_table);
981+
} else if (RB_UNLIKELY(state->script_safe)) {
982+
convert_UTF8_to_script_safe_JSON(&search);
937983
} else {
938-
convert_UTF8_to_JSON(&search, state->script_safe ? script_safe_escape_table : escape_table);
984+
convert_UTF8_to_JSON(&search);
939985
}
940986
break;
941987
default:

0 commit comments

Comments
 (0)