Skip to content

Commit cbd933b

Browse files
byroot authored and hsbt committed
[ruby/json] convert_UTF8_to_ASCII_only_JSON: apply the same optimization pass
ruby/json@42edaf7f17
1 parent e52b476 commit cbd933b

File tree

1 file changed

+71
-60
lines changed

1 file changed

+71
-60
lines changed

ext/json/generator/generator.c

Lines changed: 71 additions & 60 deletions
Original file line number | Diff line number | Diff line change
@@ -199,73 +199,80 @@ static void convert_ASCII_to_JSON(FBuffer *out_buffer, VALUE str, const char esc
199199
RB_GC_GUARD(str);
200200
}
201201

202-
static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE in_string, bool out_script_safe)
202+
static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256], bool out_script_safe)
203203
{
204204
const char *hexdig = "0123456789abcdef";
205205
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
206206

207-
const char *in_utf8_str = RSTRING_PTR(in_string);
208-
unsigned long in_utf8_len = RSTRING_LEN(in_string);
207+
const char *ptr = RSTRING_PTR(str);
208+
unsigned long len = RSTRING_LEN(str);
209209

210-
unsigned long beg = 0, pos;
210+
unsigned long beg = 0, pos = 0;
211211

212-
for (pos = 0; pos < in_utf8_len;) {
213-
uint32_t ch;
214-
short ch_len;
215-
bool should_escape;
216-
217-
/* UTF-8 decoding */
218-
short i;
219-
if ((in_utf8_str[pos] & 0x80) == 0x00) { ch_len = 1; ch = in_utf8_str[pos]; } /* leading 1 bit is 0b0 */
220-
else if ((in_utf8_str[pos] & 0xE0) == 0xC0) { ch_len = 2; ch = in_utf8_str[pos] & 0x1F; } /* leading 3 bits are 0b110 */
221-
else if ((in_utf8_str[pos] & 0xF0) == 0xE0) { ch_len = 3; ch = in_utf8_str[pos] & 0x0F; } /* leading 4 bits are 0b1110 */
222-
else if ((in_utf8_str[pos] & 0xF8) == 0xF0) { ch_len = 4; ch = in_utf8_str[pos] & 0x07; } /* leading 5 bits are 0b11110 */
223-
else {
224-
rb_raise(rb_path2class("JSON::GeneratorError"), "source sequence is illegal/malformed utf-8");
225-
}
212+
#define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos;
226213

227-
for (i = 1; i < ch_len; i++) {
228-
ch = (ch<<6) | (in_utf8_str[pos+i] & 0x3F);
229-
}
214+
while (pos < len) {
215+
unsigned char ch = ptr[pos];
216+
unsigned char ch_len = escape_table[ch];
230217

231-
/* JSON policy */
232-
should_escape =
233-
(ch < 0x20) ||
234-
(ch == '"') ||
235-
(ch == '\\') ||
236-
(ch > 0x7F) ||
237-
(out_script_safe && (ch == '/')) ||
238-
(out_script_safe && (ch == 0x2028)) ||
239-
(out_script_safe && (ch == 0x2029));
218+
if (RB_UNLIKELY(ch_len)) {
219+
switch (ch_len) {
220+
case 0:
221+
pos++;
222+
break;
223+
case 1: {
224+
FLUSH_POS(1);
225+
switch (ch) {
226+
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
227+
case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
228+
case '/': fbuffer_append(out_buffer, "\\/", 2); break;
229+
case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
230+
case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
231+
case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
232+
case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
233+
case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
234+
default: {
235+
scratch[2] = hexdig[ch >> 12];
236+
scratch[3] = hexdig[(ch >> 8) & 0xf];
237+
scratch[4] = hexdig[(ch >> 4) & 0xf];
238+
scratch[5] = hexdig[ch & 0xf];
239+
fbuffer_append(out_buffer, scratch, 6);
240+
break;
241+
}
242+
}
243+
break;
244+
}
245+
default: {
246+
uint32_t wchar = 0;
247+
switch(ch_len) {
248+
case 2:
249+
wchar = ptr[pos] & 0x1F;
250+
break;
251+
case 3:
252+
wchar = ptr[pos] & 0x0F;
253+
break;
254+
case 4:
255+
wchar = ptr[pos] & 0x07;
256+
break;
257+
}
240258

241-
/* JSON encoding */
242-
if (should_escape) {
243-
if (pos > beg) {
244-
fbuffer_append(out_buffer, &in_utf8_str[beg], pos - beg);
245-
}
259+
for (short i = 1; i < ch_len; i++) {
260+
wchar = (wchar<<6) | (ptr[pos+i] & 0x3F);
261+
}
246262

247-
beg = pos + ch_len;
248-
switch (ch) {
249-
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
250-
case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
251-
case '/': fbuffer_append(out_buffer, "\\/", 2); break;
252-
case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
253-
case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
254-
case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
255-
case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
256-
case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
257-
default:
258-
if (ch <= 0xFFFF) {
259-
scratch[2] = hexdig[ch >> 12];
260-
scratch[3] = hexdig[(ch >> 8) & 0xf];
261-
scratch[4] = hexdig[(ch >> 4) & 0xf];
262-
scratch[5] = hexdig[ch & 0xf];
263+
FLUSH_POS(ch_len);
264+
265+
if (wchar <= 0xFFFF) {
266+
scratch[2] = hexdig[wchar >> 12];
267+
scratch[3] = hexdig[(wchar >> 8) & 0xf];
268+
scratch[4] = hexdig[(wchar >> 4) & 0xf];
269+
scratch[5] = hexdig[wchar & 0xf];
263270
fbuffer_append(out_buffer, scratch, 6);
264271
} else {
265272
uint16_t hi, lo;
266-
ch -= 0x10000;
267-
hi = 0xD800 + (uint16_t)(ch >> 10);
268-
lo = 0xDC00 + (uint16_t)(ch & 0x3FF);
273+
wchar -= 0x10000;
274+
hi = 0xD800 + (uint16_t)(wchar >> 10);
275+
lo = 0xDC00 + (uint16_t)(wchar & 0x3FF);
269276

270277
scratch[2] = hexdig[hi >> 12];
271278
scratch[3] = hexdig[(hi >> 8) & 0xf];
@@ -279,17 +286,21 @@ static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE in_string
279286

280287
fbuffer_append(out_buffer, scratch, 12);
281288
}
289+
290+
break;
291+
}
282292
}
293+
} else {
294+
pos++;
283295
}
284-
285-
pos += ch_len;
286296
}
297+
#undef FLUSH_POS
287298

288-
if (beg < in_utf8_len) {
289-
fbuffer_append(out_buffer, &in_utf8_str[beg], in_utf8_len - beg);
299+
if (beg < len) {
300+
fbuffer_append(out_buffer, &ptr[beg], len - beg);
290301
}
291302

292-
RB_GC_GUARD(in_string);
303+
RB_GC_GUARD(str);
293304
}
294305

295306
static char *fstrndup(const char *ptr, unsigned long len) {
@@ -747,7 +758,7 @@ static void generate_json_string(FBuffer *buffer, VALUE Vstate, JSON_Generator_S
747758
break;
748759
case ENC_CODERANGE_VALID:
749760
if (RB_UNLIKELY(state->ascii_only)) {
750-
convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe);
761+
convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table, state->script_safe);
751762
} else {
752763
convert_UTF8_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table, state->script_safe);
753764
}

0 commit comments

Comments
 (0)