Skip to content

Commit 99a3aea

Browse files
committed
Refactor convert_UTF8_to_JSON to split searching and escaping code
The goal is to be able to dispatch to more optimized search implementations without having to duplicate the escaping code. Somehow, this is a few % faster already: ``` == Encoding activitypub.json (52595 bytes) ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +YJIT +PRISM [arm64-darwin23] Warming up -------------------------------------- after 2.257k i/100ms Calculating ------------------------------------- after 22.930k (± 1.3%) i/s (43.61 μs/i) - 115.107k in 5.020814s Comparison: before: 21604.0 i/s after: 22930.1 i/s - 1.06x faster == Encoding citm_catalog.json (500298 bytes) ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +YJIT +PRISM [arm64-darwin23] Warming up -------------------------------------- after 137.000 i/100ms Calculating ------------------------------------- after 1.397k (± 1.1%) i/s (715.57 μs/i) - 6.987k in 5.000408s Comparison: before: 1344.4 i/s after: 1397.5 i/s - 1.04x faster == Encoding twitter.json (466906 bytes) ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +YJIT +PRISM [arm64-darwin23] Warming up -------------------------------------- after 249.000 i/100ms Calculating ------------------------------------- after 2.464k (± 1.8%) i/s (405.81 μs/i) - 12.450k in 5.054131s Comparison: before: 2326.5 i/s after: 2464.2 i/s - 1.06x faster ```
1 parent 1023227 commit 99a3aea

File tree

2 files changed

+94
-69
lines changed

2 files changed

+94
-69
lines changed

ext/json/ext/generator/generator.c

Lines changed: 88 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ static void raise_generator_error(VALUE invalid_object, const char *fmt, ...)
101101
// 0 - single byte char that doesn't need to be escaped.
102102
// (x | 8) - char that needs to be escaped.
103103
// Mask selecting the low three bits of an escape table entry; search_escape
// returns `entry & CHAR_LENGTH_MASK` as the byte length of the character.
static const unsigned char CHAR_LENGTH_MASK = 7;
104+
// Bit that is set in an escape table entry when the character needs to be
// escaped (the `(x | 8)` form described above).
static const unsigned char ESCAPE_MASK = 8;
104105

105106
static const unsigned char escape_table[256] = {
106107
// ASCII Control Characters
@@ -165,6 +166,84 @@ static const unsigned char script_safe_escape_table[256] = {
165166
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
166167
};
167168

169+
170+
typedef struct _search_state {
171+
const char *ptr;
172+
const char *end;
173+
const char *cursor;
174+
FBuffer *buffer;
175+
} search_state;
176+
177+
/* Appends the JSON escape sequence for the character at state->ptr to
 * state->buffer, then moves both state->ptr and state->cursor past it.
 *
 * ch_len is the byte length of the character being escaped:
 *   1 - a single byte: written as a two-character escape (\", \\, \/, \b,
 *       \f, \n, \r, \t) when one exists, otherwise as \u00XX.
 *   3 - written as \u2029 when the low bit of the third byte is set and as
 *       \u2028 otherwise; the first two bytes are not re-checked here.
 * Any other ch_len appends nothing but still advances by ch_len bytes.
 */
static inline void escape_UTF8_char(search_state *state, unsigned char ch_len) {
    static const char hex_digits[] = "0123456789abcdef";
    const unsigned char byte = (unsigned char)*state->ptr;

    if (ch_len == 1) {
        /* Generic \u00XX form, used when no short escape exists. */
        char unicode_escape[6] = {
            '\\', 'u', '0', '0',
            hex_digits[(byte >> 4) & 0xf],
            hex_digits[byte & 0xf]
        };
        const char *escaped = unicode_escape;
        int escaped_len = 2;

        switch (byte) {
          case '"':  escaped = "\\\""; break;
          case '\\': escaped = "\\\\"; break;
          case '/':  escaped = "\\/";  break;
          case '\b': escaped = "\\b";  break;
          case '\f': escaped = "\\f";  break;
          case '\n': escaped = "\\n";  break;
          case '\r': escaped = "\\r";  break;
          case '\t': escaped = "\\t";  break;
          default:   escaped_len = 6;  break;
        }

        fbuffer_append(state->buffer, escaped, escaped_len);
    } else if (ch_len == 3) {
        const char *separator = (state->ptr[2] & 1) ? "\\u2029" : "\\u2028";
        fbuffer_append(state->buffer, separator, 6);
    }

    /* The escaped character is consumed: nothing before ptr is pending. */
    state->ptr += ch_len;
    state->cursor = state->ptr;
}
212+
213+
/* Appends the input bytes in [state->cursor, state->ptr) to state->buffer
 * and marks them as written by moving the cursor up to ptr. */
static inline void search_flush(search_state *state)
{
    const char *pending = state->cursor;

    fbuffer_append(state->buffer, pending, state->ptr - pending);
    state->cursor = state->ptr;
}
218+
219+
/* Advances state->ptr until it reaches a character that must be escaped or
 * the end of the input, then flushes the bytes skipped so far to the buffer.
 *
 * Returns the byte length of the character to escape (state->ptr is left
 * pointing at it), or 0 once the whole input has been consumed.
 */
static inline unsigned char search_escape(search_state *state, const unsigned char escape_table[256])
{
    while (state->ptr < state->end) {
        const unsigned char *bytes = (const unsigned char *)state->ptr;
        const unsigned char entry = escape_table[bytes[0]];

        if (RB_UNLIKELY(entry)) {
            if (!(entry & ESCAPE_MASK)) {
                /* Multi-byte character that never needs escaping. */
                state->ptr += entry;
                continue;
            }

            /* Entry 11 (3 | ESCAPE_MASK) marks a three-byte character that is
             * escaped only when its second byte is 0x80 and its third byte is
             * 0xA8 or 0xA9; anything else is skipped as a plain character. */
            if (RB_UNLIKELY(entry == 11) && (bytes[1] != 0x80 || (bytes[2] >> 1) != 0x54)) {
                state->ptr += 3;
                continue;
            }

            search_flush(state);
            return entry & CHAR_LENGTH_MASK;
        }

        state->ptr++;
    }

    search_flush(state);
    return 0;
}
246+
168247
/* Converts in_string to a JSON string (without the wrapping '"'
169248
* characters) in FBuffer out_buffer.
170249
*
@@ -183,77 +262,17 @@ static const unsigned char script_safe_escape_table[256] = {
183262
*/
184263
static inline void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256])
{
    /* Set up the scan over the bytes of str: nothing examined, nothing
     * written to out_buffer yet. */
    long len;
    search_state state;
    state.buffer = out_buffer;
    RSTRING_GETMEM(str, state.ptr, len);
    state.cursor = state.ptr;
    state.end = state.ptr + len;

    /* search_escape copies every run of plain bytes to out_buffer and
     * returns the length of the next character needing an escape, or 0 when
     * the input is exhausted; escape_UTF8_char writes that escape. */
    unsigned char ch_len;
    while ((ch_len = search_escape(&state, escape_table))) {
        escape_UTF8_char(&state, ch_len);
    }

    /* state holds raw pointers into str while the buffer is appended to, so
     * keep str visible to the GC until the scan is finished. */
    RB_GC_GUARD(str);
}
258277

259278
static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256])

test/json/json_generator_test.rb

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -665,6 +665,12 @@ def test_nonutf8_encoding
665665
assert_equal("\"5\u{b0}\"", "5\xb0".dup.force_encoding(Encoding::ISO_8859_1).to_json)
666666
end
667667

668+
def test_utf8_multibyte
  # Multibyte characters are emitted unchanged, while the double quote and
  # the control characters are escaped.
  [
    ['["foßbar"]', ["foßbar"]],
    ['"n€ßt€ð2"', "n€ßt€ð2"],
    ['"\"\u0000\u001f"', "\"\u0000\u001f"],
  ].each do |expected_json, value|
    assert_equal(expected_json, JSON.generate(value))
  end
end
673+
668674
def test_fragment
669675
fragment = JSON::Fragment.new(" 42")
670676
assert_equal '{"number": 42}', JSON.generate({ number: fragment })

0 commit comments

Comments
 (0)