Skip to content

Commit 50b6cd4

Browse files
samyronhsbt
authored andcommitted
Optimize 'json_parse_string' using SIMD.
1 parent d6bfb73 commit 50b6cd4

File tree

6 files changed

+371
-189
lines changed

6 files changed

+371
-189
lines changed

ext/json/ext/simd/simd.h

Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
typedef enum {
2+
SIMD_NONE,
3+
SIMD_NEON,
4+
SIMD_SSE2
5+
} SIMD_Implementation;
6+
7+
#ifdef JSON_ENABLE_SIMD
8+
9+
#ifdef __clang__
10+
#if __has_builtin(__builtin_ctzll)
11+
#define HAVE_BUILTIN_CTZLL 1
12+
#else
13+
#define HAVE_BUILTIN_CTZLL 0
14+
#endif
15+
#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
16+
#define HAVE_BUILTIN_CTZLL 1
17+
#else
18+
#define HAVE_BUILTIN_CTZLL 0
19+
#endif
20+
21+
static inline uint32_t trailing_zeros64(uint64_t input) {
22+
#if HAVE_BUILTIN_CTZLL
23+
return __builtin_ctzll(input);
24+
#else
25+
uint32_t trailing_zeros = 0;
26+
uint64_t temp = input;
27+
while ((temp & 1) == 0 && temp > 0) {
28+
trailing_zeros++;
29+
temp >>= 1;
30+
}
31+
return trailing_zeros;
32+
#endif
33+
}
34+
35+
static inline int trailing_zeros(int input) {
36+
#if HAVE_BUILTIN_CTZLL
37+
return __builtin_ctz(input);
38+
#else
39+
int trailing_zeros = 0;
40+
int temp = input;
41+
while ((temp & 1) == 0 && temp > 0) {
42+
trailing_zeros++;
43+
temp >>= 1;
44+
}
45+
return trailing_zeros;
46+
#endif
47+
}
48+
49+
#if (defined(__GNUC__ ) || defined(__clang__))
50+
#define FORCE_INLINE __attribute__((always_inline))
51+
#else
52+
#define FORCE_INLINE
53+
#endif
54+
55+
56+
#define SIMD_MINIMUM_THRESHOLD 6
57+
58+
#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64)
59+
#include <arm_neon.h>
60+
61+
#define FIND_SIMD_IMPLEMENTATION_DEFINED 1
62+
static SIMD_Implementation find_simd_implementation(void) {
63+
return SIMD_NEON;
64+
}
65+
66+
#define HAVE_SIMD 1
67+
#define HAVE_SIMD_NEON 1
68+
69+
// See: https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
70+
static inline FORCE_INLINE uint64_t neon_match_mask(uint8x16_t matches)
71+
{
72+
const uint8x8_t res = vshrn_n_u16(vreinterpretq_u16_u8(matches), 4);
73+
const uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(res), 0);
74+
return mask & 0x8888888888888888ull;
75+
}
76+
77+
static inline FORCE_INLINE uint64_t compute_chunk_mask_neon(const char *ptr)
78+
{
79+
uint8x16_t chunk = vld1q_u8((const unsigned char *)ptr);
80+
81+
// Trick: c < 32 || c == 34 can be factored as c ^ 2 < 33
82+
// https://lemire.me/blog/2025/04/13/detect-control-characters-quotes-and-backslashes-efficiently-using-swar/
83+
const uint8x16_t too_low_or_dbl_quote = vcltq_u8(veorq_u8(chunk, vdupq_n_u8(2)), vdupq_n_u8(33));
84+
85+
uint8x16_t has_backslash = vceqq_u8(chunk, vdupq_n_u8('\\'));
86+
uint8x16_t needs_escape = vorrq_u8(too_low_or_dbl_quote, has_backslash);
87+
return neon_match_mask(needs_escape);
88+
}
89+
90+
static inline FORCE_INLINE int string_scan_simd_neon(const char **ptr, const char *end, uint64_t *mask)
91+
{
92+
while(*ptr + sizeof(uint8x16_t) <= end) {
93+
uint64_t chunk_mask = compute_chunk_mask_neon(*ptr);
94+
if (chunk_mask) {
95+
*mask = chunk_mask;
96+
return 1;
97+
}
98+
*ptr += sizeof(uint8x16_t);
99+
}
100+
return 0;
101+
}
102+
103+
uint8x16x4_t load_uint8x16_4(const unsigned char *table) {
104+
uint8x16x4_t tab;
105+
tab.val[0] = vld1q_u8(table);
106+
tab.val[1] = vld1q_u8(table+16);
107+
tab.val[2] = vld1q_u8(table+32);
108+
tab.val[3] = vld1q_u8(table+48);
109+
return tab;
110+
}
111+
112+
#endif /* ARM Neon Support.*/
113+
114+
#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
115+
116+
#ifdef HAVE_X86INTRIN_H
117+
#include <x86intrin.h>
118+
119+
#define HAVE_SIMD 1
120+
#define HAVE_SIMD_SSE2 1
121+
122+
#ifdef HAVE_CPUID_H
123+
#define FIND_SIMD_IMPLEMENTATION_DEFINED 1
124+
125+
#if defined(__clang__) || defined(__GNUC__)
126+
#define TARGET_SSE2 __attribute__((target("sse2")))
127+
#else
128+
#define TARGET_SSE2
129+
#endif
130+
131+
#define _mm_cmpge_epu8(a, b) _mm_cmpeq_epi8(_mm_max_epu8(a, b), a)
132+
#define _mm_cmple_epu8(a, b) _mm_cmpge_epu8(b, a)
133+
#define _mm_cmpgt_epu8(a, b) _mm_xor_si128(_mm_cmple_epu8(a, b), _mm_set1_epi8(-1))
134+
#define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a)
135+
136+
static inline TARGET_SSE2 FORCE_INLINE int compute_chunk_mask_sse2(const char *ptr)
137+
{
138+
__m128i chunk = _mm_loadu_si128((__m128i const*)ptr);
139+
// Trick: c < 32 || c == 34 can be factored as c ^ 2 < 33
140+
// https://lemire.me/blog/2025/04/13/detect-control-characters-quotes-and-backslashes-efficiently-using-swar/
141+
__m128i too_low_or_dbl_quote = _mm_cmplt_epu8(_mm_xor_si128(chunk, _mm_set1_epi8(2)), _mm_set1_epi8(33));
142+
__m128i has_backslash = _mm_cmpeq_epi8(chunk, _mm_set1_epi8('\\'));
143+
__m128i needs_escape = _mm_or_si128(too_low_or_dbl_quote, has_backslash);
144+
return _mm_movemask_epi8(needs_escape);
145+
}
146+
147+
static inline TARGET_SSE2 FORCE_INLINE int string_scan_simd_sse2(const char **ptr, const char *end, int *mask)
148+
{
149+
while (*ptr + sizeof(__m128i) <= end) {
150+
int chunk_mask = compute_chunk_mask_sse2(*ptr);
151+
if (chunk_mask) {
152+
*mask = chunk_mask;
153+
return 1;
154+
}
155+
*ptr += sizeof(__m128i);
156+
}
157+
158+
return 0;
159+
}
160+
161+
#include <cpuid.h>
162+
#endif /* HAVE_CPUID_H */
163+
164+
static SIMD_Implementation find_simd_implementation(void) {
165+
166+
#if defined(__GNUC__ ) || defined(__clang__)
167+
#ifdef __GNUC__
168+
__builtin_cpu_init();
169+
#endif /* __GNUC__ */
170+
171+
// TODO Revisit. I think the SSE version now only uses SSE2 instructions.
172+
if (__builtin_cpu_supports("sse2")) {
173+
return SIMD_SSE2;
174+
}
175+
#endif /* __GNUC__ || __clang__*/
176+
177+
return SIMD_NONE;
178+
}
179+
180+
#endif /* HAVE_X86INTRIN_H */
181+
#endif /* X86_64 Support */
182+
183+
#endif /* JSON_ENABLE_SIMD */
184+
185+
#ifndef FIND_SIMD_IMPLEMENTATION_DEFINED
186+
static SIMD_Implementation find_simd_implementation(void) {
187+
return SIMD_NONE;
188+
}
189+
#endif

ext/json/generator/generator.c

Lines changed: 6 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
#include <math.h>
66
#include <ctype.h>
77

8-
#include "simd.h"
8+
#include "../simd/simd.h"
99

1010
/* ruby api and some helpers */
1111

@@ -304,28 +304,6 @@ static inline FORCE_INLINE unsigned char neon_next_match(search_state *search)
304304
return 1;
305305
}
306306

307-
// See: https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
308-
static inline FORCE_INLINE uint64_t neon_match_mask(uint8x16_t matches)
309-
{
310-
const uint8x8_t res = vshrn_n_u16(vreinterpretq_u16_u8(matches), 4);
311-
const uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(res), 0);
312-
return mask & 0x8888888888888888ull;
313-
}
314-
315-
static inline FORCE_INLINE uint64_t neon_rules_update(const char *ptr)
316-
{
317-
uint8x16_t chunk = vld1q_u8((const unsigned char *)ptr);
318-
319-
// Trick: c < 32 || c == 34 can be factored as c ^ 2 < 33
320-
// https://lemire.me/blog/2025/04/13/detect-control-characters-quotes-and-backslashes-efficiently-using-swar/
321-
const uint8x16_t too_low_or_dbl_quote = vcltq_u8(veorq_u8(chunk, vdupq_n_u8(2)), vdupq_n_u8(33));
322-
323-
uint8x16_t has_backslash = vceqq_u8(chunk, vdupq_n_u8('\\'));
324-
uint8x16_t needs_escape = vorrq_u8(too_low_or_dbl_quote, has_backslash);
325-
326-
return neon_match_mask(needs_escape);
327-
}
328-
329307
static inline unsigned char search_escape_basic_neon(search_state *search)
330308
{
331309
if (RB_UNLIKELY(search->has_matches)) {
@@ -380,14 +358,8 @@ static inline unsigned char search_escape_basic_neon(search_state *search)
380358
* no bytes need to be escaped and we can continue to the next chunk. If the mask is not 0 then we
381359
* have at least one byte that needs to be escaped.
382360
*/
383-
while (search->ptr + sizeof(uint8x16_t) <= search->end) {
384-
uint64_t mask = neon_rules_update(search->ptr);
385361

386-
if (!mask) {
387-
search->ptr += sizeof(uint8x16_t);
388-
continue;
389-
}
390-
search->matches_mask = mask;
362+
if (string_scan_simd_neon(&search->ptr, search->end, &search->matches_mask)) {
391363
search->has_matches = true;
392364
search->chunk_base = search->ptr;
393365
search->chunk_end = search->ptr + sizeof(uint8x16_t);
@@ -399,7 +371,7 @@ static inline unsigned char search_escape_basic_neon(search_state *search)
399371
if (remaining >= SIMD_MINIMUM_THRESHOLD) {
400372
char *s = copy_remaining_bytes(search, sizeof(uint8x16_t), remaining);
401373

402-
uint64_t mask = neon_rules_update(s);
374+
uint64_t mask = compute_chunk_mask_neon(s);
403375

404376
if (!mask) {
405377
// Nothing to escape, ensure search_flush doesn't do anything by setting
@@ -428,11 +400,6 @@ static inline unsigned char search_escape_basic_neon(search_state *search)
428400

429401
#ifdef HAVE_SIMD_SSE2
430402

431-
#define _mm_cmpge_epu8(a, b) _mm_cmpeq_epi8(_mm_max_epu8(a, b), a)
432-
#define _mm_cmple_epu8(a, b) _mm_cmpge_epu8(b, a)
433-
#define _mm_cmpgt_epu8(a, b) _mm_xor_si128(_mm_cmple_epu8(a, b), _mm_set1_epi8(-1))
434-
#define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a)
435-
436403
static inline FORCE_INLINE unsigned char sse2_next_match(search_state *search)
437404
{
438405
int mask = search->matches_mask;
@@ -457,18 +424,6 @@ static inline FORCE_INLINE unsigned char sse2_next_match(search_state *search)
457424
#define TARGET_SSE2
458425
#endif
459426

460-
static inline TARGET_SSE2 FORCE_INLINE int sse2_update(const char *ptr)
461-
{
462-
__m128i chunk = _mm_loadu_si128((__m128i const*)ptr);
463-
464-
// Trick: c < 32 || c == 34 can be factored as c ^ 2 < 33
465-
// https://lemire.me/blog/2025/04/13/detect-control-characters-quotes-and-backslashes-efficiently-using-swar/
466-
__m128i too_low_or_dbl_quote = _mm_cmplt_epu8(_mm_xor_si128(chunk, _mm_set1_epi8(2)), _mm_set1_epi8(33));
467-
__m128i has_backslash = _mm_cmpeq_epi8(chunk, _mm_set1_epi8('\\'));
468-
__m128i needs_escape = _mm_or_si128(too_low_or_dbl_quote, has_backslash);
469-
return _mm_movemask_epi8(needs_escape);
470-
}
471-
472427
static inline TARGET_SSE2 FORCE_INLINE unsigned char search_escape_basic_sse2(search_state *search)
473428
{
474429
if (RB_UNLIKELY(search->has_matches)) {
@@ -487,17 +442,10 @@ static inline TARGET_SSE2 FORCE_INLINE unsigned char search_escape_basic_sse2(se
487442
}
488443
}
489444

490-
while (search->ptr + sizeof(__m128i) <= search->end) {
491-
int needs_escape_mask = sse2_update(search->ptr);
492-
493-
if (needs_escape_mask == 0) {
494-
search->ptr += sizeof(__m128i);
495-
continue;
496-
}
497-
445+
if (string_scan_simd_sse2(&search->ptr, search->end, &search->matches_mask)) {
498446
search->has_matches = true;
499-
search->matches_mask = needs_escape_mask;
500447
search->chunk_base = search->ptr;
448+
search->chunk_end = search->ptr + sizeof(__m128i);
501449
return sse2_next_match(search);
502450
}
503451

@@ -506,7 +454,7 @@ static inline TARGET_SSE2 FORCE_INLINE unsigned char search_escape_basic_sse2(se
506454
if (remaining >= SIMD_MINIMUM_THRESHOLD) {
507455
char *s = copy_remaining_bytes(search, sizeof(__m128i), remaining);
508456

509-
int needs_escape_mask = sse2_update(s);
457+
int needs_escape_mask = compute_chunk_mask_sse2(s);
510458

511459
if (needs_escape_mask == 0) {
512460
// Nothing to escape, ensure search_flush doesn't do anything by setting

0 commit comments

Comments
 (0)