Skip to content

Commit 77f407d

Browse files
authored
Add an ARM Neon string scanning implementation. (#994)
* Add an ARM Neon string scanning implementation.
* Avoid undefined behavior in oj_ctz_msvc.
* Removing an unnecessary else clause.
1 parent 412e9e0 commit 77f407d

File tree

2 files changed

+50
-1
lines changed

2 files changed

+50
-1
lines changed

ext/oj/parse.c

Lines changed: 28 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -199,6 +199,31 @@ static inline const char *scan_string_noSIMD(const char *str, const char *end) {
199199
return str;
200200
}
201201

202+
#ifdef HAVE_SIMD_NEON
203+
204+
// Scans [str, end) in 16-byte chunks using ARM NEON and returns a pointer to
// the first NUL, backslash, or double-quote byte found in a full chunk.
// When fewer than 16 bytes remain, the rest of the range is handed to
// scan_string_noSIMD() and that function's result is returned.
static inline const char *string_scan_neon(const char *str, const char *end) {
205+
const uint8x16_t null_char = vdupq_n_u8(0);  // all 16 lanes = '\0'
206+
const uint8x16_t backslash = vdupq_n_u8('\\');  // all 16 lanes = backslash
207+
const uint8x16_t quote = vdupq_n_u8('"');  // all 16 lanes = double quote
208+
209+
while (str + sizeof(uint8x16_t) <= end) {  // only load when a whole 16-byte chunk is in range
210+
uint8x16_t chunk = vld1q_u8((const uint8_t *)str);
211+
uint8x16_t tmp = vorrq_u8(vorrq_u8(vceqq_u8(chunk, null_char), vceqq_u8(chunk, backslash)),
212+
vceqq_u8(chunk, quote));  // tmp lane is 0xFF where the byte matched any of the three, else 0x00
213+
const uint8x8_t res = vshrn_n_u16(vreinterpretq_u16_u8(tmp), 4);  // shift-right-and-narrow: 128-bit compare result -> 64 bits, one 4-bit nibble per input byte
214+
uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(res), 0);  // pull the 64-bit nibble mask into a scalar
215+
if (mask != 0) {  // at least one byte in this chunk matched
216+
mask &= 0x8888888888888888ull;  // keep a single bit (the top one) of every nibble
217+
return str + (OJ_CTZ64(mask) >> 2);  // trailing-zero count / 4 = index of the first matching byte
218+
}
219+
str += sizeof(uint8x16_t);  // no match: advance to the next chunk
220+
}
221+
222+
return scan_string_noSIMD(str, end);  // tail shorter than 16 bytes goes to the non-SIMD scanner
223+
}
224+
225+
#endif
226+
202227
#ifdef HAVE_SIMD_SSE4_2
203228
// Optimized SIMD string scanner using SSE4.2 instructions
204229
// Uses prefetching and processes multiple chunks in parallel to reduce latency
@@ -357,10 +382,12 @@ void oj_scanner_init(void) {
357382
#endif
358383
#ifdef HAVE_SIMD_SSE2
359384
case SIMD_SSE2: scan_func = scan_string_SSE2; break;
385+
#endif
386+
#ifdef HAVE_SIMD_NEON
387+
case SIMD_NEON: scan_func = string_scan_neon; break;
360388
#endif
361389
default: scan_func = scan_string_noSIMD; break;
362390
}
363-
// Note: ARM NEON string scanning would be added here if needed
364391
}
365392

366393
// entered at /

ext/oj/simd.h

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,14 +40,26 @@ SIMD_Implementation oj_get_simd_implementation(void);
4040
// Count trailing zeros (for SSE2 mask scanning)
4141
#if defined(__GNUC__) || defined(__clang__)
4242
#define OJ_CTZ(x) __builtin_ctz(x)
43+
#define OJ_CTZ64(x) __builtin_ctzll(x)
4344
#elif defined(_MSC_VER)
4445
#include <intrin.h>
4546
// MSVC implementation of count-trailing-zeros for a 32-bit value, built on
// the _BitScanForward intrinsic. Returns the index of the lowest set bit of
// x, or 32 when x is 0.
static __inline int oj_ctz_msvc(unsigned int x) {
4647
unsigned long index;
48+
if (0 == x) {  // zero has no set bit for _BitScanForward to report
49+
return 32;  // report the full bit width instead of reading an unset index
50+
}
4751
_BitScanForward(&index, x);  // stores the position of the lowest set bit in index
4852
return (int)index;
4953
}
54+
// MSVC implementation of count-trailing-zeros for a 64-bit value, built on
// the _BitScanForward64 intrinsic. Returns the index of the lowest set bit
// of x, or 64 when x is 0.
// NOTE(review): Microsoft documents _BitScanForward64 for 64-bit targets
// only; confirm that 32-bit MSVC builds never compile this branch.
static __inline int oj_ctz64_msvc(uint64_t x) {
55+
unsigned long index;
56+
if (_BitScanForward64(&index, x)) {  // nonzero result means a set bit was found and index is valid
57+
return (int)index;
58+
}
59+
return 64;  // x was 0: no set bit, report the full bit width
60+
}
5061
#define OJ_CTZ(x) oj_ctz_msvc(x)
62+
#define OJ_CTZ64(x) oj_ctz64_msvc(x)
5163
#else
5264
// Fallback: naive implementation
5365
static inline int oj_ctz_fallback(unsigned int x) {
@@ -58,7 +70,17 @@ static inline int oj_ctz_fallback(unsigned int x) {
5870
}
5971
return count;
6072
}
73+
74+
// Portable count-trailing-zeros for a 64-bit value, used when neither the
// GCC/Clang builtin nor the MSVC intrinsic branch is selected. Returns the
// number of consecutive zero bits starting at bit 0, or 64 when x is 0.
static inline int oj_ctz64_fallback(uint64_t x) {
75+
int count = 0;
76+
while ((x & 1) == 0 && count < 64) {  // the count < 64 bound ends the loop when x is 0
77+
x >>= 1;  // drop the zero bit just examined
78+
count++;
79+
}
80+
return count;
81+
}
6182
#define OJ_CTZ(x) oj_ctz_fallback(x)
83+
#define OJ_CTZ64(x) oj_ctz64_fallback(x)
6284
#endif
6385

6486
// =============================================================================

0 commit comments

Comments
 (0)