@@ -199,6 +199,31 @@ static inline const char *scan_string_noSIMD(const char *str, const char *end) {
199199 return str ;
200200}
201201
202+ #ifdef HAVE_SIMD_NEON
203+
204+ static inline const char * string_scan_neon (const char * str , const char * end ) {
205+ const uint8x16_t null_char = vdupq_n_u8 (0 );
206+ const uint8x16_t backslash = vdupq_n_u8 ('\\' );
207+ const uint8x16_t quote = vdupq_n_u8 ('"' );
208+
209+ while (str + sizeof (uint8x16_t ) <= end ) {
210+ uint8x16_t chunk = vld1q_u8 ((const uint8_t * )str );
211+ uint8x16_t tmp = vorrq_u8 (vorrq_u8 (vceqq_u8 (chunk , null_char ), vceqq_u8 (chunk , backslash )),
212+ vceqq_u8 (chunk , quote ));
213+ const uint8x8_t res = vshrn_n_u16 (vreinterpretq_u16_u8 (tmp ), 4 );
214+ uint64_t mask = vget_lane_u64 (vreinterpret_u64_u8 (res ), 0 );
215+ if (mask != 0 ) {
216+ mask &= 0x8888888888888888ull ;
217+ return str + (OJ_CTZ64 (mask ) >> 2 );
218+ }
219+ str += sizeof (uint8x16_t );
220+ }
221+
222+ return scan_string_noSIMD (str , end );
223+ }
224+
225+ #endif
226+
202227#ifdef HAVE_SIMD_SSE4_2
203228// Optimized SIMD string scanner using SSE4.2 instructions
204229// Uses prefetching and processes multiple chunks in parallel to reduce latency
@@ -357,10 +382,12 @@ void oj_scanner_init(void) {
357382#endif
358383#ifdef HAVE_SIMD_SSE2
359384 case SIMD_SSE2 : scan_func = scan_string_SSE2 ; break ;
385+ #endif
386+ #ifdef HAVE_SIMD_NEON
387+ case SIMD_NEON : scan_func = string_scan_neon ; break ;
360388#endif
361389 default : scan_func = scan_string_noSIMD ; break ;
362390 }
363- // Note: ARM NEON string scanning would be added here if needed
364391}
365392
366393// entered at /
0 commit comments