3434#include <x86intrin.h>
3535#endif
3636#endif
37+ #ifdef __ARM_NEON
38+ #include <arm_neon.h>
39+ #endif
3740#include "picohttpparser.h"
3841
3942/* $Id$ */
7376#define ADVANCE_TOKEN (tok , toklen ) \
7477 do { \
7578 const char *tok_start = buf; \
76- static const char ALIGNED(16) ranges2[] = "\000\040\177\177"; \
7779 int found2; \
78- buf = findchar_fast (buf, buf_end, ranges2, sizeof(ranges2) - 1, &found2); \
80+ buf = findchar_nonprintable_fast (buf, buf_end, &found2); \
7981 if (!found2) { \
8082 CHECK_EOF(); \
8183 } \
@@ -133,6 +135,46 @@ static const char *findchar_fast(const char *buf, const char *buf_end, const cha
133135 return buf ;
134136}
135137
138+ static const char * findchar_nonprintable_fast (const char * buf , const char * buf_end , int * found )
139+ {
140+ #if defined(__ARM_64BIT_STATE ) && defined(__ARM_FEATURE_UNALIGNED ) && !defined(__ARM_BIG_ENDIAN )
141+ * found = 0 ;
142+
143+ for (size_t i = (buf_end - buf ) / sizeof (uint8x16_t ); i ; i -- ) {
144+ // This mask makes it possible to pack the comparison result into half a vector,
145+ // which has the same size as uint64_t.
146+ const uint8x16_t mask = vreinterpretq_u8_u16 (vmovq_n_u16 (0x8008 ));
147+ uint8x16_t v = vld1q_u8 ((const uint8_t * ) buf );
148+
149+ v = vorrq_u8 (vcltq_u8 (v , vmovq_n_u8 ('\041' )), vceqq_u8 (v , vmovq_n_u8 ('\177' )));
150+ // After masking, a byte in the result does not have the same bits set as any of its neighbours.
151+ v = vandq_u8 (v , mask );
152+ // Pack the comparison result into 64 bits.
153+ v = vpaddq_u8 (v , v );
154+
155+ uint64_t offset = vgetq_lane_u64 (vreinterpretq_u64_u8 (v ), 0 );
156+
157+ if (offset ) {
158+ * found = 1 ;
159+ __asm__ ("rbit %x0, %x0" : "+r" (offset ));
160+ static_assert (sizeof (unsigned long long ) == sizeof (uint64_t ),
161+ "Need the number of leading 0-bits in uint64_t." );
162+ // offset uses 4 bits per byte of input.
163+ buf += __builtin_clzll (offset ) / 4 ;
164+ break ;
165+ }
166+
167+ buf += sizeof (v );
168+ }
169+
170+ return buf ;
171+ #else
172+ static const char ALIGNED (16 ) ranges2 [] = "\000\040\177\177" ;
173+
174+ return findchar_fast (buf , buf_end , ranges2 , sizeof (ranges2 ) - 1 , found );
175+ #endif
176+ }
177+
136178static const char * get_token_to_eol (const char * buf , const char * buf_end , const char * * token , size_t * token_len , int * ret )
137179{
138180 const char * token_start = buf ;
@@ -149,6 +191,38 @@ static const char *get_token_to_eol(const char *buf, const char *buf_end, const
149191 buf = findchar_fast (buf , buf_end , ranges1 , sizeof (ranges1 ) - 1 , & found );
150192 if (found )
151193 goto FOUND_CTL ;
194+ #elif defined(__ARM_64BIT_STATE ) && defined(__ARM_FEATURE_UNALIGNED ) && !defined(__ARM_BIG_ENDIAN )
195+ for (size_t i = (buf_end - buf ) / sizeof (uint8x16_t ); i ; i -- ) {
196+ const uint8x16_t space = vmovq_n_u8 ('\040' );
197+ const uint8x16_t v = vld1q_u8 ((const uint8_t * ) buf );
198+ uint8x16_t v2 = vcgeq_u8 (vsubq_u8 (v , space ), vmovq_n_u8 (0137u ));
199+
200+ // Pack the comparison result into half a vector, i.e. 64 bits; the result will still be non-zero
201+ // even if any adjacent bytes are the same (either 0 or 0xFF).
202+ v2 = vpaddq_u8 (v2 , v2 );
203+
204+ if (vgetq_lane_u64 (vreinterpretq_u64_u8 (v2 ), 0 )) {
205+ v2 = vandq_u8 (vcltq_u8 (v , space ), vmvnq_u8 (vceqq_u8 (v , vmovq_n_u8 ('\011' ))));
206+ v2 = vorrq_u8 (v2 , vceqq_u8 (v , vmovq_n_u8 ('\177' )));
207+ // After masking, a byte in the result does not have the same bits set as any of its neighbours.
208+ v2 = vandq_u8 (v2 , vreinterpretq_u8_u16 (vmovq_n_u16 (0x8008 )));
209+ // Pack the comparison result into 64 bits.
210+ v2 = vpaddq_u8 (v2 , v2 );
211+
212+ uint64_t offset = vgetq_lane_u64 (vreinterpretq_u64_u8 (v2 ), 0 );
213+
214+ if (offset ) {
215+ __asm__ ("rbit %x0, %x0" : "+r" (offset ));
216+ static_assert (sizeof (unsigned long long ) == sizeof (uint64_t ),
217+ "Need the number of leading 0-bits in uint64_t." );
218+ // offset uses 4 bits per byte of input.
219+ buf += __builtin_clzll (offset ) / 4 ;
220+ goto FOUND_CTL ;
221+ }
222+ }
223+
224+ buf += sizeof (v );
225+ }
152226#else
153227 /* find non-printable char within the next 8 bytes, this is the hottest code; manually inlined */
154228 while (likely (buf_end - buf >= 8 )) {
0 commit comments