3434#include <x86intrin.h>
3535#endif
3636#endif
37+ #ifdef __ARM_FEATURE_SVE
38+ #include <arm_sve.h>
39+ #endif
40+ #ifdef __ARM_NEON
41+ #include <arm_neon.h>
42+ #endif
3743#include "picohttpparser.h"
3844
3945#if __GNUC__ >= 3
7177#define ADVANCE_TOKEN (tok , toklen ) \
7278 do { \
7379 const char *tok_start = buf; \
74- static const char ALIGNED(16) ranges2[16] = "\000\040\177\177"; \
7580 int found2; \
76- buf = findchar_fast (buf, buf_end, ranges2, 4, &found2); \
81+ buf = findchar_nonprintable_fast (buf, buf_end, &found2); \
7782 if (!found2) { \
7883 CHECK_EOF(); \
7984 } \
@@ -131,6 +136,66 @@ static const char *findchar_fast(const char *buf, const char *buf_end, const cha
131136 return buf ;
132137}
133138
/* Returns a pointer to the first "non-printable" byte in [buf, buf_end):
 * any byte below '\041' (0x21, i.e. control characters and space) or equal
 * to '\177' (DEL).  On a match, sets *found to 1 and returns its position.
 * Like findchar_fast, the vectorized paths may stop early with *found == 0
 * and buf < buf_end (an unscanned tail shorter than one vector); the caller
 * is expected to finish the remainder with scalar code. */
static const char *findchar_nonprintable_fast(const char *buf, const char *buf_end, int *found)
{
#ifdef __ARM_FEATURE_SVE
    *found = 0;

    /* i advances by the hardware's SVE vector length (in bytes) per
     * iteration; svqincb saturates rather than wrapping. */
    for (uint64_t i = 0;; i = svqincb(i, 1)) {
        const uint64_t len = buf_end - buf;
        /* Per-lane predicate: active for lanes where i + lane < len, so the
         * final partial chunk is handled without over-reading. */
        const svbool_t pg = svwhilelt_b8(i, len);

        if (!svptest_first(svptrue_b8(), pg)) {
            /* No active lanes left: input exhausted without a match. */
            buf = buf_end;
            break;
        }

        const svuint8_t v = svld1(pg, (const uint8_t *)buf + i);
        /* Lanes with a byte below 0x21 ... */
        svbool_t c = svcmplt(pg, v, '\041');

        /* ... or equal to DEL (0x7f). */
        c = svorr_z(pg, c, svcmpeq(pg, v, '\177'));

        if (svptest_any(pg, c)) {
            *found = 1;
            /* svbrkb_z keeps only the lanes *before* the first match;
             * counting them yields the match's offset within this chunk. */
            c = svbrkb_z(pg, c);
            buf += i + svcntp_b8(pg, c);
            break;
        }
    }

    return buf;
#elif defined(__ARM_NEON) && defined(__ARM_64BIT_STATE)
    *found = 0;

    const size_t block_size = sizeof(uint8x16_t) - 1;
    /* Last start position from which a full 16-byte load stays in bounds;
     * if fewer than 15 bytes remain, the loop body never runs. */
    const char *const end = (size_t)(buf_end - buf) >= block_size ? buf_end - block_size : buf;

    for (; buf < end; buf += sizeof(uint8x16_t)) {
        uint8x16_t v = vld1q_u8((const uint8_t *)buf);

        /* Each lane becomes 0xff when its byte is < 0x21 or == DEL. */
        v = vorrq_u8(vcltq_u8(v, vmovq_n_u8('\041')), vceqq_u8(v, vmovq_n_u8('\177')));

        /* Pack the comparison result into 64 bits: vshrn narrows each
         * 16-bit pair to 8 bits, leaving 4 result bits per input byte. */
        const uint8x8_t rv = vshrn_n_u16(vreinterpretq_u16_u8(v), 4);
        uint64_t offset = vget_lane_u64(vreinterpret_u64_u8(rv), 0);

        if (offset) {
            *found = 1;
            static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "Need the number of leading 0-bits in uint64_t.");
            /* offset uses 4 bits per byte of input. */
            buf += __builtin_ctzll(offset) / 4;
            break;
        }
    }

    return buf;
#else
    /* Generic fallback: findchar_fast range pairs covering [0x00, 0x20]
     * and [0x7f, 0x7f] (4 significant bytes in ranges2). */
    static const char ALIGNED(16) ranges2[16] = "\000\040\177\177";

    return findchar_fast(buf, buf_end, ranges2, 4, found);
#endif
}
198+
134199static const char * get_token_to_eol (const char * buf , const char * buf_end , const char * * token , size_t * token_len , int * ret )
135200{
136201 const char * token_start = buf ;
@@ -143,6 +208,80 @@ static const char *get_token_to_eol(const char *buf, const char *buf_end, const
143208 buf = findchar_fast (buf , buf_end , ranges1 , 6 , & found );
144209 if (found )
145210 goto FOUND_CTL ;
211+ #elif defined(__ARM_FEATURE_SVE )
212+ for (uint64_t i = 0 ;; i = svqincb (i , 1 )) {
213+ const uint64_t len = buf_end - buf ;
214+ const svbool_t pg = svwhilelt_b8 (i , len );
215+
216+ if (!svptest_first (svptrue_b8 (), pg )) {
217+ buf = buf_end ;
218+ break ;
219+ }
220+
221+ const svuint8_t v = svld1 (pg , (const uint8_t * )buf + i );
222+ const uint8_t space = '\040' ;
223+ svbool_t c = svcmpge (pg , svsub_x (pg , v , space ), 0137u );
224+
225+ if (svptest_any (pg , c )) {
226+ c = svcmplt (pg , v , space );
227+ c = svcmpne (c , v , '\011' );
228+ c = svorr_z (pg , c , svcmpeq (pg , v , '\177' ));
229+
230+ if (svptest_any (pg , c )) {
231+ c = svbrkb_z (pg , c );
232+ buf += i + svcntp_b8 (pg , c );
233+ goto FOUND_CTL ;
234+ }
235+ }
236+ }
237+ #elif defined(__ARM_NEON ) && defined(__ARM_64BIT_STATE )
238+ const size_t block_size = 2 * sizeof (uint8x16_t ) - 1 ;
239+ const char * const end = (size_t )(buf_end - buf ) >= block_size ? buf_end - block_size : buf ;
240+
241+ for (; buf < end ; buf += 2 * sizeof (uint8x16_t )) {
242+ const uint8x16_t space = vmovq_n_u8 ('\040' );
243+ const uint8x16_t threshold = vmovq_n_u8 (0137u );
244+ const uint8x16_t v1 = vld1q_u8 ((const uint8_t * )buf );
245+ const uint8x16_t v2 = vld1q_u8 ((const uint8_t * )buf + sizeof (v1 ));
246+ uint8x16_t v3 = vsubq_u8 (v1 , space );
247+ uint8x16_t v4 = vsubq_u8 (v2 , space );
248+
249+ v3 = vcgeq_u8 (v3 , threshold );
250+ v4 = vcgeq_u8 (v4 , threshold );
251+ v3 = vorrq_u8 (v3 , v4 );
252+ /* Pack the comparison result into half a vector, i.e. 64 bits. */
253+ v3 = vpmaxq_u8 (v3 , v3 );
254+
255+ if (vgetq_lane_u64 (vreinterpretq_u64_u8 (v3 ), 0 )) {
256+ const uint8x16_t del = vmovq_n_u8 ('\177' );
257+ /* This mask makes it possible to pack the comparison results into half a vector,
258+ * which has the same size as uint64_t. */
259+ const uint8x16_t mask = vreinterpretq_u8_u32 (vmovq_n_u32 (0x40100401 ));
260+ const uint8x16_t tab = vmovq_n_u8 ('\011' );
261+
262+ v3 = vcltq_u8 (v1 , space );
263+ v4 = vcltq_u8 (v2 , space );
264+ v3 = vbicq_u8 (v3 , vceqq_u8 (v1 , tab ));
265+ v4 = vbicq_u8 (v4 , vceqq_u8 (v2 , tab ));
266+ v3 = vorrq_u8 (v3 , vceqq_u8 (v1 , del ));
267+ v4 = vorrq_u8 (v4 , vceqq_u8 (v2 , del ));
268+ /* After masking, four consecutive bytes in the results do not have the same bits set. */
269+ v3 = vandq_u8 (v3 , mask );
270+ v4 = vandq_u8 (v4 , mask );
271+ /* Pack the comparison results into 128, and then 64 bits. */
272+ v3 = vpaddq_u8 (v3 , v4 );
273+ v3 = vpaddq_u8 (v3 , v3 );
274+
275+ uint64_t offset = vgetq_lane_u64 (vreinterpretq_u64_u8 (v3 ), 0 );
276+
277+ if (offset ) {
278+ static_assert (sizeof (unsigned long long ) == sizeof (uint64_t ), "Need the number of leading 0-bits in uint64_t." );
279+ /* offset uses 2 bits per byte of input. */
280+ buf += __builtin_ctzll (offset ) / 2 ;
281+ goto FOUND_CTL ;
282+ }
283+ }
284+ }
146285#else
147286 /* find non-printable char within the next 8 bytes, this is the hottest code; manually inlined */
148287 while (likely (buf_end - buf >= 8 )) {
0 commit comments