Skip to content

Commit c04a8f7

Browse files
committed
Optimizations for Armv8-A
1 parent 1d2b8a1 commit c04a8f7

File tree

1 file changed

+76
-2
lines changed

1 file changed

+76
-2
lines changed

picohttpparser.c

Lines changed: 76 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@
3434
#include <x86intrin.h>
3535
#endif
3636
#endif
37+
#ifdef __ARM_NEON
38+
#include <arm_neon.h>
39+
#endif
3740
#include "picohttpparser.h"
3841

3942
/* $Id$ */
@@ -73,9 +76,8 @@
7376
#define ADVANCE_TOKEN(tok, toklen) \
7477
do { \
7578
const char *tok_start = buf; \
76-
static const char ALIGNED(16) ranges2[] = "\000\040\177\177"; \
7779
int found2; \
78-
buf = findchar_fast(buf, buf_end, ranges2, sizeof(ranges2) - 1, &found2); \
80+
buf = findchar_nonprintable_fast(buf, buf_end, &found2); \
7981
if (!found2) { \
8082
CHECK_EOF(); \
8183
} \
@@ -133,6 +135,46 @@ static const char *findchar_fast(const char *buf, const char *buf_end, const cha
133135
return buf;
134136
}
135137

138+
static const char *findchar_nonprintable_fast(const char *buf, const char *buf_end, int *found)
139+
{
140+
#if defined(__ARM_64BIT_STATE) && defined(__ARM_FEATURE_UNALIGNED) && !defined(__ARM_BIG_ENDIAN)
141+
*found = 0;
142+
143+
for (size_t i = (buf_end - buf) / sizeof(uint8x16_t); i; i--) {
144+
// This mask makes it possible to pack the comparison result into half a vector,
145+
// which has the same size as uint64_t.
146+
const uint8x16_t mask = vreinterpretq_u8_u16(vmovq_n_u16(0x8008));
147+
uint8x16_t v = vld1q_u8((const uint8_t *) buf);
148+
149+
v = vorrq_u8(vcltq_u8(v, vmovq_n_u8('\041')), vceqq_u8(v, vmovq_n_u8('\177')));
150+
// After masking, a byte in the result does not have the same bits set as any of its neighbours.
151+
v = vandq_u8(v, mask);
152+
// Pack the comparison result into 64 bits.
153+
v = vpaddq_u8(v, v);
154+
155+
uint64_t offset = vgetq_lane_u64(vreinterpretq_u64_u8(v), 0);
156+
157+
if (offset) {
158+
*found = 1;
159+
__asm__ ("rbit %x0, %x0" : "+r" (offset));
160+
static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
161+
"Need the number of leading 0-bits in uint64_t.");
162+
// offset uses 4 bits per byte of input.
163+
buf += __builtin_clzll(offset) / 4;
164+
break;
165+
}
166+
167+
buf += sizeof(v);
168+
}
169+
170+
return buf;
171+
#else
172+
static const char ALIGNED(16) ranges2[] = "\000\040\177\177";
173+
174+
return findchar_fast(buf, buf_end, ranges2, sizeof(ranges2) - 1, found);
175+
#endif
176+
}
177+
136178
static const char *get_token_to_eol(const char *buf, const char *buf_end, const char **token, size_t *token_len, int *ret)
137179
{
138180
const char *token_start = buf;
@@ -149,6 +191,38 @@ static const char *get_token_to_eol(const char *buf, const char *buf_end, const
149191
buf = findchar_fast(buf, buf_end, ranges1, sizeof(ranges1) - 1, &found);
150192
if (found)
151193
goto FOUND_CTL;
194+
#elif defined(__ARM_64BIT_STATE) && defined(__ARM_FEATURE_UNALIGNED) && !defined(__ARM_BIG_ENDIAN)
195+
for (size_t i = (buf_end - buf) / sizeof(uint8x16_t); i; i--) {
196+
const uint8x16_t space = vmovq_n_u8('\040');
197+
const uint8x16_t v = vld1q_u8((const uint8_t *) buf);
198+
uint8x16_t v2 = vcgeq_u8(vsubq_u8(v, space), vmovq_n_u8(0137u));
199+
200+
// Pack the comparison result into half a vector, i.e. 64 bits; the result will still be non-zero
201+
// even if any adjacent bytes are the same (either 0 or 0xFF).
202+
v2 = vpaddq_u8(v2, v2);
203+
204+
if (vgetq_lane_u64(vreinterpretq_u64_u8(v2), 0)) {
205+
v2 = vandq_u8(vcltq_u8(v, space), vmvnq_u8(vceqq_u8(v, vmovq_n_u8('\011'))));
206+
v2 = vorrq_u8(v2, vceqq_u8(v, vmovq_n_u8('\177')));
207+
// After masking, a byte in the result does not have the same bits set as any of its neighbours.
208+
v2 = vandq_u8(v2, vreinterpretq_u8_u16(vmovq_n_u16(0x8008)));
209+
// Pack the comparison result into 64 bits.
210+
v2 = vpaddq_u8(v2, v2);
211+
212+
uint64_t offset = vgetq_lane_u64(vreinterpretq_u64_u8(v2), 0);
213+
214+
if (offset) {
215+
__asm__ ("rbit %x0, %x0" : "+r" (offset));
216+
static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
217+
"Need the number of leading 0-bits in uint64_t.");
218+
// offset uses 4 bits per byte of input.
219+
buf += __builtin_clzll(offset) / 4;
220+
goto FOUND_CTL;
221+
}
222+
}
223+
224+
buf += sizeof(v);
225+
}
152226
#else
153227
/* find non-printable char within the next 8 bytes, this is the hottest code; manually inlined */
154228
while (likely(buf_end - buf >= 8)) {

0 commit comments

Comments
 (0)