Skip to content

Commit a47c4d0

Browse files
committed
Optimizations for Armv8-A
These changes apply only to the AArch64 execution state.
1 parent f832609 commit a47c4d0

File tree

1 file changed

+141
-2
lines changed

1 file changed

+141
-2
lines changed

picohttpparser.c

Lines changed: 141 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,12 @@
3434
#include <x86intrin.h>
3535
#endif
3636
#endif
37+
#ifdef __ARM_FEATURE_SVE
38+
#include <arm_sve.h>
39+
#endif
40+
#ifdef __ARM_NEON
41+
#include <arm_neon.h>
42+
#endif
3743
#include "picohttpparser.h"
3844

3945
#if __GNUC__ >= 3
@@ -71,9 +77,8 @@
7177
#define ADVANCE_TOKEN(tok, toklen) \
7278
do { \
7379
const char *tok_start = buf; \
74-
static const char ALIGNED(16) ranges2[16] = "\000\040\177\177"; \
7580
int found2; \
76-
buf = findchar_fast(buf, buf_end, ranges2, 4, &found2); \
81+
buf = findchar_nonprintable_fast(buf, buf_end, &found2); \
7782
if (!found2) { \
7883
CHECK_EOF(); \
7984
} \
@@ -131,6 +136,66 @@ static const char *findchar_fast(const char *buf, const char *buf_end, const cha
131136
return buf;
132137
}
133138

139+
static const char *findchar_nonprintable_fast(const char *buf, const char *buf_end, int *found)
140+
{
141+
#ifdef __ARM_FEATURE_SVE
142+
*found = 0;
143+
144+
for (uint64_t i = 0;; i = svqincb(i, 1)) {
145+
const uint64_t len = buf_end - buf;
146+
const svbool_t pg = svwhilelt_b8(i, len);
147+
148+
if (!svptest_first(svptrue_b8(), pg)) {
149+
buf = buf_end;
150+
break;
151+
}
152+
153+
const svuint8_t v = svld1(pg, (const uint8_t *)buf + i);
154+
svbool_t c = svcmplt(pg, v, '\041');
155+
156+
c = svorr_z(pg, c, svcmpeq(pg, v, '\177'));
157+
158+
if (svptest_any(pg, c)) {
159+
*found = 1;
160+
c = svbrkb_z(pg, c);
161+
buf += i + svcntp_b8(pg, c);
162+
break;
163+
}
164+
}
165+
166+
return buf;
167+
#elif defined(__ARM_NEON) && defined(__ARM_64BIT_STATE)
168+
*found = 0;
169+
170+
const size_t block_size = sizeof(uint8x16_t) - 1;
171+
const char *const end = (size_t)(buf_end - buf) >= block_size ? buf_end - block_size : buf;
172+
173+
for (; buf < end; buf += sizeof(uint8x16_t)) {
174+
uint8x16_t v = vld1q_u8((const uint8_t *)buf);
175+
176+
v = vorrq_u8(vcltq_u8(v, vmovq_n_u8('\041')), vceqq_u8(v, vmovq_n_u8('\177')));
177+
178+
/* Pack the comparison result into 64 bits. */
179+
const uint8x8_t rv = vshrn_n_u16(vreinterpretq_u16_u8(v), 4);
180+
uint64_t offset = vget_lane_u64(vreinterpret_u64_u8(rv), 0);
181+
182+
if (offset) {
183+
*found = 1;
184+
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "Need the number of leading 0-bits in uint64_t.");
185+
/* offset uses 4 bits per byte of input. */
186+
buf += __builtin_ctzll(offset) / 4;
187+
break;
188+
}
189+
}
190+
191+
return buf;
192+
#else
193+
static const char ALIGNED(16) ranges2[16] = "\000\040\177\177";
194+
195+
return findchar_fast(buf, buf_end, ranges2, 4, found);
196+
#endif
197+
}
198+
134199
static const char *get_token_to_eol(const char *buf, const char *buf_end, const char **token, size_t *token_len, int *ret)
135200
{
136201
const char *token_start = buf;
@@ -143,6 +208,80 @@ static const char *get_token_to_eol(const char *buf, const char *buf_end, const
143208
buf = findchar_fast(buf, buf_end, ranges1, 6, &found);
144209
if (found)
145210
goto FOUND_CTL;
211+
#elif defined(__ARM_FEATURE_SVE)
212+
for (uint64_t i = 0;; i = svqincb(i, 1)) {
213+
const uint64_t len = buf_end - buf;
214+
const svbool_t pg = svwhilelt_b8(i, len);
215+
216+
if (!svptest_first(svptrue_b8(), pg)) {
217+
buf = buf_end;
218+
break;
219+
}
220+
221+
const svuint8_t v = svld1(pg, (const uint8_t *)buf + i);
222+
const uint8_t space = '\040';
223+
svbool_t c = svcmpge(pg, svsub_x(pg, v, space), 0137u);
224+
225+
if (svptest_any(pg, c)) {
226+
c = svcmplt(pg, v, space);
227+
c = svcmpne(c, v, '\011');
228+
c = svorr_z(pg, c, svcmpeq(pg, v, '\177'));
229+
230+
if (svptest_any(pg, c)) {
231+
c = svbrkb_z(pg, c);
232+
buf += i + svcntp_b8(pg, c);
233+
goto FOUND_CTL;
234+
}
235+
}
236+
}
237+
#elif defined(__ARM_NEON) && defined(__ARM_64BIT_STATE)
238+
const size_t block_size = 2 * sizeof(uint8x16_t) - 1;
239+
const char *const end = (size_t)(buf_end - buf) >= block_size ? buf_end - block_size : buf;
240+
241+
for (; buf < end; buf += 2 * sizeof(uint8x16_t)) {
242+
const uint8x16_t space = vmovq_n_u8('\040');
243+
const uint8x16_t threshold = vmovq_n_u8(0137u);
244+
const uint8x16_t v1 = vld1q_u8((const uint8_t *)buf);
245+
const uint8x16_t v2 = vld1q_u8((const uint8_t *)buf + sizeof(v1));
246+
uint8x16_t v3 = vsubq_u8(v1, space);
247+
uint8x16_t v4 = vsubq_u8(v2, space);
248+
249+
v3 = vcgeq_u8(v3, threshold);
250+
v4 = vcgeq_u8(v4, threshold);
251+
v3 = vorrq_u8(v3, v4);
252+
/* Pack the comparison result into half a vector, i.e. 64 bits. */
253+
v3 = vpmaxq_u8(v3, v3);
254+
255+
if (vgetq_lane_u64(vreinterpretq_u64_u8(v3), 0)) {
256+
const uint8x16_t del = vmovq_n_u8('\177');
257+
/* This mask makes it possible to pack the comparison results into half a vector,
258+
* which has the same size as uint64_t. */
259+
const uint8x16_t mask = vreinterpretq_u8_u32(vmovq_n_u32(0x40100401));
260+
const uint8x16_t tab = vmovq_n_u8('\011');
261+
262+
v3 = vcltq_u8(v1, space);
263+
v4 = vcltq_u8(v2, space);
264+
v3 = vbicq_u8(v3, vceqq_u8(v1, tab));
265+
v4 = vbicq_u8(v4, vceqq_u8(v2, tab));
266+
v3 = vorrq_u8(v3, vceqq_u8(v1, del));
267+
v4 = vorrq_u8(v4, vceqq_u8(v2, del));
268+
/* After masking, four consecutive bytes in the results do not have the same bits set. */
269+
v3 = vandq_u8(v3, mask);
270+
v4 = vandq_u8(v4, mask);
271+
/* Pack the comparison results into 128, and then 64 bits. */
272+
v3 = vpaddq_u8(v3, v4);
273+
v3 = vpaddq_u8(v3, v3);
274+
275+
uint64_t offset = vgetq_lane_u64(vreinterpretq_u64_u8(v3), 0);
276+
277+
if (offset) {
278+
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "Need the number of leading 0-bits in uint64_t.");
279+
/* offset uses 2 bits per byte of input. */
280+
buf += __builtin_ctzll(offset) / 2;
281+
goto FOUND_CTL;
282+
}
283+
}
284+
}
146285
#else
147286
/* find non-printable char within the next 8 bytes, this is the hottest code; manually inlined */
148287
while (likely(buf_end - buf >= 8)) {

0 commit comments

Comments
 (0)