Commit 4ea6fdf

Optional SIMD memchr (#592)
Continuing #580, follow-up to #586. `memchr` was chosen because it's somewhat similar to `strlen`, but also because it is the basis for `strnlen` (and, in that capacity, for `strndup` and `strlcat`) and is also used by `strstr` and `fnmatch`.
1 parent 50ae119 commit 4ea6fdf
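
As a rough illustration of why `memchr` is a good leverage point (this sketch is mine, not part of the commit): `strnlen` is conventionally a thin wrapper over `memchr`, so a faster `memchr` indirectly benefits `strnlen` and, through it, `strndup` and `strlcat`. A minimal sketch in C, using a hypothetical name to avoid shadowing the libc symbol:

#include <string.h>

// Minimal sketch (illustrative, not the wasi-libc source): strnlen expressed
// in terms of memchr. Speeding up memchr therefore also speeds up strnlen.
static size_t sketch_strnlen(const char *s, size_t n)
{
	const char *p = memchr(s, 0, n);   // first NUL within the first n bytes
	return p ? (size_t)(p - s) : n;    // no NUL in range: length is capped at n
}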

3 files changed: +130 additions, -15 deletions

libc-top-half/musl/src/string/memchr.c

Lines changed: 62 additions & 0 deletions
@@ -2,6 +2,10 @@
 #include <stdint.h>
 #include <limits.h>
 
+#ifdef __wasm_simd128__
+#include <wasm_simd128.h>
+#endif
+
 #define SS (sizeof(size_t))
 #define ALIGN (sizeof(size_t)-1)
 #define ONES ((size_t)-1/UCHAR_MAX)
@@ -10,6 +14,64 @@
 
 void *memchr(const void *src, int c, size_t n)
 {
+#if defined(__wasm_simd128__) && defined(__wasilibc_simd_string)
+	// Skip Clang 19 and Clang 20 which have a bug (llvm/llvm-project#146574)
+	// which results in an ICE when inline assembly is used with a vector result.
+#if __clang_major__ != 19 && __clang_major__ != 20
+	// When n is zero, a function that locates a character finds no occurrence.
+	// Otherwise, decrement n to ensure sub_overflow overflows
+	// when n would go equal-to-or-below zero.
+	if (!n--) {
+		return NULL;
+	}
+
+	// Note that reading before/after the allocation of a pointer is UB in
+	// C, so inline assembly is used to generate the exact machine
+	// instruction we want with opaque semantics to the compiler to avoid
+	// the UB.
+	uintptr_t align = (uintptr_t)src % sizeof(v128_t);
+	uintptr_t addr = (uintptr_t)src - align;
+	v128_t vc = wasm_i8x16_splat(c);
+
+	for (;;) {
+		v128_t v;
+		__asm__ (
+			"local.get %1\n"
+			"v128.load 0\n"
+			"local.set %0\n"
+			: "=r"(v)
+			: "r"(addr)
+			: "memory");
+		v128_t cmp = wasm_i8x16_eq(v, vc);
+		// Bitmask is slow on AArch64, any_true is much faster.
+		if (wasm_v128_any_true(cmp)) {
+			// Clear the bits corresponding to align (little-endian)
+			// so we can count trailing zeros.
+			int mask = wasm_i8x16_bitmask(cmp) >> align << align;
+			// At least one bit will be set, unless align cleared them.
+			// Knowing this helps the compiler if it unrolls the loop.
+			__builtin_assume(mask || align);
+			// If the mask became zero because of align,
+			// it's as if we didn't find anything.
+			if (mask) {
+				// Find the offset of the first one bit (little-endian).
+				// That's a match, unless it is beyond the end of the object.
+				// Recall that we decremented n, so less-than-or-equal-to is correct.
+				size_t ctz = __builtin_ctz(mask);
+				return ctz - align <= n ? (char *)src + (addr + ctz - (uintptr_t)src)
+				                        : NULL;
+			}
+		}
+		// Decrement n; if it overflows we're done.
+		if (__builtin_sub_overflow(n, sizeof(v128_t) - align, &n)) {
+			return NULL;
+		}
+		align = 0;
+		addr += sizeof(v128_t);
+	}
+#endif
+#endif
+
 	const unsigned char *s = src;
 	c = (unsigned char)c;
 #ifdef __GNUC__

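To make the new bounds logic easier to follow, here is a small scalar model of the loop's first iteration (an illustrative example with made-up values, not part of the commit): it walks through the `>> align << align` masking and the `ctz - align <= n` check.

#include <stddef.h>
#include <stdio.h>

// Scalar model of the SIMD loop's first-iteration bookkeeping, using
// made-up values. Bit i of `mask` stands for "byte addr+i equals c",
// where addr is src rounded down to a 16-byte boundary.
int main(void)
{
	unsigned align = 3;       // src sits 3 bytes past the 16-byte boundary
	unsigned mask  = 0x0014;  // raw bitmask: matches in lanes 2 and 4
	size_t   n     = 10 - 1;  // memchr(src, c, 10), after the up-front n--

	// Lanes 0..align-1 lie before src, so shift the low bits out and back
	// in to discard them; here lane 2 is dropped and lane 4 survives.
	mask = mask >> align << align;          // 0x0014 -> 0x0010

	if (mask) {
		unsigned ctz = __builtin_ctz(mask); // 4: first surviving match lane
		// The match is at src + (ctz - align) = src + 1; it only counts if
		// that offset is <= n (n was pre-decremented, hence <= rather than <).
		if (ctz - align <= n)
			printf("match at src + %u\n", ctz - align);
	}
	return 0;
}
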
libc-top-half/musl/src/string/strlen.c

Lines changed: 15 additions & 15 deletions
@@ -14,28 +14,28 @@
 size_t strlen(const char *s)
 {
 #if defined(__wasm_simd128__) && defined(__wasilibc_simd_string)
-	// Skip Clang 19 and Clang 20 which have a bug (llvm/llvm-project#146574) which
-	// results in an ICE when inline assembly is used with a vector result.
+	// Skip Clang 19 and Clang 20 which have a bug (llvm/llvm-project#146574)
+	// which results in an ICE when inline assembly is used with a vector result.
 #if __clang_major__ != 19 && __clang_major__ != 20
-	// Note that reading before/after the allocation of a pointer is UB in
-	// C, so inline assembly is used to generate the exact machine
-	// instruction we want with opaque semantics to the compiler to avoid
-	// the UB.
+	// Note that reading before/after the allocation of a pointer is UB in
+	// C, so inline assembly is used to generate the exact machine
+	// instruction we want with opaque semantics to the compiler to avoid
+	// the UB.
 	uintptr_t align = (uintptr_t)s % sizeof(v128_t);
-	uintptr_t v = (uintptr_t)s - align;
+	uintptr_t addr = (uintptr_t)s - align;
 
 	for (;;) {
-		v128_t chunk;
+		v128_t v;
 		__asm__ (
 			"local.get %1\n"
 			"v128.load 0\n"
 			"local.set %0\n"
-			: "=r"(chunk)
-			: "r"(v)
-			: "memory");
+			: "=r"(v)
+			: "r"(addr)
+			: "memory");
 		// Bitmask is slow on AArch64, all_true is much faster.
-		if (!wasm_i8x16_all_true(chunk)) {
-			const v128_t cmp = wasm_i8x16_eq(chunk, (v128_t){});
+		if (!wasm_i8x16_all_true(v)) {
+			const v128_t cmp = wasm_i8x16_eq(v, (v128_t){});
 			// Clear the bits corresponding to align (little-endian)
 			// so we can count trailing zeros.
 			int mask = wasm_i8x16_bitmask(cmp) >> align << align;
@@ -46,11 +46,11 @@ size_t strlen(const char *s)
 			// it's as if we didn't find anything.
 			if (mask) {
 				// Find the offset of the first one bit (little-endian).
-				return v - (uintptr_t)s + __builtin_ctz(mask);
+				return addr - (uintptr_t)s + __builtin_ctz(mask);
 			}
 		}
 		align = 0;
-		v += sizeof(v128_t);
+		addr += sizeof(v128_t);
 	}
 #endif
 #endif

test/src/misc/memchr.c

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
+//! add-flags.py(LDFLAGS): -Wl,--stack-first -Wl,--initial-memory=327680
+
+#include <__macro_PAGESIZE.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+
+void test(char *ptr, size_t length, void *want) {
+  void *got = memchr(ptr, 7, length);
+  if (got != want) {
+    printf("memchr(%p, 7, %lu) = %p, want %p\n", ptr, length, got, want);
+  }
+}
+
+int main(void) {
+  char *const LIMIT = (char *)(__builtin_wasm_memory_size(0) * PAGESIZE);
+
+  for (size_t length = 0; length < 64; length++) {
+    for (size_t alignment = 0; alignment < 24; alignment++) {
+      for (ptrdiff_t pos = -2; pos < length + 2; pos++) {
+        // Create a buffer with the given length, at a pointer with the given
+        // alignment. Using the offset LIMIT - PAGESIZE - 8 means many buffers
+        // will straddle a (Wasm, and likely OS) page boundary. Place the
+        // character to find at every position in the buffer, including just
+        // prior to it and after its end.
+        char *ptr = LIMIT - PAGESIZE - 8 + alignment;
+        memset(LIMIT - 2 * PAGESIZE, 0, 2 * PAGESIZE);
+        memset(ptr, 5, length);
+        ptr[pos] = 7;
+
+        // The first instance of the character is found.
+        if (pos >= 0) ptr[pos + 2] = 7;
+
+        // The character is found if it's within range.
+        test(ptr, length, 0 <= pos && pos < length ? &ptr[pos] : NULL);
+      }
+    }
+
+    // Ensure we never read past the end of memory.
+    char *ptr = LIMIT - length;
+    memset(LIMIT - 2 * PAGESIZE, 0, 2 * PAGESIZE);
+    memset(ptr, 5, length);
+    ptr[length - 1] = 7;
+
+    // Nothing found on an empty buffer.
+    test(ptr, length, length != 0 ? &ptr[length - 1] : NULL);
+
+    // Test for length overflow.
+    if (length > 0) test(ptr, SIZE_MAX, &ptr[length - 1]);
+  }
+
+  return 0;
+}

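One note on the linker flags at the top of the test (the arithmetic below is mine, not spelled out in the commit): Wasm linear memory is sized in 64 KiB pages, so `--initial-memory=327680` requests exactly five pages, making `LIMIT = __builtin_wasm_memory_size(0) * PAGESIZE` a fixed 327680; `--stack-first` places the shadow stack at the bottom of linear memory. A trivial check of that arithmetic:

#include <assert.h>
#include <stddef.h>

// Page-size arithmetic behind --initial-memory=327680 (my own check,
// assuming the standard 64 KiB Wasm page size).
int main(void)
{
	const size_t wasm_page = 65536;     // 64 KiB per Wasm page
	assert(5 * wasm_page == 327680);    // exactly five pages of initial memory
	assert(327680 >= 2 * wasm_page);    // enough room for the test's 2-page memsets
	return 0;
}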