Skip to content

Commit c1b54ea

Browse files
authored
Optional SIMD memrchr (#598)
Continuing #580. `memrchr` is mostly used to implement `strrchr`, so that's what I test. It has the advantage that we know the length and can access the entire buffer, so there is no undefined behavior for this one. It does do unaligned reads, however, so `wasm_v128_load` is used to dereference the pointer. It uses SIMD while there are 16 or more bytes to read, then falls back to scalar. The only other notable feature is using `clz` rather than `ctz`.
1 parent 777cf2c commit c1b54ea

File tree

2 files changed

+78
-0
lines changed

2 files changed

+78
-0
lines changed

libc-top-half/musl/src/string/memrchr.c

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,29 @@
11
#include <string.h>
22

3+
#ifdef __wasm_simd128__
4+
#include <wasm_simd128.h>
5+
#endif
6+
37
void *__memrchr(const void *m, int c, size_t n)
48
{
9+
#if defined(__wasm_simd128__) && defined(__wasilibc_simd_string)
10+
// memrchr is allowed to read up to n bytes from the object.
11+
// Search backward for the last matching character.
12+
const v128_t *v = (v128_t *)((char *)m + n);
13+
const v128_t vc = wasm_i8x16_splat(c);
14+
for (; n >= sizeof(v128_t); n -= sizeof(v128_t)) {
15+
const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(--v), vc);
16+
// Bitmask is slow on AArch64, any_true is much faster.
17+
if (wasm_v128_any_true(cmp)) {
18+
// Find the offset of the last one bit (little-endian).
19+
// The leading 16 bits of the bitmask are always zero,
20+
// and to be ignored.
21+
size_t clz = __builtin_clz(wasm_i8x16_bitmask(cmp)) - 16;
22+
return (char *)(v + 1) - (clz + 1);
23+
}
24+
}
25+
#endif
26+
527
const unsigned char *s = m;
628
c = (unsigned char)c;
729
while (n--) if (s[n]==c) return (void *)(s+n);

test/src/misc/strrchr.c

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
//! add-flags.py(LDFLAGS): -Wl,--stack-first -Wl,--initial-memory=327680
2+
3+
#include <__macro_PAGESIZE.h>
4+
#include <stdio.h>
5+
#include <string.h>
6+
7+
// Check a single strrchr() lookup of the byte value 7.
//
// ptr:  NUL-terminated string to search.
// want: expected result, i.e. a pointer to the last 7 inside the string,
//       or NULL when no match is expected.
//
// Prints one diagnostic line on mismatch and stays silent on success.
void test(char *ptr, char *want) {
  char *got = strrchr(ptr, 7);
  if (got != want) {
    // The %p conversion requires arguments of type void *, so cast explicitly.
    printf("strrchr(%p, 7) = %p, want %p\n", (void *)ptr, (void *)got,
           (void *)want);
  }
}
13+
14+
int main(void) {
15+
char *const LIMIT = (char *)(__builtin_wasm_memory_size(0) * PAGESIZE);
16+
17+
for (ptrdiff_t length = 0; length < 64; length++) {
18+
for (ptrdiff_t alignment = 0; alignment < 24; alignment++) {
19+
for (ptrdiff_t pos = -2; pos < length + 2; pos++) {
20+
// Create a buffer with the given length, at a pointer with the given
21+
// alignment. Using the offset LIMIT - PAGESIZE - 8 means many buffers
22+
// will straddle a (Wasm, and likely OS) page boundary. Place the
23+
// character to find at every position in the buffer, including just
24+
// prior to it and after its end.
25+
char *ptr = LIMIT - PAGESIZE - 8 + alignment;
26+
memset(LIMIT - 2 * PAGESIZE, 0, 2 * PAGESIZE);
27+
memset(ptr, 5, pos > length ? pos : length);
28+
29+
// The last instance of the character is found.
30+
ptr[0] = 7;
31+
ptr[pos] = 7;
32+
ptr[length] = 0;
33+
34+
// The character is found if it's within range.
35+
char *want = NULL;
36+
if (length > 0) want = 0 <= pos && pos < length ? &ptr[pos] : ptr;
37+
test(ptr, want);
38+
}
39+
}
40+
41+
// We need space for the terminator.
42+
if (length <= 1) continue;
43+
44+
// Ensure we never read past the end of memory.
45+
char *ptr = LIMIT - length;
46+
memset(LIMIT - 2 * PAGESIZE, 0, 2 * PAGESIZE);
47+
memset(ptr, 5, length);
48+
49+
ptr[0] = 7;
50+
ptr[length - 2] = 7;
51+
ptr[length - 1] = 0;
52+
test(ptr, &ptr[length - 2]);
53+
}
54+
55+
return 0;
56+
}

0 commit comments

Comments
 (0)