Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions libc-top-half/musl/src/string/memchr.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
#include <stdint.h>
#include <limits.h>

#ifdef __wasm_simd128__
#include <wasm_simd128.h>
#endif

#define SS (sizeof(size_t))
#define ALIGN (sizeof(size_t)-1)
#define ONES ((size_t)-1/UCHAR_MAX)
Expand All @@ -10,6 +14,64 @@

void *memchr(const void *src, int c, size_t n)
{
#if defined(__wasm_simd128__) && defined(__wasilibc_simd_string)
// Skip Clang 19 and Clang 20 which have a bug (llvm/llvm-project#146574)
// which results in an ICE when inline assembly is used with a vector result.
#if __clang_major__ != 19 && __clang_major__ != 20
// When n is zero, a function that locates a character finds no occurrence.
// Otherwise, decrement n to ensure sub_overflow overflows
// when n would go equal-to-or-below zero.
if (!n--) {
return NULL;
}

// Note that reading before/after the allocation of a pointer is UB in
// C, so inline assembly is used to generate the exact machine
// instruction we want with opaque semantics to the compiler to avoid
// the UB.
uintptr_t align = (uintptr_t)src % sizeof(v128_t);
uintptr_t addr = (uintptr_t)src - align;
v128_t vc = wasm_i8x16_splat(c);

for (;;) {
v128_t v;
__asm__ (
"local.get %1\n"
"v128.load 0\n"
"local.set %0\n"
: "=r"(v)
: "r"(addr)
: "memory");
v128_t cmp = wasm_i8x16_eq(v, vc);
// Bitmask is slow on AArch64, any_true is much faster.
if (wasm_v128_any_true(cmp)) {
Comment on lines +46 to +47
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure about this: using the wasm_i8x16_bitmask directly here is a better lowering for x64 than wasm_v128_any_true.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. This is a mitigation for AArch64. We don't have the luxury of knowing the final architecture, and much less the CPU. Or how good the runtime is at (e.g. peephole) optimizing the final generated assembly.

But I'm pretty sure I measured, and at least for large buffers (and wazero) this was a significant improvement on AArch64, for a pretty insignificant cost on 3 different x86-64 CPUs.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried this again with bench.c with wasmtime on my Xeon W-2135, and... it's hard to measure. I'm not saying it's not slower — it may be — but it's close enough that between processors, VMs, and lengths, I'm not sure which is better.

So, where do I put something like bench.c, and how do we settle the matter?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems like this kind of thing could fit in sightglass, even though this is a bit of a micro-benchmark. You could take a look at this example of how the blake3 benchmark was added: benchmark.c. In the Dockerfile that builds a benchmark you could add the special "build wasi-libc with SIMD enabled" logic.

But you don't have to put it there. I think we could probably settle this using the bench.c you provided. I'd probably be comfortable merging this without the special aarch64 optimization now and then submitting that as a second PR once I have a chance to measure a few things. Let me make sure I understand what you're saying precisely: (a) you can't detect a difference using bitmask or any_true with the x64 CPUs you tested but (b) it still makes a very big difference for aarch64?

// Clear the bits corresponding to align (little-endian)
// so we can count trailing zeros.
int mask = wasm_i8x16_bitmask(cmp) >> align << align;
// At least one bit will be set, unless align cleared them.
// Knowing this helps the compiler if it unrolls the loop.
__builtin_assume(mask || align);
// If the mask became zero because of align,
// it's as if we didn't find anything.
if (mask) {
// Find the offset of the first one bit (little-endian).
// That's a match, unless it is beyond the end of the object.
// Recall that we decremented n, so less-than-or-equal-to is correct.
size_t ctz = __builtin_ctz(mask);
return ctz - align <= n ? (char *)src + (addr + ctz - (uintptr_t)src)
: NULL;
}
}
// Decrement n; if it overflows we're done.
if (__builtin_sub_overflow(n, sizeof(v128_t) - align, &n)) {
return NULL;
}
align = 0;
addr += sizeof(v128_t);
}
#endif
#endif

const unsigned char *s = src;
c = (unsigned char)c;
#ifdef __GNUC__
Expand Down
30 changes: 15 additions & 15 deletions libc-top-half/musl/src/string/strlen.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,28 +14,28 @@
size_t strlen(const char *s)
{
#if defined(__wasm_simd128__) && defined(__wasilibc_simd_string)
// Skip Clang 19 and Clang 20 which have a bug (llvm/llvm-project#146574) which
// results in an ICE when inline assembly is used with a vector result.
// Skip Clang 19 and Clang 20 which have a bug (llvm/llvm-project#146574)
// which results in an ICE when inline assembly is used with a vector result.
#if __clang_major__ != 19 && __clang_major__ != 20
// Note that reading before/after the allocation of a pointer is UB in
// C, so inline assembly is used to generate the exact machine
// instruction we want with opaque semantics to the compiler to avoid
// the UB.
// Note that reading before/after the allocation of a pointer is UB in
// C, so inline assembly is used to generate the exact machine
// instruction we want with opaque semantics to the compiler to avoid
// the UB.
uintptr_t align = (uintptr_t)s % sizeof(v128_t);
uintptr_t v = (uintptr_t)s - align;
uintptr_t addr = (uintptr_t)s - align;

for (;;) {
v128_t chunk;
v128_t v;
__asm__ (
"local.get %1\n"
"v128.load 0\n"
"local.set %0\n"
: "=r"(chunk)
: "r"(v)
: "memory");
: "=r"(v)
: "r"(addr)
: "memory");
// Bitmask is slow on AArch64, all_true is much faster.
if (!wasm_i8x16_all_true(chunk)) {
const v128_t cmp = wasm_i8x16_eq(chunk, (v128_t){});
if (!wasm_i8x16_all_true(v)) {
const v128_t cmp = wasm_i8x16_eq(v, (v128_t){});
// Clear the bits corresponding to align (little-endian)
// so we can count trailing zeros.
int mask = wasm_i8x16_bitmask(cmp) >> align << align;
Expand All @@ -46,11 +46,11 @@ size_t strlen(const char *s)
// it's as if we didn't find anything.
if (mask) {
// Find the offset of the first one bit (little-endian).
return v - (uintptr_t)s + __builtin_ctz(mask);
return addr - (uintptr_t)s + __builtin_ctz(mask);
}
}
align = 0;
v += sizeof(v128_t);
addr += sizeof(v128_t);
}
#endif
#endif
Expand Down
53 changes: 53 additions & 0 deletions test/src/misc/memchr.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
//! add-flags.py(LDFLAGS): -Wl,--stack-first -Wl,--initial-memory=327680

#include <__macro_PAGESIZE.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Check that memchr(ptr, 7, length) returns exactly `want`, printing a
// diagnostic on mismatch (the test harness flags any unexpected output).
void test(char *ptr, size_t length, void *want) {
  void *got = memchr(ptr, 7, length);
  if (got != want) {
    // %zu is the correct conversion for size_t (%lu is only right where
    // size_t happens to be unsigned long); %p requires a void * argument.
    printf("memchr(%p, 7, %zu) = %p, want %p\n", (void *)ptr, length, got,
           want);
  }
}

int main(void) {
  // One byte past the last addressable byte of Wasm linear memory.
  char *const LIMIT = (char *)(__builtin_wasm_memory_size(0) * PAGESIZE);

  for (size_t length = 0; length < 64; length++) {
    for (size_t alignment = 0; alignment < 24; alignment++) {
      // Compare pos as signed: `pos < length + 2` would promote pos to
      // size_t, making the condition false for pos == -2 and silently
      // skipping the entire loop.
      for (ptrdiff_t pos = -2; pos < (ptrdiff_t)length + 2; pos++) {
        // Create a buffer with the given length, at a pointer with the given
        // alignment. Using the offset LIMIT - PAGESIZE - 8 means many buffers
        // will straddle a (Wasm, and likely OS) page boundary. Place the
        // character to find at every position in the buffer, including just
        // prior to it and after its end.
        char *ptr = LIMIT - PAGESIZE - 8 + alignment;
        memset(LIMIT - 2 * PAGESIZE, 0, 2 * PAGESIZE);
        memset(ptr, 5, length);
        ptr[pos] = 7;

        // The first instance of the character is found.
        if (pos >= 0) ptr[pos + 2] = 7;

        // The character is found if it's within range (pos >= 0 guards the
        // signed/unsigned comparison, but cast for clarity and safety).
        test(ptr, length,
             0 <= pos && pos < (ptrdiff_t)length ? &ptr[pos] : NULL);
      }
    }

    // Ensure we never read past the end of memory.
    char *ptr = LIMIT - length;
    memset(LIMIT - 2 * PAGESIZE, 0, 2 * PAGESIZE);
    memset(ptr, 5, length);
    // Guard the store: with length == 0, ptr[length - 1] would index with
    // SIZE_MAX (out-of-bounds/undefined behavior).
    if (length > 0) ptr[length - 1] = 7;

    // Nothing found on an empty buffer.
    test(ptr, length, length != 0 ? &ptr[length - 1] : NULL);

    // Test for length overflow: a huge n must still stop the scan at the
    // end of linear memory rather than reading past it.
    if (length > 0) test(ptr, SIZE_MAX, &ptr[length - 1]);
  }

  return 0;
}