diff --git a/libc-top-half/musl/src/string/memcmp.c b/libc-top-half/musl/src/string/memcmp.c
index bdbce9f0..ce313049 100644
--- a/libc-top-half/musl/src/string/memcmp.c
+++ b/libc-top-half/musl/src/string/memcmp.c
@@ -1,7 +1,42 @@
 #include <string.h>
 
+#ifdef __wasm_simd128__
+#include <wasm_simd128.h>
+#endif
+
 int memcmp(const void *vl, const void *vr, size_t n)
 {
+#if defined(__wasm_simd128__) && defined(__wasilibc_simd_string)
+	if (n >= sizeof(v128_t)) {
+		// memcmp is allowed to read up to n bytes from each object.
+		// Find the first different character in the objects.
+		// Unaligned loads handle the case where the objects
+		// have mismatching alignments.
+		const v128_t *v1 = (v128_t *)vl;
+		const v128_t *v2 = (v128_t *)vr;
+		while (n) {
+			const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(v1), wasm_v128_load(v2));
+			// Bitmask is slow on AArch64, all_true is much faster.
+			if (!wasm_i8x16_all_true(cmp)) {
+				// Find the offset of the first zero bit (little-endian).
+				size_t ctz = __builtin_ctz(~wasm_i8x16_bitmask(cmp));
+				const unsigned char *u1 = (unsigned char *)v1 + ctz;
+				const unsigned char *u2 = (unsigned char *)v2 + ctz;
+				// This may help the compiler if the function is inlined.
+				__builtin_assume(*u1 - *u2 != 0);
+				return *u1 - *u2;
+			}
+			// This makes n a multiple of sizeof(v128_t)
+			// for every iteration except the first.
+			size_t align = (n - 1) % sizeof(v128_t) + 1;
+			v1 = (v128_t *)((char *)v1 + align);
+			v2 = (v128_t *)((char *)v2 + align);
+			n -= align;
+		}
+		return 0;
+	}
+#endif
+
 	const unsigned char *l=vl, *r=vr;
 	for (; n && *l == *r; n--, l++, r++);
 	return n ? *l-*r : 0;
diff --git a/test/src/misc/memcmp.c b/test/src/misc/memcmp.c
new file mode 100644
index 00000000..88d580d7
--- /dev/null
+++ b/test/src/misc/memcmp.c
@@ -0,0 +1,50 @@
+//! add-flags.py(LDFLAGS): -Wl,--stack-first -Wl,--initial-memory=327680
+
+#include <__macro_PAGESIZE.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+
+int sign(int val) {
+  return (0 < val) - (val < 0);
+}
+void test(char *ptr1, char *ptr2, size_t length, int want) {
+  int got = memcmp(ptr1, ptr2, length);
+  if (sign(got) != sign(want)) {
+    printf("memcmp(%p, %p, %lu) = %d, want %d\n", ptr1, ptr2, length, got,
+           want);
+  }
+}
+
+int main(void) {
+  char *const LIMIT = (char *)(__builtin_wasm_memory_size(0) * PAGESIZE);
+
+  for (ptrdiff_t length = 0; length < 64; length++) {
+    for (ptrdiff_t alignment = 0; alignment < 24; alignment++) {
+      for (ptrdiff_t pos = -2; pos < length + 2; pos++) {
+        // Create a buffer with the given length, at a pointer with the given
+        // alignment. Using the offset LIMIT - PAGESIZE - 8 means many buffers
+        // will straddle a (Wasm, and likely OS) page boundary.
+        // The second buffer has a fixed address, which means it won't
+        // always share alignment with the first buffer.
+        // Place the difference to find at every position in the buffers,
+        // including just before the buffer and just past its end.
+        char *ptr1 = LIMIT - PAGESIZE - 8 + alignment;
+        char *ptr2 = LIMIT - PAGESIZE / 2;
+        memset(LIMIT - 2 * PAGESIZE, 0, 2 * PAGESIZE);
+        memset(ptr1, 5, length);
+        memset(ptr2, 5, length);
+
+        ptr1[pos] = 7;
+        ptr2[pos] = 3;
+
+        test(ptr1, ptr2, length,
+             0 <= pos && pos < length ? ptr1[pos] - ptr2[pos] : 0);
+        test(ptr2, ptr1, length,
+             0 <= pos && pos < length ? ptr2[pos] - ptr1[pos] : 0);
+      }
+    }
+  }
+
+  return 0;
+}
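
The tail handling in the SIMD path hinges on `align = (n - 1) % sizeof(v128_t) + 1`: the first iteration consumes between 1 and 16 bytes (the remainder of n modulo 16, or 16 if n is already a multiple), and every later iteration consumes exactly 16, so the last 16-byte load ends exactly at offset n. Together with the `n >= sizeof(v128_t)` guard this means loads may overlap but never read past either buffer. The following is a minimal standalone sketch of that arithmetic only, not part of the patch; it models the blocks as plain 16-byte windows and prints which ranges each load would cover:

#include <stddef.h>
#include <stdio.h>

int main(void) {
    // Model the patch's chunking with 16-byte blocks (sizeof(v128_t)):
    // the first step takes the "odd" remainder (1..16 bytes), every
    // later step takes exactly 16.
    for (size_t n = 16; n <= 45; n += 7) {
        size_t remaining = n, offset = 0;
        printf("n=%2zu:", n);
        while (remaining) {
            size_t step = (remaining - 1) % 16 + 1;
            // Each iteration loads 16 bytes starting at `offset`; the final
            // window ends exactly at n, so nothing past the buffer is read.
            printf(" load[%zu,%zu)", offset, offset + 16);
            offset += step;
            remaining -= step;
        }
        printf("\n");
    }
    return 0;
}

For n = 23, for example, this prints load[0,16) and load[7,23): the two windows overlap on bytes [7,16), which is harmless since those bytes are simply compared twice.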