From 26464d64ce053feaff8329860c9b72435932b474 Mon Sep 17 00:00:00 2001
From: Alex Crichton
Date: Wed, 2 Jul 2025 10:09:40 -0700
Subject: [PATCH 1/2] Use inline assembly in strlen for vector loads

This commit is a refinement of #586 to use inline assembly to perform
vector loads instead of using a C-defined load. This is done to avoid
UB in LLVM where C cannot read either before or after an allocation.
When `strlen` is not inlined, as it currently isn't, there's no
realistic way a compiler could prove that a load was out-of-bounds, so
this issue is unlikely to bite in practice, but it nevertheless is
still UB.

The eventual goal is to move these SIMD routines into header files to
avoid needing multiple builds of libc itself. In that situation
inlining is indeed possible and a compiler could see the UB much more
easily, which could cause problems.

Inline assembly unfortunately doesn't work with vector output
parameters on Clang 19 and Clang 20 due to an ICE. This was fixed in
https://github.com/llvm/llvm-project/pull/146574 for Clang 21, but it
means that the SIMD routines are now excluded with Clang 19 and Clang
20 to avoid compilation errors there.
---
 libc-top-half/musl/src/string/strlen.c | 29 ++++++++++++++++++--------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/libc-top-half/musl/src/string/strlen.c b/libc-top-half/musl/src/string/strlen.c
index 272186801..2dd2972f2 100644
--- a/libc-top-half/musl/src/string/strlen.c
+++ b/libc-top-half/musl/src/string/strlen.c
@@ -14,17 +14,27 @@ size_t strlen(const char *s)
 {
 #if defined(__wasm_simd128__) && defined(__wasilibc_simd_string)
-	// strlen must stop as soon as it finds the terminator.
-	// Aligning ensures loads beyond the terminator are safe.
-	// Casting through uintptr_t makes this implementation-defined,
-	// rather than undefined behavior.
+// Skip Clang 19 and Clang 20 which have a bug (llvm/llvm-project#146574) which
+// results in an ICE when inline assembly is used with a vector result.
+#if __clang_major__ != 19 && __clang_major__ != 20
+	// Note that reading before/after the allocation of a pointer is UB in
+	// C, so inline assembly is used to generate the exact machine
+	// instruction we want with opaque semantics to the compiler to avoid
+	// the UB.
 	uintptr_t align = (uintptr_t)s % sizeof(v128_t);
-	const v128_t *v = (v128_t *)((uintptr_t)s - align);
+	uintptr_t v = (uintptr_t)s - align;
 
 	for (;;) {
+		v128_t chunk;
+		__asm__ (
+			"local.get %1\n"
+			"v128.load 0\n"
+			"local.set %0\n"
+			: "=r"(chunk)
+			: "r"(v));
 		// Bitmask is slow on AArch64, all_true is much faster.
-		if (!wasm_i8x16_all_true(*v)) {
-			const v128_t cmp = wasm_i8x16_eq(*v, (v128_t){});
+		if (!wasm_i8x16_all_true(chunk)) {
+			const v128_t cmp = wasm_i8x16_eq(chunk, (v128_t){});
 			// Clear the bits corresponding to align (little-endian)
 			// so we can count trailing zeros.
 			int mask = wasm_i8x16_bitmask(cmp) >> align << align;
@@ -35,12 +45,13 @@ size_t strlen(const char *s)
 			// it's as if we didn't find anything.
 			if (mask) {
 				// Find the offset of the first one bit (little-endian).
-				return (char *)v - s + __builtin_ctz(mask);
+				return v - (uintptr_t)s + __builtin_ctz(mask);
 			}
 		}
 		align = 0;
-		v++;
+		v += sizeof(v128_t);
 	}
+#endif
 #endif
 
 	const char *a = s;

From 8276fce689db4d4bf12d94f9d0d7d51e5350b2bf Mon Sep 17 00:00:00 2001
From: Alex Crichton
Date: Wed, 2 Jul 2025 15:57:33 -0700
Subject: [PATCH 2/2] Add a memory clobber

---
 libc-top-half/musl/src/string/strlen.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libc-top-half/musl/src/string/strlen.c b/libc-top-half/musl/src/string/strlen.c
index 2dd2972f2..82ae4a0c3 100644
--- a/libc-top-half/musl/src/string/strlen.c
+++ b/libc-top-half/musl/src/string/strlen.c
@@ -31,7 +31,8 @@ size_t strlen(const char *s)
 			"v128.load 0\n"
 			"local.set %0\n"
 			: "=r"(chunk)
-			: "r"(v));
+			: "r"(v)
+			: "memory");
 		// Bitmask is slow on AArch64, all_true is much faster.
 		if (!wasm_i8x16_all_true(chunk)) {
 			const v128_t cmp = wasm_i8x16_eq(chunk, (v128_t){});
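
Note (illustrative, not part of the patches): the combined effect of the
two commits is that each 16-byte chunk is read through an opaque inline
assembly load carrying a "memory" clobber, rather than through a pointer
dereference the optimizer can reason about. Below is a minimal sketch of
that load in isolation; the helper name wasm_opaque_v128_load is
hypothetical and exists only for illustration, while the real patch
inlines the assembly directly into strlen's loop.

#include <stdint.h>
#include <wasm_simd128.h>

// Hypothetical helper (illustration only): load 16 bytes from a
// 16-byte-aligned address. The inline assembly keeps the load opaque to
// the compiler, so reads that fall outside the original C object cannot
// be treated as UB, and the "memory" clobber keeps the asm ordered with
// respect to surrounding memory accesses.
static inline v128_t wasm_opaque_v128_load(uintptr_t addr)
{
	v128_t chunk;
	__asm__(
		"local.get %1\n"
		"v128.load 0\n"
		"local.set %0\n"
		: "=r"(chunk)
		: "r"(addr)
		: "memory");
	return chunk;
}

In strlen the address starts at s rounded down to a 16-byte boundary, so
the first chunk may contain bytes before the string; those are masked off
by the existing align logic before counting trailing zeros.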