diff --git a/libc-top-half/musl/src/string/strlen.c b/libc-top-half/musl/src/string/strlen.c
index 272186801..82ae4a0c3 100644
--- a/libc-top-half/musl/src/string/strlen.c
+++ b/libc-top-half/musl/src/string/strlen.c
@@ -14,17 +14,28 @@
 size_t strlen(const char *s)
 {
 #if defined(__wasm_simd128__) && defined(__wasilibc_simd_string)
-	// strlen must stop as soon as it finds the terminator.
-	// Aligning ensures loads beyond the terminator are safe.
-	// Casting through uintptr_t makes this implementation-defined,
-	// rather than undefined behavior.
+// Skip Clang 19 and Clang 20, which have a bug (llvm/llvm-project#146574)
+// that results in an ICE when inline assembly is used with a vector result.
+#if __clang_major__ != 19 && __clang_major__ != 20
+	// Note that reading before/after the allocation of a pointer is UB in
+	// C, so inline assembly is used to generate the exact machine
+	// instruction we want, with semantics opaque to the compiler, to
+	// avoid the UB.
 	uintptr_t align = (uintptr_t)s % sizeof(v128_t);
-	const v128_t *v = (v128_t *)((uintptr_t)s - align);
+	uintptr_t v = (uintptr_t)s - align;
 
 	for (;;) {
+		v128_t chunk;
+		__asm__ (
+			"local.get %1\n"
+			"v128.load 0\n"
+			"local.set %0\n"
+			: "=r"(chunk)
+			: "r"(v)
+			: "memory");
 		// Bitmask is slow on AArch64, all_true is much faster.
-		if (!wasm_i8x16_all_true(*v)) {
-			const v128_t cmp = wasm_i8x16_eq(*v, (v128_t){});
+		if (!wasm_i8x16_all_true(chunk)) {
+			const v128_t cmp = wasm_i8x16_eq(chunk, (v128_t){});
 			// Clear the bits corresponding to align (little-endian)
 			// so we can count trailing zeros.
 			int mask = wasm_i8x16_bitmask(cmp) >> align << align;
@@ -35,12 +46,13 @@ size_t strlen(const char *s)
 			// it's as if we didn't find anything.
 			if (mask) {
 				// Find the offset of the first one bit (little-endian).
-				return (char *)v - s + __builtin_ctz(mask);
+				return v - (uintptr_t)s + __builtin_ctz(mask);
 			}
 		}
 		align = 0;
-		v++;
+		v += sizeof(v128_t);
 	}
+#endif
 #endif
 
 	const char *a = s;
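
For readers unfamiliar with the trick, here is a minimal standalone sketch of the technique the patch relies on. The helper names v128_load_opaque and nul_offset are illustrative, not part of the patch; it assumes a wasm32 target built with -msimd128 and a Clang version not affected by llvm/llvm-project#146574.

#include <stdint.h>
#include <wasm_simd128.h>

// A 16-byte SIMD load whose semantics are opaque to the compiler.
// Because the load is spelled as literal WebAssembly instructions,
// the optimizer cannot assume it stays inside any single C object,
// so touching the alignment padding around a string is invisible to
// undefined-behavior analysis.
static v128_t v128_load_opaque(uintptr_t addr)
{
	v128_t chunk;
	__asm__ (
		"local.get %1\n"	// push the address operand
		"v128.load 0\n"		// 16-byte load at offset 0
		"local.set %0\n"	// pop the result into the output
		: "=r"(chunk)
		: "r"(addr)
		: "memory");
	return chunk;
}

// Offset of the first NUL byte within the aligned 16-byte chunk at
// addr, or -1 if there is none; mirrors the patch's mask handling.
// align is the offset of the string start inside the chunk: mask
// bits below it come from bytes before the string and must be
// cleared before counting trailing zeros (little-endian bit order).
static int nul_offset(uintptr_t addr, uintptr_t align)
{
	v128_t chunk = v128_load_opaque(addr);
	v128_t cmp = wasm_i8x16_eq(chunk, (v128_t){});
	int mask = wasm_i8x16_bitmask(cmp) >> align << align;
	return mask ? __builtin_ctz(mask) : -1;
}

The patch keeps the wasm_i8x16_all_true check as a fast path: per its comment, bitmask is slow on AArch64 engines while all_true is much faster, so the bitmask is only computed for chunks already known to contain a zero lane.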