llvm · Sterling-Augustine · Aug 19, 2025 · Aug 4, 2025 · Aug 6, 2025 · Aug 6, 2025
diff --git a/libc/src/string/memory_utils/aarch64/inline_strlen.h b/libc/src/string/memory_utils/aarch64/inline_strlen.h
@@ -0,0 +1,51 @@
+//===-- Strlen implementation for aarch64 ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H
+
+#if defined(__ARM_NEON)
+#include "src/__support/CPP/bit.h" // countr_zero
+
+#include <arm_neon.h>
+#include <stddef.h> // size_t
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace neon {
+[[maybe_unused]] LIBC_INLINE size_t string_length(const char *src) {
+  using Vector __attribute__((may_alias)) = uint8x8_t;
+
+  uintptr_t misalign_bytes = reinterpret_cast<uintptr_t>(src) % sizeof(Vector);
+  Vector *block_ptr = reinterpret_cast<Vector *>(src - misalign_bytes);
+  Vector v = *block_ptr;
+  Vector vcmp = vceqz_u8(v);
+  uint64x1_t cmp_mask = vreinterpret_u64_s8(vcmp);
+  uint64_t cmp = vget_lane_u64(cmp_mask, 0);
+  cmp = cmp >> (misalign_bytes << 3);
+  if (cmp)
+    return cpp::countr_zero(cmp) >> 3;
+
+  while (true) {
+    ++block_ptr;
+    v = *block_ptr;
+    vcmp = vceqz_u8(v);
+    cmp_mask = vreinterpret_u64_s8(vcmp);
+    cmp = vget_lane_u64(cmp_mask, 0);
+    if (cmp)
+      return static_cast<size_t>(reinterpret_cast<uintptr_t>(block_ptr) -
+                                 reinterpret_cast<uintptr_t>(src) +
+                                 (cpp::countr_zero(cmp) >> 3));
+  }
+}
+} // namespace neon
+
+namespace string_length_impl = neon;
+
+} // namespace LIBC_NAMESPACE_DECL
+#endif // __ARM_NEON
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H
diff --git a/libc/src/string/memory_utils/x86_64/inline_strlen.h b/libc/src/string/memory_utils/x86_64/inline_strlen.h
@@ -0,0 +1,102 @@
+//===-- Strlen implementation for x86_64 ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_STRLEN_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_STRLEN_H
+
+#include "src/__support/CPP/bit.h" // countr_zero
+
+#include <immintrin.h>
+#include <stddef.h> // size_t
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace string_length_internal {
+// Return a bit-mask with the nth bit set if the nth-byte in block_ptr is zero.
+template <typename Vector, typename Mask>
+Mask CompareAndMask(const Vector *block_ptr);
+
+template <typename Vector, typename Mask,
+          decltype(CompareAndMask<Vector, Mask>)>
+size_t string_length_vector(const char *src) {
+  uintptr_t misalign_bytes = reinterpret_cast<uintptr_t>(src) % sizeof(Vector);
+
+  const Vector *block_ptr =
+      reinterpret_cast<const Vector *>(src - misalign_bytes);
+  auto cmp = CompareAndMask<Vector, Mask>(block_ptr) >> misalign_bytes;
+  if (cmp)
+    return cpp::countr_zero(cmp);
+
+  while (true) {
+    block_ptr++;
+    cmp = CompareAndMask<Vector, Mask>(block_ptr);
+    if (cmp)
+      return static_cast<size_t>(reinterpret_cast<uintptr_t>(block_ptr) -
+                                 reinterpret_cast<uintptr_t>(src) +
+                                 cpp::countr_zero(cmp));
+  }
+}
+
+template <>
+uint32_t CompareAndMask<__m128i, uint32_t>(const __m128i *block_ptr) {
+  __m128i v = _mm_load_si128(block_ptr);
+  __m128i z = _mm_setzero_si128();
+  __m128i c = _mm_cmpeq_epi8(z, v);
+  return _mm_movemask_epi8(c);
+}
+
+namespace sse2 {
+[[maybe_unused]] LIBC_INLINE size_t string_length(const char *src) {
+  return string_length_vector<__m128i, uint32_t,
+                              CompareAndMask<__m128i, uint32_t>>(src);
+}
+} // namespace sse2
+
+#if defined(__AVX2__)
+template <>
+uint32_t CompareAndMask<__m256i, uint32_t>(const __m256i *block_ptr) {
+  __m256i v = _mm256_load_si256(block_ptr);
+  __m256i z = _mm256_setzero_si256();
+  __m256i c = _mm256_cmpeq_epi8(z, v);
+  return _mm256_movemask_epi8(c);
+}
+
+namespace avx2 {
+[[maybe_unused]] LIBC_INLINE size_t string_length(const char *src) {
+  return string_length_vector<__m256i, uint32_t,
+                              CompareAndMask<__m256i, uint32_t>>(src);
+}
+} // namespace avx2
+#endif
+
+#if defined(__AVX512F__)
+template <>
+__mmask64 CompareAndMask<__m512i, __mmask64>(const __m512i *block_ptr) {
+  __m512i v = _mm512_load_si512(block_ptr);
+  __m512i z = _mm512_setzero_si512();
+  return _mm512_cmp_epu8_mask(z, v, _MM_CMPINT_EQ);
+}
+namespace avx512 {
+[[maybe_unused]] LIBC_INLINE size_t string_length(const char *src) {
+  return string_length_vector<__m512i, __mmask64,
+                              CompareAndMask<__m512i, __mmask64>>(src);
+}
+} // namespace avx512
+#endif
+} // namespace string_length_internal
+
+#if defined(__AVX512F__)
+namespace string_length_impl = string_length_internal::avx512;
+#elif defined(__AVX2__)
+namespace string_length_impl = string_length_internal::avx2;
+#else
+namespace string_length_impl = string_length_internal::sse2;
+#endif
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_STRLEN_H
diff --git a/libc/src/string/string_utils.h b/libc/src/string/string_utils.h
@@ -22,6 +22,16 @@
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
 
+#if defined(LIBC_COPT_STRING_UNSAFE_WIDE_READ)
+#if defined(LIBC_TARGET_ARCH_IS_X86)
+#include "src/string/memory_utils/x86_64/inline_strlen.h"
+#elif defined(LIBC_TARGET_ARCH_IS_AARCH64) && defined(__ARM_NEON)
+#include "src/string/memory_utils/aarch64/inline_strlen.h"
+#else
+namespace string_length_impl = LIBC_NAMESPACE::wide_read;
+#endif
+#endif
+
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
@@ -53,7 +63,7 @@ template <typename Word> LIBC_INLINE constexpr Word repeat_byte(Word byte) {
 // high bit set will no longer have it set, narrowing the list of bytes which
 // result in non-zero values to just the zero byte.
 template <typename Word> LIBC_INLINE constexpr bool has_zeroes(Word block) {
-  constexpr Word LOW_BITS = repeat_byte<Word>(0x01);
+  constexpr unsigned int LOW_BITS = repeat_byte<Word>(0x01);
   constexpr Word HIGH_BITS = repeat_byte<Word>(0x80);
   Word subtracted = block - LOW_BITS;
   Word inverted = ~block;
@@ -81,16 +91,23 @@ LIBC_INLINE size_t string_length_wide_read(const char *src) {
   return static_cast<size_t>(char_ptr - src);
 }
 
-// Returns the length of a string, denoted by the first occurrence
-// of a null terminator.
-template <typename T> LIBC_INLINE size_t string_length(const T *src) {
-#ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ
+namespace wide_read {
+LIBC_INLINE size_t string_length(const char *src) {
   // Unsigned int is the default size for most processors, and on x86-64 it
   // performs better than larger sizes when the src pointer can't be assumed to
   // be aligned to a word boundary, so it's the size we use for reading the
   // string a block at a time.
+  return string_length_wide_read<unsigned int>(src);
+}
+
+} // namespace wide_read
+
+// Returns the length of a string, denoted by the first occurrence
+// of a null terminator.
+template <typename T> LIBC_INLINE size_t string_length(const T *src) {
+#ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ
   if constexpr (cpp::is_same_v<T, char>)
-    return string_length_wide_read<unsigned int>(src);
+    return string_length_impl::string_length(src);
 #endif
   size_t length;
   for (length = 0; *src; ++src, ++length)

@@ -4894,6 +4894,7 @@ libc_support_library(
         "src/string/memory_utils/aarch64/inline_memcpy.h",
         "src/string/memory_utils/aarch64/inline_memmove.h",
         "src/string/memory_utils/aarch64/inline_memset.h",
+        "src/string/memory_utils/aarch64/inline_strlen.h",
         "src/string/memory_utils/arm/common.h",
         "src/string/memory_utils/arm/inline_memcpy.h",
         "src/string/memory_utils/arm/inline_memset.h",
@@ -4918,6 +4919,7 @@ libc_support_library(
         "src/string/memory_utils/x86_64/inline_memcpy.h",
         "src/string/memory_utils/x86_64/inline_memmove.h",
         "src/string/memory_utils/x86_64/inline_memset.h",
+        "src/string/memory_utils/x86_64/inline_strlen.h",
     ],
     deps = [
         ":__support_common",
@@ -4942,6 +4944,7 @@ libc_support_library(
         ":__support_macros_optimization",
         ":hdr_limits_macros",
         ":llvm_libc_types_size_t",
+        ":string_memory_utils",
         ":types_size_t",
     ],
 )