Address some comments

Sterling-Augustine · Sterling-Augustine · commit 863ad749e26b · 2025-08-12T11:09:32.000-07:00
diff --git a/libc/src/string/inline_strlen.h b/libc/src/string/inline_strlen.h
diff --git a/libc/src/string/memory_utils/aarch64/inline_strlen.h b/libc/src/string/memory_utils/aarch64/inline_strlen.h
@@ -8,43 +8,44 @@
 #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H
 #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H
 
+#if defined(__ARM_NEON)
+#include "src/__support/CPP/bit.h" // countr_zero
+
 #include <arm_neon.h>
 #include <stddef.h> // size_t
 
 namespace LIBC_NAMESPACE_DECL {
-
-size_t string_length_neon(const char *src) {
+// Returns strlen(src); loads whole aligned 8-byte blocks, so it may overread.
+[[maybe_unused]] LIBC_INLINE size_t string_length_neon(const char *src) {
   using Vector __attribute__((may_alias)) = uint8x8_t;
-  uintptr_t misalign_bytes = reinterpret_case<uintptr_t>(src) % sizeof(Vector);
-  Vector *block_ptr = reinterpret_cast<Vector *>(src - misalign_bytes);
-  if (misalign_bytes) {
-    Vector v = *block_ptr;
-    Vector vcmp = vceqz_u8(v);
-    uint64x1_t cmp_mask = vreinterpret_u64_s8(vcmp);
-    uint64_t cmp = vget_lane_u64(cmp_mask, 0);
-    cmp = cmp >> (misalign_bytes << 3);
-    if (cmp)
-      return __builtin_ctzl(cmp) >> 3;
-    ++block_ptr;
-  }
+  uintptr_t misalign_bytes = reinterpret_cast<uintptr_t>(src) % sizeof(Vector);
+  const Vector *block_ptr =
+      reinterpret_cast<const Vector *>(src - misalign_bytes);
+  Vector v = *block_ptr;
+  Vector vcmp = vceqz_u8(v);
+  uint64x1_t cmp_mask = vreinterpret_u64_u8(vcmp);
+  uint64_t cmp = vget_lane_u64(cmp_mask, 0);
+  cmp = cmp >> (misalign_bytes << 3);
+  if (cmp)
+    return cpp::countr_zero(cmp) >> 3;
   while (true) {
-    Vector v = *block_ptr;
-    Vector vcmp = vceqz_u8(v);
-    uint64x1_t cmp_mask = vreinterpret_u64_s8(vcmp);
-    uint64_t cmp = vget_lane_u64(cmp_mask, 0);
+    ++block_ptr;
+    v = *block_ptr;
+    vcmp = vceqz_u8(v);
+    cmp_mask = vreinterpret_u64_u8(vcmp);
+    cmp = vget_lane_u64(cmp_mask, 0);
     if (cmp)
-      return static_cast<size_t>(reinterpret_case<uintptr_t>(block_ptr) -
-                                 reinterpret_case<uintptr_t>(src) +
-                                 (__builtin_ctzl(cmp) >> 3));
-    block_ptr++;
+      return static_cast<size_t>(reinterpret_cast<uintptr_t>(block_ptr) -
+                                 reinterpret_cast<uintptr_t>(src) +
+                                 (cpp::countr_zero(cmp) >> 3));
   }
 }
 
 template <typename T>
-[[maybe_unused]] LIBC_INLINE void string_length_aarch64(const char *src) {
-  return inline_string_length_neon(src);
+[[maybe_unused]] LIBC_INLINE size_t string_length_aarch64(const char *src) {
+  return string_length_neon(src);
 }
-
 } // namespace LIBC_NAMESPACE_DECL
 
+#endif // __ARM_NEON
 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H
diff --git a/libc/src/string/memory_utils/x86_64/inline_strlen.h b/libc/src/string/memory_utils/x86_64/inline_strlen.h
@@ -8,105 +8,102 @@
 #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_STRLEN_H
 #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_STRLEN_H
 
+#include "src/__support/CPP/bit.h"          // countr_zero
 #include "src/string/memory_utils/op_x86.h" // K_AVX
 
 #include <stddef.h> // size_t
-#include <x86intrin.h>
+
 namespace LIBC_NAMESPACE_DECL {
 
-#if defined(__SSE2__)
 [[maybe_unused]] LIBC_INLINE size_t string_length_sse2(const char *src) {
   using Vector __attribute__((may_alias)) = __m128i;
+  // Begin at the aligned block containing src; it may start before src.
   Vector z = _mm_setzero_si128();
   uintptr_t misalign_bytes = reinterpret_cast<uintptr_t>(src) % sizeof(Vector);
   const Vector *block_ptr =
       reinterpret_cast<const Vector *>(src - misalign_bytes);
-  if (misalign_bytes) {
-    Vector v = _mm_load_si128(block_ptr);
-    Vector vcmp = _mm_cmpeq_epi8(z, v);
-    // shift away results in irrelevant bytes.
-    int cmp = _mm_movemask_epi8(vcmp) >> misalign_bytes;
-    if (cmp)
-      return __builtin_ctz(cmp);
-    block_ptr++;
-  }
+  Vector v = _mm_load_si128(block_ptr);
+  Vector vcmp = _mm_cmpeq_epi8(z, v);
+  // Shift out the mask bits of the bytes that precede src in this block.
+  uint32_t cmp = _mm_movemask_epi8(vcmp) >> misalign_bytes;
+  if (cmp)
+    return cpp::countr_zero(cmp);
+  // No NUL at or after src in the first block; scan each following block.
   while (true) {
-    Vector v = _mm_load_si128(block_ptr);
-    Vector vcmp = _mm_cmpeq_epi8(z, v);
-    int cmp = _mm_movemask_epi8(vcmp);
+    block_ptr++;
+    v = _mm_load_si128(block_ptr);
+    vcmp = _mm_cmpeq_epi8(z, v);
+    cmp = _mm_movemask_epi8(vcmp);
     if (cmp)
       return static_cast<size_t>(reinterpret_cast<uintptr_t>(block_ptr) -
                                  reinterpret_cast<uintptr_t>(src) +
-                                 __builtin_ctz(cmp));
-    block_ptr++;
+                                 cpp::countr_zero(cmp));
   }
 }
-#endif
 
 #if defined(__AVX2__)
 [[maybe_unused]] LIBC_INLINE size_t string_length_avx2(const char *src) {
-  using Vector __attribute__((may_alias)) = __mm256i;
+  using Vector __attribute__((may_alias)) = __m256i;
+
   Vector z = _mm256_setzero_si256();
   uintptr_t misalign_bytes = reinterpret_cast<uintptr_t>(src) % sizeof(Vector);
   const Vector *block_ptr =
       reinterpret_cast<const Vector *>(src - misalign_bytes);
-  if (misalign_bytes) {
-    Vector v = _mm256_load_si256(block_ptr);
-    Vector vcmp = _mm256_cmpeq_epi8(z, v);
-    // shift away results in irrelevant bytes.
-    int cmp = _mm256_movemask_epi8(vcmp) >> misalign_bytes;
-    if (cmp)
-      return __builtin_ctz(cmp);
-    block_ptr++;
-  }
+  Vector v = _mm256_load_si256(block_ptr);
+  Vector vcmp = _mm256_cmpeq_epi8(z, v);
+  // Shift out bytes before src; done unsigned so bit 31 is not sign-extended.
+  uint32_t cmp = static_cast<uint32_t>(_mm256_movemask_epi8(vcmp));
+  cmp >>= misalign_bytes;
+  if (cmp)
+    return cpp::countr_zero(cmp);
   while (true) {
-    Vector v = _mm256_load_si256(block_ptr);
-    Vector vcmp = _mm256_cmpeq_epi8(z, v);
-    int cmp = _mm256_movemask_epi8(vcmp);
+    block_ptr++;
+    v = _mm256_load_si256(block_ptr);
+    vcmp = _mm256_cmpeq_epi8(z, v);
+    cmp = static_cast<uint32_t>(_mm256_movemask_epi8(vcmp));
     if (cmp)
       return static_cast<size_t>(reinterpret_cast<uintptr_t>(block_ptr) -
                                  reinterpret_cast<uintptr_t>(src) +
-                                 __builtin_ctz(cmp));
-    block_ptr++;
+                                 cpp::countr_zero(cmp));
   }
 }
-#endif // __AVX__
+#endif // __AVX2__
 
 #if defined(__AVX512F__)
 [[maybe_unused]] LIBC_INLINE size_t string_length_avx512(const char *src) {
   using Vector __attribute__((may_alias)) = __mm512i;
+
   Vector z = _mm512_setzero_si512();
   uintptr_t misalign_bytes = reinterpret_cast<uintptr_t>(src) % sizeof(Vector);
   const Vector *block_ptr =
       reinterpret_cast<const Vector *>(src - misalign_bytes);
-  if (misalign_bytes) {
-    Vector v = _mm512_load_si512(block_ptr);
-    __mmask64 cmp = _mm512_cmp_epu8_mask(z, v, _MM_CMPINT_EQ) >> misalign_bytes;
-    if (cmp)
-      return __builtin_ctzl(cmp);
-    block_ptr++;
-  }
+  Vector v = _mm512_load_si512(block_ptr);
+  __mmask64 cmp = _mm512_cmp_epu8_mask(z, v, _MM_CMPINT_EQ) >> misalign_bytes;
+  if (cmp)
+    return cpp::countr_zero(cmp);
+
   while (true) {
+    block_ptr++;
     Vector v = _mm512_load_si512(block_ptr);
     __mmask64 cmp = _mm512_cmp_epu8_mask(z, v, _MM_CMPINT_EQ);
     if (cmp)
       return static_cast<size_t>(reinterpret_cast<uintptr_t>(block_ptr) -
                                  reinterpret_cast<uintptr_t>(src) +
-                                 __builtin_ctz(cmp));
-    block_ptr++;
+                                 cpp::countr_zero(cmp));
   }
 }
 #endif // __AVX512F__
 
+namespace x86 {
 template <typename T> LIBC_INLINE size_t string_length_x86_64(const char *src) {
 #if defined(__AVX512F__)
   return string_length_avx512(src);
-#endif
-#if defined(__AVX__)
+#elif defined(__AVX2__)
   return string_length_avx2(src);
 #endif
   return string_length_sse2(src);
 }
+} // namespace x86
 
 } // namespace LIBC_NAMESPACE_DECL
 
diff --git a/libc/src/string/string_utils.h b/libc/src/string/string_utils.h
@@ -25,12 +25,12 @@
 #if defined(LIBC_COPT_STRING_UNSAFE_WIDE_READ)
 #if defined(LIBC_TARGET_ARCH_IS_X86)
 #include "src/string/memory_utils/x86_64/inline_strlen.h"
-#define LIBC_SRC_STRING_MEMORY_UTILS_STRLEN_WIDE_READ string_length_x86_64
-#elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
+namespace wide_read_impl = x86;
+#elif defined(LIBC_TARGET_ARCH_IS_AARCH64) && defined(__ARM_NEON)
 #include "src/string/memory_utils/aarch64/inline_strlen.h"
-#define LIBC_SRC_STRING_MEMORY_UTILS_STRLEN_WIDE_READ string_length_aarch64
+namespace wide_read_impl = aarch64;
 #else
-#define LIBC_SRC_STRING_MEMORY_UTILS_STRLEN_WIDE_READ string_length_wide_read
+namespace wide_read_impl = default_wide_read;
 #endif
 #endif
 
@@ -65,13 +65,14 @@ template <typename Word> LIBC_INLINE constexpr Word repeat_byte(Word byte) {
 // high bit set will no longer have it set, narrowing the list of bytes which
 // result in non-zero values to just the zero byte.
 template <typename Word> LIBC_INLINE constexpr bool has_zeroes(Word block) {
   constexpr Word LOW_BITS = repeat_byte<Word>(0x01);
   constexpr Word HIGH_BITS = repeat_byte<Word>(0x80);
   Word subtracted = block - LOW_BITS;
   Word inverted = ~block;
   return (subtracted & inverted & HIGH_BITS) != 0;
 }
 
+namespace default_wide_read {
 template <typename Word>
 LIBC_INLINE size_t string_length_wide_read(const char *src) {
   const char *char_ptr = src;
@@ -92,6 +93,7 @@ LIBC_INLINE size_t string_length_wide_read(const char *src) {
   }
   return static_cast<size_t>(char_ptr - src);
 }
+} // namespace default_wide_read
 
 // Returns the length of a string, denoted by the first occurrence
 // of a null terminator.
@@ -102,7 +104,7 @@ template <typename T> LIBC_INLINE size_t string_length(const T *src) {
   // be aligned to a word boundary, so it's the size we use for reading the
   // string a block at a time.
   if constexpr (cpp::is_same_v<T, char>)
-    return LIBC_SRC_STRING_MEMORY_UTILS_STRLEN_WIDE_READ<unsigned int>(src);
+    return wide_read_impl::string_length(src);
 #endif
   size_t length;
   for (length = 0; *src; ++src, ++length)