diff --git a/libc/src/string/memory_utils/aarch64/inline_strlen.h b/libc/src/string/memory_utils/aarch64/inline_strlen.h new file mode 100644 index 0000000000000..79487f4752b83 --- /dev/null +++ b/libc/src/string/memory_utils/aarch64/inline_strlen.h @@ -0,0 +1,51 @@ +//===-- Strlen implementation for aarch64 ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H + +#if defined(__ARM_NEON) +#include "src/__support/CPP/bit.h" // countr_zero + +#include +#include // size_t + +namespace LIBC_NAMESPACE_DECL { + +namespace neon { +[[maybe_unused]] LIBC_INLINE size_t string_length(const char *src) { + using Vector __attribute__((may_alias)) = uint8x8_t; + + uintptr_t misalign_bytes = reinterpret_cast(src) % sizeof(Vector); + Vector *block_ptr = reinterpret_cast(src - misalign_bytes); + Vector v = *block_ptr; + Vector vcmp = vceqz_u8(v); + uint64x1_t cmp_mask = vreinterpret_u64_s8(vcmp); + uint64_t cmp = vget_lane_u64(cmp_mask, 0); + cmp = cmp >> (misalign_bytes << 3); + if (cmp) + return cpp::countr_zero(cmp) >> 3; + + while (true) { + ++block_ptr; + v = *block_ptr; + vcmp = vceqz_u8(v); + cmp_mask = vreinterpret_u64_s8(vcmp); + cmp = vget_lane_u64(cmp_mask, 0); + if (cmp) + return static_cast(reinterpret_cast(block_ptr) - + reinterpret_cast(src) + + (cpp::countr_zero(cmp) >> 3)); + } +} +} // namespace neon + +namespace string_length_impl = neon; + +} // namespace LIBC_NAMESPACE_DECL +#endif // __ARM_NEON +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H diff --git a/libc/src/string/memory_utils/x86_64/inline_strlen.h b/libc/src/string/memory_utils/x86_64/inline_strlen.h new file mode 100644 index 0000000000000..5eb184cbf8107 --- /dev/null +++ b/libc/src/string/memory_utils/x86_64/inline_strlen.h @@ -0,0 +1,102 @@ +//===-- Strlen implementation for x86_64 ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_STRLEN_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_STRLEN_H + +#include "src/__support/CPP/bit.h" // countr_zero + +#include +#include // size_t + +namespace LIBC_NAMESPACE_DECL { + +namespace string_length_internal { +// Return a bit-mask with the nth bit set if the nth-byte in block_ptr is zero. +template +Mask CompareAndMask(const Vector *block_ptr); + +template )> +size_t string_length_vector(const char *src) { + uintptr_t misalign_bytes = reinterpret_cast(src) % sizeof(Vector); + + const Vector *block_ptr = + reinterpret_cast(src - misalign_bytes); + auto cmp = CompareAndMask(block_ptr) >> misalign_bytes; + if (cmp) + return cpp::countr_zero(cmp); + + while (true) { + block_ptr++; + cmp = CompareAndMask(block_ptr); + if (cmp) + return static_cast(reinterpret_cast(block_ptr) - + reinterpret_cast(src) + + cpp::countr_zero(cmp)); + } +} + +template <> +uint32_t CompareAndMask<__m128i, uint32_t>(const __m128i *block_ptr) { + __m128i v = _mm_load_si128(block_ptr); + __m128i z = _mm_setzero_si128(); + __m128i c = _mm_cmpeq_epi8(z, v); + return _mm_movemask_epi8(c); +} + +namespace sse2 { +[[maybe_unused]] LIBC_INLINE size_t string_length(const char *src) { + return string_length_vector<__m128i, uint32_t, + CompareAndMask<__m128i, uint32_t>>(src); +} +} // namespace sse2 + +#if defined(__AVX2__) +template <> +uint32_t CompareAndMask<__m256i, uint32_t>(const __m256i *block_ptr) { + __m256i v = _mm256_load_si256(block_ptr); + __m256i z = _mm256_setzero_si256(); + __m256i c = _mm256_cmpeq_epi8(z, v); + return _mm256_movemask_epi8(c); +} + +namespace avx2 { +[[maybe_unused]] LIBC_INLINE size_t string_length(const char *src) { + return string_length_vector<__m256i, uint32_t, + CompareAndMask<__m256i, uint32_t>>(src); +} +} // namespace avx2 +#endif + +#if defined(__AVX512F__) +template <> +__mmask64 CompareAndMask<__m512i, __mmask64>(const __m512i *block_ptr) { + __m512i v = _mm512_load_si512(block_ptr); + __m512i z = _mm512_setzero_si512(); + return _mm512_cmp_epu8_mask(z, v, _MM_CMPINT_EQ); +} +namespace avx512 { +[[maybe_unused]] LIBC_INLINE size_t string_length(const char *src) { + return string_length_vector<__m512i, __mmask64, + CompareAndMask<__m512i, __mmask64>>(src); +} +} // namespace avx512 +#endif +} // namespace string_length_internal + +#if defined(__AVX512F__) +namespace string_length_impl = string_length_internal::avx512; +#elif defined(__AVX2__) +namespace string_length_impl = string_length_internal::avx2; +#else +namespace string_length_impl = string_length_internal::sse2; +#endif + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_STRLEN_H diff --git a/libc/src/string/string_utils.h b/libc/src/string/string_utils.h index 80e5783c7890b..cc99633aa49d8 100644 --- a/libc/src/string/string_utils.h +++ b/libc/src/string/string_utils.h @@ -22,6 +22,16 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#if defined(LIBC_COPT_STRING_UNSAFE_WIDE_READ) +#if defined(LIBC_TARGET_ARCH_IS_X86) +#include "src/string/memory_utils/x86_64/inline_strlen.h" +#elif defined(LIBC_TARGET_ARCH_IS_AARCH64) && defined(__ARM_NEON) +#include "src/string/memory_utils/aarch64/inline_strlen.h" +#else +namespace string_length_impl = LIBC_NAMESPACE::wide_read; +#endif +#endif + namespace LIBC_NAMESPACE_DECL { namespace internal { @@ -53,7 +63,7 @@ template LIBC_INLINE constexpr Word repeat_byte(Word byte) { // high bit set will no longer have it set, narrowing the list of bytes which // result in non-zero values to just the zero byte. template LIBC_INLINE constexpr bool has_zeroes(Word block) { - constexpr Word LOW_BITS = repeat_byte(0x01); + constexpr unsigned int LOW_BITS = repeat_byte(0x01); constexpr Word HIGH_BITS = repeat_byte(0x80); Word subtracted = block - LOW_BITS; Word inverted = ~block; @@ -81,16 +91,23 @@ LIBC_INLINE size_t string_length_wide_read(const char *src) { return static_cast(char_ptr - src); } -// Returns the length of a string, denoted by the first occurrence -// of a null terminator. -template LIBC_INLINE size_t string_length(const T *src) { -#ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ +namespace wide_read { +LIBC_INLINE size_t string_length(const char *src) { // Unsigned int is the default size for most processors, and on x86-64 it // performs better than larger sizes when the src pointer can't be assumed to // be aligned to a word boundary, so it's the size we use for reading the // string a block at a time. + return string_length_wide_read(src); +} + +} // namespace wide_read + +// Returns the length of a string, denoted by the first occurrence +// of a null terminator. +template LIBC_INLINE size_t string_length(const T *src) { +#ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ if constexpr (cpp::is_same_v) - return string_length_wide_read(src); + return string_length_impl::string_length(src); #endif size_t length; for (length = 0; *src; ++src, ++length) diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 38f7e3bcc8e27..ff14d81ccfb6d 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -4894,6 +4894,7 @@ libc_support_library( "src/string/memory_utils/aarch64/inline_memcpy.h", "src/string/memory_utils/aarch64/inline_memmove.h", "src/string/memory_utils/aarch64/inline_memset.h", + "src/string/memory_utils/aarch64/inline_strlen.h", "src/string/memory_utils/arm/common.h", "src/string/memory_utils/arm/inline_memcpy.h", "src/string/memory_utils/arm/inline_memset.h", @@ -4918,6 +4919,7 @@ libc_support_library( "src/string/memory_utils/x86_64/inline_memcpy.h", "src/string/memory_utils/x86_64/inline_memmove.h", "src/string/memory_utils/x86_64/inline_memset.h", + "src/string/memory_utils/x86_64/inline_strlen.h", ], deps = [ ":__support_common", @@ -4942,6 +4944,7 @@ libc_support_library( ":__support_macros_optimization", ":hdr_limits_macros", ":llvm_libc_types_size_t", + ":string_memory_utils", ":types_size_t", ], )