Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions libc/src/__support/CPP/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -210,3 +210,9 @@ add_object_library(
libc.src.__support.common
libc.src.__support.macros.properties.os
)

# Header-only portable SIMD helpers (simd.h). The dependency list mirrors the
# project headers that simd.h includes, so that users of this target pick them
# up transitively.
add_header_library(
  simd
  HDRS
    simd.h
  DEPENDS
    libc.hdr.stdint_proxy
    libc.src.__support.CPP.algorithm
    libc.src.__support.CPP.bit
    libc.src.__support.CPP.type_traits
    libc.src.__support.macros.attributes
    libc.src.__support.macros.config
)
6 changes: 6 additions & 0 deletions libc/src/__support/CPP/algorithm.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@
namespace LIBC_NAMESPACE_DECL {
namespace cpp {

// Empty tag types that name a binary operation. They define no members and
// no call operator; a caller passes one by type to pick an operation.
// Tag for addition.
template <typename T = void> struct plus {};
// Tag for multiplication.
template <typename T = void> struct multiplies {};
// Tag for bitwise AND.
template <typename T = void> struct bit_and {};
// Tag for bitwise OR.
template <typename T = void> struct bit_or {};
// Tag for bitwise XOR.
template <typename T = void> struct bit_xor {};

// Returns a reference to the larger of `a` and `b` as ordered by operator<.
// When neither compares less than the other, `a` is returned.
template <class T> LIBC_INLINE constexpr const T &max(const T &a, const T &b) {
  if (a < b)
    return b;
  return a;
}
Expand Down
227 changes: 227 additions & 0 deletions libc/src/__support/CPP/simd.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
//===-- Portable SIMD library similar to stdx::simd -------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides a generic interface into fixed-size SIMD instructions
// using the clang vector type. The API shares some similarities with the
// stdx::simd proposal, but instead chooses to use vectors as primitive types
// with several extra helper functions.
//
//===----------------------------------------------------------------------===//

#include "hdr/stdint_proxy.h"
#include "src/__support/CPP/algorithm.h"
#include "src/__support/CPP/bit.h"
#include "src/__support/CPP/type_traits/integral_constant.h"
#include "src/__support/macros/attributes.h"
#include "src/__support/macros/config.h"

#include <stddef.h>

#ifndef LLVM_LIBC_SRC___SUPPORT_CPP_SIMD_H
#define LLVM_LIBC_SRC___SUPPORT_CPP_SIMD_H

#if LIBC_HAS_VECTOR_TYPE

namespace LIBC_NAMESPACE_DECL {
namespace cpp {

namespace internal {

// Unsigned bit-precise integer type whose width in bits equals the storage
// size of T (sizeof(T) * CHAR_BIT).
template <typename T>
using get_as_integer_type_t = unsigned _BitInt(sizeof(T) * CHAR_BIT);

// Number of elements of type T in the vector width selected for the target:
// 64 bytes when AVX512F is available, 32 bytes for AVX2, and 16 bytes for
// SSE2 or ARM NEON. When none of these feature macros is defined the value
// is 1.
#if defined(LIBC_TARGET_CPU_HAS_AVX512F)
template <typename T>
inline constexpr size_t native_vector_size = 64 / sizeof(T);
#elif defined(LIBC_TARGET_CPU_HAS_AVX2)
template <typename T>
inline constexpr size_t native_vector_size = 32 / sizeof(T);
#elif defined(LIBC_TARGET_CPU_HAS_SSE2) || defined(LIBC_TARGET_CPU_HAS_ARM_NEON)
template <typename T>
inline constexpr size_t native_vector_size = 16 / sizeof(T);
#else
template <typename T> inline constexpr size_t native_vector_size = 1;
#endif

// Produces a value of type T obtained from
// __builtin_nondeterministic_value applied to a value-initialized T.
template <typename T> LIBC_INLINE constexpr T poison() {
  T unspecified = __builtin_nondeterministic_value(T());
  return unspecified;
}
} // namespace internal

// Type aliases.
// Clang extended vector holding exactly N elements of type T.
template <typename T, size_t N>
using fixed_size_simd = T [[clang::ext_vector_type(N)]];
// Same vector type; N defaults to internal::native_vector_size<T>.
template <typename T, size_t N = internal::native_vector_size<T>>
using simd = T [[clang::ext_vector_type(N)]];
// Boolean vector with internal::native_vector_size<T> elements, i.e. one
// element per element of the default simd<T>.
template <typename T>
using simd_mask = simd<bool, internal::native_vector_size<T>>;

// Type trait helpers.
// Element count of the vector type T, as reported by
// __builtin_vectorelements, exposed as an integral_constant.
template <typename T>
struct simd_size : cpp::integral_constant<size_t, __builtin_vectorelements(T)> {
};
// Variable-template shorthand for simd_size<T>::value.
template <class T> constexpr size_t simd_size_v = simd_size<T>::value;

// is_simd<T>::value is true when T is simd<U, N> for some element type U and
// element count N, and false for every other type.
template <typename T> struct is_simd : cpp::integral_constant<bool, false> {};
template <typename T, unsigned N>
struct is_simd<simd<T, N>> : cpp::integral_constant<bool, true> {};
// Variable-template shorthand for is_simd<T>::value.
template <class T> constexpr bool is_simd_v = is_simd<T>::value;

// is_simd_mask<T>::value is true only when T is a boolean vector
// simd<bool, N>.
template <typename T>
struct is_simd_mask : cpp::integral_constant<bool, false> {};
template <unsigned N>
struct is_simd_mask<simd<bool, N>> : cpp::integral_constant<bool, true> {};
// Variable-template shorthand for is_simd_mask<T>::value.
template <class T> constexpr bool is_simd_mask_v = is_simd_mask<T>::value;

// Maps a vector type simd<T, N> to its element type T. The primary template
// is declared but not defined, so only simd types have a member `type`.
template <typename T> struct simd_element_type;
template <typename T, size_t N> struct simd_element_type<simd<T, N>> {
using type = T;
};
// Shorthand for typename simd_element_type<T>::type.
template <typename T>
using simd_element_type_t = typename simd_element_type<T>::type;

// Names T itself when is_simd_v<T> holds, through cpp::enable_if_t; used as a
// return type to constrain a function template to simd vector types.
template <typename T>
using enable_if_simd_t = cpp::enable_if_t<is_simd_v<T>, T>;

// Casting.
// Converts every element of `v` from `From` to `To` with
// __builtin_convertvector, keeping the element count N.
template <typename To, typename From, size_t N>
LIBC_INLINE constexpr simd<To, N> simd_cast(simd<From, N> v) {
  using result_type = simd<To, N>;
  return __builtin_convertvector(v, result_type);
}

// SIMD mask operations.
// True when every element of the mask is set (AND-reduction of the mask).
template <size_t N> LIBC_INLINE constexpr bool all_of(simd<bool, N> m) {
  const bool every_lane = __builtin_reduce_and(m);
  return every_lane;
}
// True when at least one element of the mask is set (OR-reduction).
template <size_t N> LIBC_INLINE constexpr bool any_of(simd<bool, N> m) {
  const bool some_lane = __builtin_reduce_or(m);
  return some_lane;
}
// True when no element of the mask is set.
template <size_t N> LIBC_INLINE constexpr bool none_of(simd<bool, N> m) {
  return !__builtin_reduce_or(m);
}
// True when at least one, but not every, element of the mask is set.
template <size_t N> LIBC_INLINE constexpr bool some_of(simd<bool, N> m) {
  if (!__builtin_reduce_or(m))
    return false;
  return !__builtin_reduce_and(m);
}
// Returns the result of __builtin_popcountg on the mask, i.e. the number of
// set elements.
template <size_t N> LIBC_INLINE constexpr int popcount(simd<bool, N> m) {
return __builtin_popcountg(m);
}
// Returns the result of __builtin_ctzg on the mask: the count of unset
// elements below the lowest set one, which is that element's zero-based
// index. The builtin is called without a fallback argument, so an all-false
// mask is not handled here.
template <size_t N> LIBC_INLINE constexpr int find_first_set(simd<bool, N> m) {
return __builtin_ctzg(m);
}
// Returns the zero-based index of the highest set element of the mask, the
// counterpart of find_first_set (which returns the zero-based index of the
// lowest set element). __builtin_clzg is called without a fallback argument,
// so the mask must have at least one element set.
template <size_t N> LIBC_INLINE constexpr int find_last_set(simd<bool, N> m) {
  constexpr int size = static_cast<int>(simd_size_v<simd<bool, N>>);
  // The builtin counts the unset elements above the highest set one, so that
  // element sits at index size - 1 - count. The previous `size - count`
  // produced a one-based position, e.g. `size` when the top element was set.
  return size - 1 - __builtin_clzg(m);
}

// Elementwise operations.
// Element-wise minimum of `x` and `y` via __builtin_elementwise_min.
template <typename T, size_t N>
LIBC_INLINE constexpr simd<T, N> min(simd<T, N> x, simd<T, N> y) {
return __builtin_elementwise_min(x, y);
}
// Element-wise maximum of `x` and `y` via __builtin_elementwise_max.
template <typename T, size_t N>
LIBC_INLINE constexpr simd<T, N> max(simd<T, N> x, simd<T, N> y) {
return __builtin_elementwise_max(x, y);
}

// Reduction operations.
// Entry point: forwards to the overload chosen by the type of `op`, which
// defaults to cpp::plus<> (sum of all elements).
// NOTE(review): for an `Op` that none of the tag overloads below accept, the
// call in the body appears to resolve back to this same template and recurse
// without terminating -- confirm, and consider rejecting unsupported tags.
template <typename T, size_t N, typename Op = cpp::plus<>>
LIBC_INLINE constexpr T reduce(simd<T, N> v, Op op = {}) {
return reduce(v, op);
}
// Sum of all elements (__builtin_reduce_add).
template <typename T, size_t N>
LIBC_INLINE constexpr T reduce(simd<T, N> v, cpp::plus<>) {
return __builtin_reduce_add(v);
}
// Product of all elements (__builtin_reduce_mul).
template <typename T, size_t N>
LIBC_INLINE constexpr T reduce(simd<T, N> v, cpp::multiplies<>) {
return __builtin_reduce_mul(v);
}
// Bitwise AND of all elements (__builtin_reduce_and).
template <typename T, size_t N>
LIBC_INLINE constexpr T reduce(simd<T, N> v, cpp::bit_and<>) {
return __builtin_reduce_and(v);
}
// Bitwise OR of all elements (__builtin_reduce_or).
template <typename T, size_t N>
LIBC_INLINE constexpr T reduce(simd<T, N> v, cpp::bit_or<>) {
return __builtin_reduce_or(v);
}
// Bitwise XOR of all elements (__builtin_reduce_xor).
template <typename T, size_t N>
LIBC_INLINE constexpr T reduce(simd<T, N> v, cpp::bit_xor<>) {
return __builtin_reduce_xor(v);
}
// Horizontal minimum: the smallest element of `v` (__builtin_reduce_min).
template <typename T, size_t N> LIBC_INLINE constexpr T hmin(simd<T, N> v) {
return __builtin_reduce_min(v);
}
// Horizontal maximum: the largest element of `v` (__builtin_reduce_max).
template <typename T, size_t N> LIBC_INLINE constexpr T hmax(simd<T, N> v) {
return __builtin_reduce_max(v);
}

// Accessor helpers.
// Reads sizeof(T) bytes starting at `ptr` into a vector of type T. The bytes
// are copied with __builtin_memcpy, so `ptr` needs no particular alignment.
// Only available when T is a simd vector type.
template <typename T>
LIBC_INLINE enable_if_simd_t<T> load_unaligned(const void *ptr) {
  T result;
  __builtin_memcpy(&result, ptr, sizeof(result));
  return result;
}
// Same as load_unaligned, but first tells the compiler that `ptr` is aligned
// to alignof(T) through __builtin_assume_aligned.
// Only available when T is a simd vector type.
template <typename T>
LIBC_INLINE enable_if_simd_t<T> load_aligned(const void *ptr) {
  const void *assumed_aligned = __builtin_assume_aligned(ptr, alignof(T));
  return load_unaligned<T>(assumed_aligned);
}
// Writes the sizeof(T) bytes of `v` to `ptr` with __builtin_memcpy, so `ptr`
// needs no particular alignment. Only available when T is a simd vector type.
// Returns nothing: the function was previously declared to return T but had
// no return statement, so control flowed off the end of a non-void function,
// which is undefined behaviour.
template <typename T>
LIBC_INLINE cpp::enable_if_t<is_simd_v<T>, void> store_unaligned(T v,
                                                                 void *ptr) {
  __builtin_memcpy(ptr, &v, sizeof(T));
}
// Same as store_unaligned, but first tells the compiler that `ptr` is aligned
// to alignof(T) through __builtin_assume_aligned. Only available when T is a
// simd vector type.
// Returns nothing: the function was previously declared to return T but had
// no return statement, so control flowed off the end of a non-void function,
// which is undefined behaviour.
template <typename T>
LIBC_INLINE cpp::enable_if_t<is_simd_v<T>, void> store_aligned(T v, void *ptr) {
  store_unaligned<T>(v, __builtin_assume_aligned(ptr, alignof(T)));
}
// Loads a vector of type T from `ptr` under control of the mask `m` using
// __builtin_masked_load, with `passthru` supplied as the builtin's
// pass-through operand. T cannot be deduced from the mask and must be given
// explicitly. Only available when T is a simd vector type.
//
// The default pass-through is a poison value of the vector type T itself;
// the former default named the simd_element_type trait struct, which is not
// convertible to T. The pointer is cast to `T *`, the same form in which
// masked_store hands its pointer to the matching builtin.
// NOTE(review): confirm the pointer type the builtin expects on the minimum
// supported clang.
template <typename T>
LIBC_INLINE enable_if_simd_t<T>
masked_load(simd<bool, simd_size_v<T>> m, void *ptr,
            T passthru = internal::poison<T>()) {
  return __builtin_masked_load(m, static_cast<T *>(ptr), passthru);
}
// Stores `v` to `ptr` under control of the mask `m` using
// __builtin_masked_store. The pointer is passed through
// __builtin_assume_aligned with alignof(T), so callers must supply storage
// aligned for T. Only available when T is a simd vector type.
// Returns nothing: the function was previously declared to return T but had
// no return statement, so control flowed off the end of a non-void function,
// which is undefined behaviour.
template <typename T>
LIBC_INLINE cpp::enable_if_t<is_simd_v<T>, void>
masked_store(simd<bool, simd_size_v<T>> m, T v, void *ptr) {
  __builtin_masked_store(
      m, v, static_cast<T *>(__builtin_assume_aligned(ptr, alignof(T))));
}

// Construction helpers.
// Builds an N-element vector from the scalar `v` by constructing simd<T, N>
// from it.
template <typename T, size_t N> LIBC_INLINE constexpr simd<T, N> splat(T v) {
return simd<T, N>(v);
}
// Same, using the element count of the default simd<T>.
template <typename T> LIBC_INLINE constexpr simd<T> splat(T v) {
return splat<T, simd_size_v<simd<T>>>(v);
}
// Builds an N-element vector whose element i holds base + T(i) * step.
// With the defaults this is the sequence 0, 1, 2, ...
template <typename T, unsigned N>
LIBC_INLINE constexpr simd<T, N> iota(T base = T(0), T step = T(1)) {
  simd<T, N> sequence{};
  for (unsigned lane = 0; lane != N; ++lane)
    sequence[lane] = base + T(lane) * step;
  return sequence;
}
// Same, using the element count of the default simd<T>.
template <typename T>
LIBC_INLINE constexpr simd<T> iota(T base = T(0), T step = T(1)) {
  constexpr auto lanes = simd_size_v<simd<T>>;
  return iota<T, lanes>(base, step);
}

// Conditional helpers.
// Element-wise choice between `x` and `y` driven by the mask `m`, expressed
// with the vector conditional operator `m ? x : y`.
template <typename T, size_t N>
LIBC_INLINE constexpr simd<T, N> select(simd<bool, N> m, simd<T, N> x,
simd<T, N> y) {
return m ? x : y;
}

// TODO: where expressions, scalar overloads, ABI types.

} // namespace cpp
} // namespace LIBC_NAMESPACE_DECL

#endif // LIBC_HAS_VECTOR_TYPE
#endif
6 changes: 6 additions & 0 deletions libc/src/__support/macros/attributes.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,4 +73,10 @@ LIBC_THREAD_MODE_EXTERNAL.
#define LIBC_PREFERED_TYPE(TYPE)
#endif

// LIBC_HAS_VECTOR_TYPE is 1 when the compiler supports the ext_vector_type
// attribute together with boolean ext vectors, and 0 otherwise.
// The probes are nested behind defined() checks because a compiler without
// __has_feature (or __has_attribute) cannot even parse
// `__has_feature(...)` inside an #if expression.
#if defined(__has_attribute) && defined(__has_feature)
#if __has_attribute(ext_vector_type) && __has_feature(ext_vector_type_boolean)
#define LIBC_HAS_VECTOR_TYPE 1
#else
#define LIBC_HAS_VECTOR_TYPE 0
#endif
#else
#define LIBC_HAS_VECTOR_TYPE 0
#endif

#endif // LLVM_LIBC_SRC___SUPPORT_MACROS_ATTRIBUTES_H
4 changes: 4 additions & 0 deletions libc/src/__support/macros/properties/cpu_features.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@
#endif // LIBC_TARGET_CPU_HAS_ARM_FPU_DOUBLE
#endif // __ARM_FP

// Defined whenever the compiler predefines __ARM_NEON.
#if defined(__ARM_NEON)
#define LIBC_TARGET_CPU_HAS_ARM_NEON
#endif

#if defined(__riscv_flen)
// https://github.com/riscv-non-isa/riscv-c-api-doc/blob/main/src/c-api.adoc
#if defined(__riscv_zfhmin)
Expand Down
1 change: 1 addition & 0 deletions libc/src/string/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ add_header_library(
libc.hdr.stdint_proxy
libc.src.__support.CPP.bitset
libc.src.__support.CPP.type_traits
libc.src.__support.CPP.simd
libc.src.__support.common
${string_config_options}
)
Expand Down
53 changes: 53 additions & 0 deletions libc/src/string/memory_utils/generic/inline_strlen.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
//===-- Strlen for generic SIMD types -------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_GENERIC_INLINE_STRLEN_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_GENERIC_INLINE_STRLEN_H

#include "src/__support/CPP/simd.h"
#include "src/__support/common.h"

namespace LIBC_NAMESPACE_DECL {
namespace internal {

// Exploit the underlying integer representation to do a variable shift.
// Shifts the mask right by `shift` bit positions: the mask is reinterpreted
// as an unsigned integer of the same size, shifted, and reinterpreted back.
LIBC_INLINE constexpr cpp::simd_mask<char> shift_mask(cpp::simd_mask<char> m,
                                                      size_t shift) {
  using mask_ty = cpp::simd_mask<char>;
  using bits_ty = cpp::internal::get_as_integer_type_t<mask_ty>;
  const bits_ty bits = cpp::bit_cast<bits_ty>(m);
  return cpp::bit_cast<mask_ty>(bits >> shift);
}

// Returns the number of characters before the first null byte at or after
// `src`, scanning one aligned vector of characters at a time.
// Each load covers a whole aligned vector, so it can touch bytes before `src`
// and after the terminator; the function is excluded from address
// sanitization.
[[clang::no_sanitize("address")]] LIBC_INLINE size_t
string_length(const char *src) {
  using vector_ty = cpp::simd<char>;
  using mask_ty = cpp::simd_mask<char>;
  constexpr vector_ty zeros = cpp::splat('\0');

  // Start from the vector-aligned address at or below src and remember how
  // many leading bytes of that first vector lie before src.
  const vector_ty *block = reinterpret_cast<const vector_ty *>(
      __builtin_align_down(src, alignof(vector_ty)));
  const size_t skipped = src - reinterpret_cast<const char *>(block);

  // First vector: shift out the matches that belong to bytes before src, so
  // that the index of the first remaining match is relative to src.
  const mask_ty head = shift_mask(
      cpp::simd_cast<bool>(cpp::load_aligned<vector_ty>(block) == zeros),
      skipped);
  if (cpp::any_of(head))
    return cpp::find_first_set(head);

  // Remaining vectors are fully past src; stop at the first one holding a
  // null byte.
  while (true) {
    ++block;
    const mask_ty hits =
        cpp::simd_cast<bool>(cpp::load_aligned<vector_ty>(block) == zeros);
    if (cpp::any_of(hits))
      return (reinterpret_cast<const char *>(block) - src) +
             cpp::find_first_set(hits);
  }
}
} // namespace internal

namespace string_length_impl = internal;
} // namespace LIBC_NAMESPACE_DECL

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_GENERIC_INLINE_STRLEN_H
6 changes: 4 additions & 2 deletions libc/src/string/string_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,16 @@
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY

#if defined(LIBC_COPT_STRING_UNSAFE_WIDE_READ)
#if defined(LIBC_TARGET_ARCH_IS_X86)
#if LIBC_HAS_VECTOR_TYPE
#include "src/string/memory_utils/generic/inline_strlen.h"
Comment on lines +26 to +27
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want the generic vector strlen to be the default, or the target-specific one? Either way, we should probably consider a follow-up PR that allows configuring this.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That'd be my expectation, unless we expect the results to be significantly different.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd expect the target specific versions to be faster, unless there's a reason that they shouldn't be.

Copy link
Contributor Author

@jhuber6 jhuber6 Aug 29, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

https://godbolt.org/z/cceGfoYdh has a rough comparison. It's out of date, but the core should be the same. The LLVM vectors tend to output quite optimal code. Did you ever get that bazel build updated? I'd like to land this after the CI is done.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That link only has the x86 specific versions, I don't see the generic one. Either way, I have realized I should probably just do proper testing on both of them so I'm not going to block this PR on performance.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's in a separate window, check source 1 and 2. @Sterling-Augustine already did a performance comparison before, this version doesn't change the core implementation he used #152389 (comment). TL;DR, it's pretty much the same, very slightly slower for AVX512. I think the only difference between mine and his is that I do the unaligned load unconditionally. I could probably change that in this PR to match his if we think it's faster.

#elif defined(LIBC_TARGET_ARCH_IS_X86)
#include "src/string/memory_utils/x86_64/inline_strlen.h"
#elif defined(LIBC_TARGET_ARCH_IS_AARCH64) && defined(__ARM_NEON)
#include "src/string/memory_utils/aarch64/inline_strlen.h"
#else
namespace string_length_impl = LIBC_NAMESPACE::wide_read;
#endif
#endif
#endif // defined(LIBC_COPT_STRING_UNSAFE_WIDE_READ)

namespace LIBC_NAMESPACE_DECL {
namespace internal {
Expand Down
14 changes: 14 additions & 0 deletions utils/bazel/llvm-project-overlay/libc/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -677,6 +677,18 @@ libc_support_library(
],
)

# Header-only portable SIMD helpers (src/__support/CPP/simd.h).
libc_support_library(
name = "__support_cpp_simd",
hdrs = ["src/__support/CPP/simd.h"],
deps = [
":__support_cpp_algorithm",
":__support_cpp_bit",
":__support_cpp_type_traits",
":__support_macros_attributes",
":hdr_stdint_proxy",
],
)

libc_support_library(
name = "__support_cpp_span",
hdrs = ["src/__support/CPP/span.h"],
Expand Down Expand Up @@ -4938,6 +4950,7 @@ libc_support_library(
"src/string/memory_utils/arm/inline_memset.h",
"src/string/memory_utils/generic/aligned_access.h",
"src/string/memory_utils/generic/byte_per_byte.h",
"src/string/memory_utils/generic/inline_strlen.h",
"src/string/memory_utils/inline_bcmp.h",
"src/string/memory_utils/inline_bzero.h",
"src/string/memory_utils/inline_memcmp.h",
Expand All @@ -4964,6 +4977,7 @@ libc_support_library(
":__support_cpp_array",
":__support_cpp_bit",
":__support_cpp_cstddef",
":__support_cpp_simd",
":__support_cpp_type_traits",
":__support_macros_attributes",
":__support_macros_optimization",
Expand Down
Loading