Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions libcxx/docs/ReleaseNotes/22.rst
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ Improvements and New Features
- Multiple internal types have been refactored to use ``[[no_unique_address]]``, resulting in faster compile times and
reduced debug information.

- The performance of ``std::find`` has been improved by up to 2x for integral types.

Deprecations and Removals
-------------------------

Expand Down
110 changes: 87 additions & 23 deletions libcxx/include/__algorithm/find.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

#include <__algorithm/find_segment_if.h>
#include <__algorithm/min.h>
#include <__algorithm/simd_utils.h>
#include <__algorithm/unwrap_iter.h>
#include <__bit/countr.h>
#include <__bit/invert_if.h>
Expand Down Expand Up @@ -44,39 +45,102 @@ _LIBCPP_BEGIN_NAMESPACE_STD
// generic implementation
template <class _Iter, class _Sent, class _Tp, class _Proj>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Iter
__find(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) {
__find_loop(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In an ideal world, the code in __find_loop as-is would be using __builtin_assume_dereferenceable and getting vectorized: https://godbolt.org/z/M9xo6PdY1. IIUC this doesn't work in @philnik777's experience because:

  1. The function that we want to vectorize can't contain pointer arithmetic, which is always going to happen because vector.begin() and vector.end() perform such arithmetic.
  2. The code is inside a function template, and apparently the optimization doesn't work inside templates?

I'm just summarizing the issues we encountered while trying to make it work with the builtin. CC @fhahn, let's figure out whether we've done something incorrectly here or whether something needs to change in the optimization.

for (; __first != __last; ++__first)
if (std::__invoke(__proj, *__first) == __value)
break;
return __first;
}

// trivially equality comparable implementations
template <class _Tp,
class _Up,
class _Proj,
__enable_if_t<__is_identity<_Proj>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value &&
sizeof(_Tp) == 1,
int> = 0>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) {
if (auto __ret = std::__constexpr_memchr(__first, __value, __last - __first))
return __ret;
return __last;
// Generic overload of __find: performs no specialised search itself and simply
// forwards to __find_loop, moving the iterator and sentinel into the call and
// passing the value and projection through unchanged. Returns whatever
// __find_loop returns.
template <class _Iter, class _Sent, class _Tp, class _Proj>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Iter
__find(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) {
return std::__find_loop(std::move(__first), std::move(__last), __value, __proj);
}

#if _LIBCPP_HAS_WIDE_CHARACTERS
template <class _Tp,
class _Up,
class _Proj,
__enable_if_t<__is_identity<_Proj>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value &&
sizeof(_Tp) == sizeof(wchar_t) && _LIBCPP_ALIGNOF(_Tp) >= _LIBCPP_ALIGNOF(wchar_t),
int> = 0>
#if _LIBCPP_VECTORIZE_ALGORITHMS
// Searches the contiguous range [__first, __last) for an element equal to __value,
// comparing __vec_size elements per vector operation.
//
// Returns a pointer to the matching element located by the first vector that
// reports a match, or the result of the scalar __find_loop when the vector paths
// do not apply. During constant evaluation the whole vector path is skipped and
// only the scalar loop at the bottom runs.
//
// NOTE(review): the "no match" result of the overlapping tail load below relies on
// __find_first_set returning __vec_size when no lane is set, so that the returned
// pointer equals __last -- confirm against simd_utils.h.
template <class _Tp, class _Up>
[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI
_LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find_vectorized(_Tp* __first, _Tp* __last, _Up __value) {
if (!__libcpp_is_constant_evaluated()) {
// Each iteration of the main loop handles __unroll_count vectors of __vec_size elements.
constexpr size_t __unroll_count = 4;
constexpr size_t __vec_size = __native_vector_size<_Tp>;
using __vec = __simd_vector<_Tp, __vec_size>;

// Remember the start of the range so the tail handling can tell whether at least
// one full vector's worth of elements lies before the current position.
auto __orig_first = __first;

auto __values = static_cast<__simd_vector<_Up, __vec_size>>(__value); // broadcast the value
// Main loop: runs while at least __unroll_count full vectors remain.
while (static_cast<size_t>(__last - __first) >= __unroll_count * __vec_size) [[__unlikely__]] {
__vec __lhs[__unroll_count];

// Issue all loads first, then do the comparisons.
for (size_t __i = 0; __i != __unroll_count; ++__i)
__lhs[__i] = std::__load_vector<__vec>(__first + __i * __vec_size);

// Compare in increasing address order so the lowest-addressed matching vector wins.
for (size_t __i = 0; __i != __unroll_count; ++__i) {
if (auto __cmp_res = __lhs[__i] == __values; std::__any_of(__cmp_res)) {
// Offset = elements in the preceding vectors of this batch + lane index within this vector.
auto __offset = __i * __vec_size + std::__find_first_set(__cmp_res);
return __first + __offset;
}
}

__first += __unroll_count * __vec_size;
}

// check the remaining 0-3 vectors
while (static_cast<size_t>(__last - __first) >= __vec_size) {
if (auto __cmp_res = std::__load_vector<__vec>(__first) == __values; std::__any_of(__cmp_res)) {
return __first + std::__find_first_set(__cmp_res);
}
__first += __vec_size;
}

// Nothing left to examine: __first has reached __last.
if (__last - __first == 0)
return __first;

// Check if we can load elements in front of the current pointer. If that's the case load a vector at
// (last - vector_size) to check the remaining elements
// The elements of that vector that lie before the current __first were already
// compared by the loops above without producing a match.
if (static_cast<size_t>(__first - __orig_first) >= __vec_size) {
__first = __last - __vec_size;
return __first + std::__find_first_set(std::__load_vector<__vec>(__first) == __values);
}
}

// Scalar fallback: reached during constant evaluation, or when fewer than
// __vec_size elements have been consumed so the overlapping tail load above is
// not possible. Uses an identity projection with the generic loop.
__identity __proj;
return std::__find_loop(__first, __last, __value, __proj);
}
#endif

#ifndef _LIBCPP_CXX03_LANG
// trivially equality comparable implementations
template <
class _Tp,
class _Up,
class _Proj,
__enable_if_t<__is_identity<_Proj>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value, int> = 0>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) {
if (auto __ret = std::__constexpr_wmemchr(__first, __value, __last - __first))
return __ret;
return __last;
if constexpr (sizeof(_Tp) == 1) {
if (auto __ret = std::__constexpr_memchr(__first, __value, __last - __first))
return __ret;
return __last;
}
# if _LIBCPP_HAS_WIDE_CHARACTERS
else if constexpr (sizeof(_Tp) == sizeof(wchar_t) && _LIBCPP_ALIGNOF(_Tp) >= _LIBCPP_ALIGNOF(wchar_t)) {
if (auto __ret = std::__constexpr_wmemchr(__first, __value, __last - __first))
return __ret;
return __last;
}
# endif
# if _LIBCPP_VECTORIZE_ALGORITHMS
else if constexpr (is_integral<_Tp>::value) {
return std::__find_vectorized(__first, __last, __value);
}
# endif
else {
__identity __proj;
return std::__find_loop(__first, __last, __value, __proj);
}
}
#endif // _LIBCPP_HAS_WIDE_CHARACTERS
#endif

// TODO: This should also be possible to get right with different signedness
// cast integral types to allow vectorization
Expand Down
5 changes: 5 additions & 0 deletions libcxx/include/__algorithm/simd_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,11 @@ template <class _VecT, class _Iter>
}(make_index_sequence<__simd_vector_size_v<_VecT>>{});
}

// Reports whether at least one lane of __vec is set: the vector is converted to a
// vector of bool with the same lane count and the lanes are OR-reduced.
template <class _Tp, size_t _Np>
[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI bool __any_of(__simd_vector<_Tp, _Np> __vec) noexcept {
  using __mask_vec = __simd_vector<bool, _Np>;

  const auto __mask = __builtin_convertvector(__vec, __mask_vec);
  return __builtin_reduce_or(__mask);
}

template <class _Tp, size_t _Np>
[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI bool __all_of(__simd_vector<_Tp, _Np> __vec) noexcept {
return __builtin_reduce_and(__builtin_convertvector(__vec, __simd_vector<bool, _Np>));
Expand Down
2 changes: 2 additions & 0 deletions libcxx/include/module.modulemap.in
Original file line number Diff line number Diff line change
Expand Up @@ -1225,6 +1225,7 @@ module std [system] {
header "deque"
export *
export std.iterator.reverse_iterator
export std.algorithm.simd_utils // This is a workaround for https://llvm.org/PR120108.
}

module exception {
Expand Down Expand Up @@ -2238,6 +2239,7 @@ module std [system] {
header "vector"
export std.iterator.reverse_iterator
export *
export std.algorithm.simd_utils // This is a workaround for https://llvm.org/PR120108.
}

// Experimental C++ Standard Library interfaces
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ int main(int argc, char** argv) {
// find
bm.template operator()<std::vector<char>>("std::find(vector<char>) (" + comment + ")", std_find);
bm.template operator()<std::vector<int>>("std::find(vector<int>) (" + comment + ")", std_find);
bm.template operator()<std::vector<long long>>("std::find(vector<long long>) (" + comment + ")", std_find);
bm.template operator()<std::deque<int>>("std::find(deque<int>) (" + comment + ")", std_find);
bm.template operator()<std::list<int>>("std::find(list<int>) (" + comment + ")", std_find);

Expand Down
Loading