|
12 | 12 |
|
13 | 13 | #include <__algorithm/find_segment_if.h>
|
14 | 14 | #include <__algorithm/min.h>
|
| 15 | +#include <__algorithm/simd_utils.h> |
15 | 16 | #include <__algorithm/unwrap_iter.h>
|
16 | 17 | #include <__bit/countr.h>
|
17 | 18 | #include <__bit/invert_if.h>
|
@@ -44,39 +45,102 @@ _LIBCPP_BEGIN_NAMESPACE_STD
|
44 | 45 | // generic implementation
|
45 | 46 | template <class _Iter, class _Sent, class _Tp, class _Proj>
|
46 | 47 | _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Iter
|
47 |
| -__find(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) { |
| 48 | +__find_loop(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) { |
48 | 49 | for (; __first != __last; ++__first)
|
49 | 50 | if (std::__invoke(__proj, *__first) == __value)
|
50 | 51 | break;
|
51 | 52 | return __first;
|
52 | 53 | }
|
53 | 54 |
|
54 |
| -// trivially equality comparable implementations |
55 |
| -template <class _Tp, |
56 |
| - class _Up, |
57 |
| - class _Proj, |
58 |
| - __enable_if_t<__is_identity<_Proj>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value && |
59 |
| - sizeof(_Tp) == 1, |
60 |
| - int> = 0> |
61 |
| -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) { |
62 |
| - if (auto __ret = std::__constexpr_memchr(__first, __value, __last - __first)) |
63 |
| - return __ret; |
64 |
| - return __last; |
| 55 | +template <class _Iter, class _Sent, class _Tp, class _Proj> |
| 56 | +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Iter |
| 57 | +__find(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) { |
| 58 | + return std::__find_loop(std::move(__first), std::move(__last), __value, __proj); |
65 | 59 | }
|
66 | 60 |
|
67 |
| -#if _LIBCPP_HAS_WIDE_CHARACTERS |
68 |
| -template <class _Tp, |
69 |
| - class _Up, |
70 |
| - class _Proj, |
71 |
| - __enable_if_t<__is_identity<_Proj>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value && |
72 |
| - sizeof(_Tp) == sizeof(wchar_t) && _LIBCPP_ALIGNOF(_Tp) >= _LIBCPP_ALIGNOF(wchar_t), |
73 |
| - int> = 0> |
#if _LIBCPP_VECTORIZE_ALGORITHMS
// Searches the pointer range [__first, __last) for __value by comparing whole SIMD vectors of elements.
//
// Outside of constant evaluation the range is processed in three stages:
//   1. blocks of four vectors at a time, with all loads of a block issued before any comparison,
//   2. single vectors while at least one full vector remains,
//   3. a final, overlapping vector ending exactly at __last, used only when at least one full vector
//      lies between the start of the range and the current position.
// During constant evaluation, and for ranges too short for stage 3, the remaining elements are handed to
// __find_loop with an identity projection.
template <class _Tp, class _Up>
[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI
_LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find_vectorized(_Tp* __first, _Tp* __last, _Up __value) {
  if (!__libcpp_is_constant_evaluated()) {
    constexpr size_t __unroll     = 4;
    constexpr size_t __lanes      = __native_vector_size<_Tp>;
    constexpr size_t __block_size = __unroll * __lanes;
    using __vec_t                 = __simd_vector<_Tp, __lanes>;

    // Remember where the range started; stage 3 needs to know how far we have advanced.
    _Tp* const __range_begin = __first;

    // Broadcast the searched-for value into every lane.
    // NOTE(review): this vector has _Up lanes while the loaded vectors have _Tp lanes; presumably callers
    // only get here with lane types for which the vector comparison below is well-formed -- confirm.
    auto __needle = static_cast<__simd_vector<_Up, __lanes>>(__value);

    // Stage 1: unrolled blocks.
    while (static_cast<size_t>(__last - __first) >= __block_size) [[__unlikely__]] {
      __vec_t __chunks[__unroll];

      for (size_t __k = 0; __k != __unroll; ++__k)
        __chunks[__k] = std::__load_vector<__vec_t>(__first + __k * __lanes);

      for (size_t __k = 0; __k != __unroll; ++__k) {
        auto __mask = __chunks[__k] == __needle;
        if (std::__any_of(__mask))
          return __first + (__k * __lanes + std::__find_first_set(__mask));
      }

      __first += __block_size;
    }

    // Stage 2: the remaining 0-3 full vectors.
    for (; static_cast<size_t>(__last - __first) >= __lanes; __first += __lanes) {
      auto __mask = std::__load_vector<__vec_t>(__first) == __needle;
      if (std::__any_of(__mask))
        return __first + std::__find_first_set(__mask);
    }

    // Nothing left to look at.
    if (__first == __last)
      return __first;

    // Stage 3: fewer than one vector of elements remains. If at least one full vector precedes the
    // current position, re-check the last full vector of the range, which overlaps elements that were
    // already rejected.
    // NOTE(review): this relies on __find_first_set mapping an all-false mask to the vector size, so that
    // "no match" yields __last -- confirm against its definition.
    if (static_cast<size_t>(__first - __range_begin) >= __lanes) {
      _Tp* const __tail = __last - __lanes;
      return __tail + std::__find_first_set(std::__load_vector<__vec_t>(__tail) == __needle);
    }
  }

  // Constant evaluation, or a range too short for an overlapping load: scan element by element.
  __identity __proj;
  return std::__find_loop(__first, __last, __value, __proj);
}
#endif // _LIBCPP_VECTORIZE_ALGORITHMS
| 112 | + |
| 113 | +#ifndef _LIBCPP_CXX03_LANG |
| 114 | +// trivially equality comparable implementations |
| 115 | +template < |
| 116 | + class _Tp, |
| 117 | + class _Up, |
| 118 | + class _Proj, |
| 119 | + __enable_if_t<__is_identity<_Proj>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value, int> = 0> |
74 | 120 | _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) {
|
75 |
| - if (auto __ret = std::__constexpr_wmemchr(__first, __value, __last - __first)) |
76 |
| - return __ret; |
77 |
| - return __last; |
| 121 | + if constexpr (sizeof(_Tp) == 1) { |
| 122 | + if (auto __ret = std::__constexpr_memchr(__first, __value, __last - __first)) |
| 123 | + return __ret; |
| 124 | + return __last; |
| 125 | + } |
| 126 | +# if _LIBCPP_HAS_WIDE_CHARACTERS |
| 127 | + else if constexpr (sizeof(_Tp) == sizeof(wchar_t) && _LIBCPP_ALIGNOF(_Tp) >= _LIBCPP_ALIGNOF(wchar_t)) { |
| 128 | + if (auto __ret = std::__constexpr_wmemchr(__first, __value, __last - __first)) |
| 129 | + return __ret; |
| 130 | + return __last; |
| 131 | + } |
| 132 | +# endif |
| 133 | +# if _LIBCPP_VECTORIZE_ALGORITHMS |
| 134 | + else if constexpr (is_integral<_Tp>::value) { |
| 135 | + return std::__find_vectorized(__first, __last, __value); |
| 136 | + } |
| 137 | +# endif |
| 138 | + else { |
| 139 | + __identity __proj; |
| 140 | + return std::__find_loop(__first, __last, __value, __proj); |
| 141 | + } |
78 | 142 | }
|
79 |
| -#endif // _LIBCPP_HAS_WIDE_CHARACTERS |
| 143 | +#endif |
80 | 144 |
|
81 | 145 | // TODO: This should also be possible to get right with different signedness
|
82 | 146 | // cast integral types to allow vectorization
|
|
0 commit comments