Skip to content

Commit eaf7355

Browse files
Improve basic_string::find_first_of and basic_string::find_last_of vectorization for large needles or very large haystacks (#5029)
Co-authored-by: Stephan T. Lavavej <[email protected]>
1 parent: 0119980 · commit: eaf7355

File tree

5 files changed: +850 -236 lines changed

benchmarks/src/find_first_of.cpp

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,8 @@
1313
#include <type_traits>
1414
#include <vector>
1515

16+
#include "skewed_allocator.hpp"
17+
1618
using namespace std;
1719

1820
enum class AlgType { std_func, str_member_first, str_member_last };
@@ -24,7 +26,8 @@ void bm(benchmark::State& state) {
2426
const size_t HSize = Pos * 2;
2527
const size_t Which = 0;
2628

27-
using container = conditional_t<Alg == AlgType::std_func, vector<T>, basic_string<T>>;
29+
using container = conditional_t<Alg == AlgType::std_func, vector<T, not_highly_aligned_allocator<T>>,
30+
basic_string<T, char_traits<T>, not_highly_aligned_allocator<T>>>;
2831

2932
constexpr T HaystackFiller{' '};
3033
static_assert(HaystackFiller < Start, "The following iota() should not produce the haystack filler.");
@@ -59,8 +62,9 @@ void bm(benchmark::State& state) {
5962
}
6063

6164
void common_args(auto bm) {
62-
bm->Args({2, 3})->Args({7, 4})->Args({9, 3})->Args({22, 5})->Args({58, 2})->Args({102, 4});
63-
bm->Args({325, 1})->Args({400, 50})->Args({1011, 11})->Args({1502, 23})->Args({3056, 7});
65+
bm->Args({2, 3})->Args({6, 81})->Args({7, 4})->Args({9, 3})->Args({22, 5})->Args({58, 2});
66+
bm->Args({75, 85})->Args({102, 4})->Args({200, 46})->Args({325, 1})->Args({400, 50});
67+
bm->Args({1011, 11})->Args({1280, 46})->Args({1502, 23})->Args({2203, 54})->Args({3056, 7});
6468
}
6569

6670
BENCHMARK(bm<AlgType::std_func, uint8_t>)->Apply(common_args);

stl/inc/__msvc_string_view.hpp

Lines changed: 61 additions & 67 deletions
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,15 @@ extern "C" {
2929
// compiler has to assume that the denoted arrays are "globally address taken", and that any later calls to
3030
// unanalyzable routines may modify those arrays.
3131

32+
__declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_1(
33+
const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept;
34+
__declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_2(
35+
const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept;
36+
__declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_4(
37+
const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept;
38+
__declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_8(
39+
const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept;
40+
3241
__declspec(noalias) size_t __stdcall __std_find_last_of_trivial_pos_1(
3342
const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept;
3443
__declspec(noalias) size_t __stdcall __std_find_last_of_trivial_pos_2(
@@ -38,6 +47,23 @@ __declspec(noalias) size_t __stdcall __std_find_last_of_trivial_pos_2(
3847

3948
_STD_BEGIN
4049

50+
template <class _Ty1, class _Ty2>
51+
size_t _Find_first_of_pos_vectorized(const _Ty1* const _Haystack, const size_t _Haystack_length,
52+
const _Ty2* const _Needle, const size_t _Needle_length) noexcept {
53+
_STL_INTERNAL_STATIC_ASSERT(sizeof(_Ty1) == sizeof(_Ty2));
54+
if constexpr (sizeof(_Ty1) == 1) {
55+
return ::__std_find_first_of_trivial_pos_1(_Haystack, _Haystack_length, _Needle, _Needle_length);
56+
} else if constexpr (sizeof(_Ty1) == 2) {
57+
return ::__std_find_first_of_trivial_pos_2(_Haystack, _Haystack_length, _Needle, _Needle_length);
58+
} else if constexpr (sizeof(_Ty1) == 4) {
59+
return ::__std_find_first_of_trivial_pos_4(_Haystack, _Haystack_length, _Needle, _Needle_length);
60+
} else if constexpr (sizeof(_Ty1) == 8) {
61+
return ::__std_find_first_of_trivial_pos_8(_Haystack, _Haystack_length, _Needle, _Needle_length);
62+
} else {
63+
_STL_INTERNAL_STATIC_ASSERT(false); // unexpected size
64+
}
65+
}
66+
4167
template <class _Ty1, class _Ty2>
4268
size_t _Find_last_of_pos_vectorized(const _Ty1* const _Haystack, const size_t _Haystack_length,
4369
const _Ty2* const _Needle, const size_t _Needle_length) noexcept {
@@ -834,48 +860,31 @@ constexpr size_t _Traits_find_first_of(_In_reads_(_Hay_size) const _Traits_ptr_t
834860
const auto _Hay_end = _Haystack + _Hay_size;
835861

836862
if constexpr (_Is_implementation_handled_char_traits<_Traits>) {
837-
if (!_STD _Is_constant_evaluated()) {
838-
using _Elem = typename _Traits::char_type;
839-
840863
#if _USE_STD_VECTOR_ALGORITHMS
841-
const bool _Try_vectorize = _Hay_size - _Start_at > _Threshold_find_first_of;
842-
843-
// Additional condition for when the vectorization outperforms the table lookup
844-
constexpr size_t _Find_first_of_bitmap_threshold = sizeof(_Elem) == 1 ? 48 : sizeof(_Elem) == 8 ? 8 : 16;
845-
846-
const bool _Use_bitmap = !_Try_vectorize || _Needle_size > _Find_first_of_bitmap_threshold;
847-
#else // ^^^ _USE_STD_VECTOR_ALGORITHMS / !_USE_STD_VECTOR_ALGORITHMS vvv
848-
const bool _Use_bitmap = true;
849-
#endif // ^^^ !_USE_STD_VECTOR_ALGORITHMS ^^^
850-
851-
if (_Use_bitmap) {
852-
_String_bitmap<_Elem> _Matches;
853-
854-
if (_Matches._Mark(_Needle, _Needle + _Needle_size)) {
855-
for (auto _Match_try = _Hay_start; _Match_try < _Hay_end; ++_Match_try) {
856-
if (_Matches._Match(*_Match_try)) {
857-
return static_cast<size_t>(_Match_try - _Haystack); // found a match
858-
}
859-
}
860-
return static_cast<size_t>(-1); // no match
864+
if (!_STD _Is_constant_evaluated()) {
865+
const size_t _Remaining_size = _Hay_size - _Start_at;
866+
if (_Remaining_size + _Needle_size >= _Threshold_find_first_of) {
867+
size_t _Pos = _Find_first_of_pos_vectorized(_Hay_start, _Remaining_size, _Needle, _Needle_size);
868+
if (_Pos != static_cast<size_t>(-1)) {
869+
_Pos += _Start_at;
861870
}
862-
863-
// couldn't put one of the characters into the bitmap, fall back to vectorized or serial algorithms
871+
return _Pos;
864872
}
873+
}
874+
#endif // _USE_STD_VECTOR_ALGORITHMS
865875

866-
#if _USE_STD_VECTOR_ALGORITHMS
867-
if (_Try_vectorize) {
868-
const _Traits_ptr_t<_Traits> _Found =
869-
_STD _Find_first_of_vectorized(_Hay_start, _Hay_end, _Needle, _Needle + _Needle_size);
870-
871-
if (_Found != _Hay_end) {
872-
return static_cast<size_t>(_Found - _Haystack); // found a match
873-
} else {
874-
return static_cast<size_t>(-1); // no match
876+
_String_bitmap<typename _Traits::char_type> _Matches;
877+
878+
if (_Matches._Mark(_Needle, _Needle + _Needle_size)) {
879+
for (auto _Match_try = _Hay_start; _Match_try < _Hay_end; ++_Match_try) {
880+
if (_Matches._Match(*_Match_try)) {
881+
return static_cast<size_t>(_Match_try - _Haystack); // found a match
875882
}
876883
}
877-
#endif // _USE_STD_VECTOR_ALGORITHMS
884+
return static_cast<size_t>(-1); // no match
878885
}
886+
887+
// couldn't put one of the characters into the bitmap, fall back to serial algorithm
879888
}
880889

881890
for (auto _Match_try = _Hay_start; _Match_try < _Hay_end; ++_Match_try) {
@@ -899,47 +908,32 @@ constexpr size_t _Traits_find_last_of(_In_reads_(_Hay_size) const _Traits_ptr_t<
899908
const auto _Hay_start = (_STD min)(_Start_at, _Hay_size - 1);
900909

901910
if constexpr (_Is_implementation_handled_char_traits<_Traits>) {
902-
if (!_STD _Is_constant_evaluated()) {
903-
using _Elem = typename _Traits::char_type;
904-
905-
bool _Use_bitmap = true;
911+
using _Elem = typename _Traits::char_type;
906912
#if _USE_STD_VECTOR_ALGORITHMS
907-
bool _Try_vectorize = false;
908-
909-
if constexpr (sizeof(_Elem) <= 2) {
910-
_Try_vectorize = _Hay_start + 1 > _Threshold_find_first_of;
911-
// Additional condition for when the vectorization outperforms the table lookup
912-
constexpr size_t _Find_last_of_bitmap_threshold = sizeof(_Elem) == 1 ? 48 : 8;
913-
914-
_Use_bitmap = !_Try_vectorize || _Needle_size > _Find_last_of_bitmap_threshold;
913+
if constexpr (sizeof(_Elem) <= 2) {
914+
if (!_STD _Is_constant_evaluated()) {
915+
const size_t _Remaining_size = _Hay_start + 1;
916+
if (_Remaining_size + _Needle_size >= _Threshold_find_first_of) { // same threshold for first/last
917+
return _Find_last_of_pos_vectorized(_Haystack, _Remaining_size, _Needle, _Needle_size);
918+
}
915919
}
920+
}
916921
#endif // _USE_STD_VECTOR_ALGORITHMS
917922

918-
if (_Use_bitmap) {
919-
_String_bitmap<_Elem> _Matches;
920-
if (_Matches._Mark(_Needle, _Needle + _Needle_size)) {
921-
for (auto _Match_try = _Haystack + _Hay_start;; --_Match_try) {
922-
if (_Matches._Match(*_Match_try)) {
923-
return static_cast<size_t>(_Match_try - _Haystack); // found a match
924-
}
925-
926-
if (_Match_try == _Haystack) {
927-
return static_cast<size_t>(-1); // at beginning, no more chance for match
928-
}
929-
}
923+
_String_bitmap<_Elem> _Matches;
924+
if (_Matches._Mark(_Needle, _Needle + _Needle_size)) {
925+
for (auto _Match_try = _Haystack + _Hay_start;; --_Match_try) {
926+
if (_Matches._Match(*_Match_try)) {
927+
return static_cast<size_t>(_Match_try - _Haystack); // found a match
930928
}
931929

932-
// couldn't put one of the characters into the bitmap, fall back to vectorized or serial algorithms
933-
}
934-
935-
#if _USE_STD_VECTOR_ALGORITHMS
936-
if constexpr (sizeof(_Elem) <= 2) {
937-
if (_Try_vectorize) {
938-
return _STD _Find_last_of_pos_vectorized(_Haystack, _Hay_start + 1, _Needle, _Needle_size);
930+
if (_Match_try == _Haystack) {
931+
return static_cast<size_t>(-1); // at beginning, no more chance for match
939932
}
940933
}
941-
#endif // _USE_STD_VECTOR_ALGORITHMS
942934
}
935+
936+
// couldn't put one of the characters into the bitmap, fall back to serial algorithm
943937
}
944938

945939
for (auto _Match_try = _Haystack + _Hay_start;; --_Match_try) {

stl/inc/algorithm

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,15 @@ extern "C" {
3838
// functions are in native code objects that the compiler cannot analyze. In the absence of the noalias attribute, the
3939
// compiler has to assume that the denoted arrays are "globally address taken", and that any later calls to
4040
// unanalyzable routines may modify those arrays.
41+
const void* __stdcall __std_find_first_of_trivial_1(
42+
const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept;
43+
const void* __stdcall __std_find_first_of_trivial_2(
44+
const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept;
45+
const void* __stdcall __std_find_first_of_trivial_4(
46+
const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept;
47+
const void* __stdcall __std_find_first_of_trivial_8(
48+
const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept;
49+
4150
__declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_1(
4251
const void* _First, const void* _Last, void* _Dest) noexcept;
4352
__declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_2(
@@ -73,6 +82,27 @@ __declspec(noalias) void __stdcall __std_replace_8(
7382
} // extern "C"
7483

7584
_STD_BEGIN
85+
template <class _Ty1, class _Ty2>
86+
_Ty1* _Find_first_of_vectorized(
87+
_Ty1* const _First1, _Ty1* const _Last1, _Ty2* const _First2, _Ty2* const _Last2) noexcept {
88+
_STL_INTERNAL_STATIC_ASSERT(sizeof(_Ty1) == sizeof(_Ty2));
89+
if constexpr (sizeof(_Ty1) == 1) {
90+
return const_cast<_Ty1*>(
91+
static_cast<const _Ty1*>(::__std_find_first_of_trivial_1(_First1, _Last1, _First2, _Last2)));
92+
} else if constexpr (sizeof(_Ty1) == 2) {
93+
return const_cast<_Ty1*>(
94+
static_cast<const _Ty1*>(::__std_find_first_of_trivial_2(_First1, _Last1, _First2, _Last2)));
95+
} else if constexpr (sizeof(_Ty1) == 4) {
96+
return const_cast<_Ty1*>(
97+
static_cast<const _Ty1*>(::__std_find_first_of_trivial_4(_First1, _Last1, _First2, _Last2)));
98+
} else if constexpr (sizeof(_Ty1) == 8) {
99+
return const_cast<_Ty1*>(
100+
static_cast<const _Ty1*>(::__std_find_first_of_trivial_8(_First1, _Last1, _First2, _Last2)));
101+
} else {
102+
_STL_INTERNAL_STATIC_ASSERT(false); // unexpected size
103+
}
104+
}
105+
76106
template <size_t _Nx>
77107
__declspec(noalias) void _Reverse_copy_vectorized(const void* _First, const void* _Last, void* _Dest) noexcept {
78108
if constexpr (_Nx == 1) {

stl/inc/xutility

Lines changed: 0 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -98,15 +98,6 @@ const void* __stdcall __std_find_last_trivial_2(const void* _First, const void*
9898
const void* __stdcall __std_find_last_trivial_4(const void* _First, const void* _Last, uint32_t _Val) noexcept;
9999
const void* __stdcall __std_find_last_trivial_8(const void* _First, const void* _Last, uint64_t _Val) noexcept;
100100

101-
const void* __stdcall __std_find_first_of_trivial_1(
102-
const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept;
103-
const void* __stdcall __std_find_first_of_trivial_2(
104-
const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept;
105-
const void* __stdcall __std_find_first_of_trivial_4(
106-
const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept;
107-
const void* __stdcall __std_find_first_of_trivial_8(
108-
const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept;
109-
110101
const void* __stdcall __std_search_1(
111102
const void* _First1, const void* _Last1, const void* _First2, size_t _Count2) noexcept;
112103
const void* __stdcall __std_search_2(
@@ -252,27 +243,6 @@ _Ty* _Find_last_vectorized(_Ty* const _First, _Ty* const _Last, const _TVal _Val
252243
// find_first_of vectorization is likely to be a win after this size (in elements)
253244
_INLINE_VAR constexpr ptrdiff_t _Threshold_find_first_of = 16;
254245

255-
template <class _Ty1, class _Ty2>
256-
_Ty1* _Find_first_of_vectorized(
257-
_Ty1* const _First1, _Ty1* const _Last1, _Ty2* const _First2, _Ty2* const _Last2) noexcept {
258-
_STL_INTERNAL_STATIC_ASSERT(sizeof(_Ty1) == sizeof(_Ty2));
259-
if constexpr (sizeof(_Ty1) == 1) {
260-
return const_cast<_Ty1*>(
261-
static_cast<const _Ty1*>(::__std_find_first_of_trivial_1(_First1, _Last1, _First2, _Last2)));
262-
} else if constexpr (sizeof(_Ty1) == 2) {
263-
return const_cast<_Ty1*>(
264-
static_cast<const _Ty1*>(::__std_find_first_of_trivial_2(_First1, _Last1, _First2, _Last2)));
265-
} else if constexpr (sizeof(_Ty1) == 4) {
266-
return const_cast<_Ty1*>(
267-
static_cast<const _Ty1*>(::__std_find_first_of_trivial_4(_First1, _Last1, _First2, _Last2)));
268-
} else if constexpr (sizeof(_Ty1) == 8) {
269-
return const_cast<_Ty1*>(
270-
static_cast<const _Ty1*>(::__std_find_first_of_trivial_8(_First1, _Last1, _First2, _Last2)));
271-
} else {
272-
_STL_INTERNAL_STATIC_ASSERT(false); // unexpected size
273-
}
274-
}
275-
276246
template <class _Ty1, class _Ty2>
277247
_Ty1* _Search_vectorized(_Ty1* const _First1, _Ty1* const _Last1, _Ty2* const _First2, const size_t _Count2) noexcept {
278248
_STL_INTERNAL_STATIC_ASSERT(sizeof(_Ty1) == sizeof(_Ty2));

0 commit comments

Comments
 (0)