@@ -340,14 +340,36 @@ BENCHMARK(i32_addition_randomly_initialized)->Threads(physical_cores());
340340 */
341341#include < algorithm> // `std::sort`
342342#include < numeric> // `std::iota`
343- #include < vector> // `std::vector`
343+
/**
 * @brief A minimalistic `std::vector` replacement, wrapping an aligned
 *        allocation similar to `std::unique_ptr`.
 * @see https://stackoverflow.com/a/79363156/2766161
 *
 * Unlike `std::vector`, elements are NOT value-initialized and element
 * destructors are never invoked — only use with trivially-destructible
 * types, and fill the buffer before reading it (all benchmarks below do).
 */
template <typename type_>
class aligned_array {

    type_ *data_ = nullptr;
    std::size_t size_ = 0;

  public:
    /**
     * @param size Number of elements to allocate.
     * @param alignment Byte alignment of the buffer; must be a power of two.
     *        Zero falls back to the 64-byte default, guarding accidental
     *        `array(n, 0)` calls that mimic `std::vector`'s fill-value ctor.
     * @throws std::bad_alloc If the underlying allocation fails.
     */
    aligned_array(std::size_t size, std::size_t alignment = 64) : size_(size) {
        if (alignment == 0) alignment = 64;
        // `std::aligned_alloc` requires the byte count to be an integral
        // multiple of `alignment` — round up to satisfy that contract.
        std::size_t bytes = sizeof(type_) * size_;
        bytes = (bytes + alignment - 1) / alignment * alignment;
        data_ = static_cast<type_ *>(std::aligned_alloc(alignment, bytes));
        if (!data_) throw std::bad_alloc();
    }
    ~aligned_array() noexcept { std::free(data_); }

    // Owns a raw allocation: copying would double-free, so allow moves only.
    aligned_array(aligned_array const &) = delete;
    aligned_array &operator=(aligned_array const &) = delete;
    aligned_array(aligned_array &&other) noexcept : data_(other.data_), size_(other.size_) {
        other.data_ = nullptr;
        other.size_ = 0;
    }
    aligned_array &operator=(aligned_array &&other) noexcept {
        if (this != &other) {
            std::free(data_);
            data_ = other.data_;
            size_ = other.size_;
            other.data_ = nullptr;
            other.size_ = 0;
        }
        return *this;
    }

    type_ *data() const noexcept { return data_; }
    std::size_t size() const noexcept { return size_; }
    type_ *begin() const noexcept { return data_; }
    type_ *end() const noexcept { return data_ + size_; }
    type_ &operator[](std::size_t index) noexcept { return data_[index]; }
    type_ operator[](std::size_t index) const noexcept { return data_[index]; }
};
344366
345367static void sorting (bm::State &state) {
346368
347369 auto length = static_cast <std::size_t >(state.range (0 ));
348370 auto include_preprocessing = static_cast <bool >(state.range (1 ));
349371
350- std::vector <std::uint32_t > array (length);
372+ aligned_array <std::uint32_t > array (length);
351373 std::iota (array.begin (), array.end (), 1u );
352374
353375 for (auto _ : state) {
@@ -356,9 +378,7 @@ static void sorting(bm::State &state) {
356378 // Reverse order is the most classical worst case, but not the only one.
357379 std::reverse (array.begin (), array.end ());
358380 if (!include_preprocessing) state.ResumeTiming ();
359-
360381 std::sort (array.begin (), array.end ());
361- bm::DoNotOptimize (array.size ());
362382 }
363383
364384 if (!std::is_sorted (array.begin (), array.end ())) state.SkipWithError (" Array is not sorted!" );
@@ -393,13 +413,12 @@ static void sorting_with_executors( //
393413 bm::State &state, execution_policy_ &&policy) {
394414
395415 auto length = static_cast <std::size_t >(state.range (0 ));
396- std::vector <std::uint32_t > array (length);
416+ aligned_array <std::uint32_t > array (length);
397417 std::iota (array.begin (), array.end (), 1u );
398418
399419 for (auto _ : state) {
400420 std::reverse (policy, array.begin (), array.end ());
401421 std::sort (policy, array.begin (), array.end ());
402- bm::DoNotOptimize (array.size ());
403422 }
404423
405424 if (!std::is_sorted (array.begin (), array.end ())) state.SkipWithError (" Array is not sorted!" );
@@ -498,7 +517,7 @@ static void sorting_with_openmp(bm::State &state) {
498517 return offset < length ? offset : length;
499518 };
500519
501- std::vector <std::uint32_t > array (length);
520+ aligned_array <std::uint32_t > array (length);
502521 std::iota (array.begin (), array.end (), 1u );
503522
504523 for (auto _ : state) {
@@ -528,8 +547,6 @@ static void sorting_with_openmp(bm::State &state) {
528547 std::inplace_merge (array.begin () + start, array.begin () + mid, array.begin () + finish);
529548 }
530549 }
531-
532- bm::DoNotOptimize (array.size ());
533550 }
534551
535552 if (!std::is_sorted (array.begin (), array.end ())) state.SkipWithError (" Array is not sorted!" );
@@ -621,6 +638,7 @@ struct quick_sort_recurse {
621638 * with additional bookkeeping. In our logic we never need to pop from the middle
622639 * or from the front, so a `std::vector` is a better choice.
623640 */
641+ #include < vector> // `std::vector`
624642
625643template <typename element_type_>
626644struct quick_sort_iterate {
@@ -663,10 +681,10 @@ template <typename sorter_type_, std::size_t length_> //
663681static void recursion_cost (bm::State &state) {
664682 using element_t = typename sorter_type_::element_t ;
665683 sorter_type_ sorter;
666- std::vector <element_t > array (length_);
684+ aligned_array <element_t > array (length_);
667685 for (auto _ : state) {
668686 for (std::size_t i = 0 ; i != length_; ++i) array[i] = length_ - i;
669- sorter (array.data (), 0 , static_cast <std::ptrdiff_t >(length_ - 1 ));
687+ sorter (array.begin (), 0 , static_cast <std::ptrdiff_t >(length_ - 1 ));
670688 }
671689
672690 if (!std::is_sorted (array.begin (), array.end ())) state.SkipWithError (" Array is not sorted!" );
@@ -720,8 +738,8 @@ BENCHMARK_TEMPLATE(recursion_cost, iterative_sort_i32s, 4096);
720738 */
721739static void branch_cost (bm::State &state) {
722740 auto count = static_cast <std::size_t >(state.range (0 ));
723- std::vector <std::int32_t > random_values (count);
724- std::generate_n (random_values.begin (), random_values. size () , &std::rand);
741+ aligned_array <std::int32_t > random_values (count);
742+ std::generate_n (random_values.begin (), count , &std::rand);
725743 std::int32_t variable = 0 ;
726744 std::size_t iteration = 0 ;
727745
@@ -764,16 +782,16 @@ static void cache_misses_cost(bm::State &state) {
764782 auto count = static_cast <std::uint32_t >(state.range (0 ));
765783
766784 // Populate with arbitrary data
767- std::vector <std::int32_t > data (count);
785+ aligned_array <std::int32_t > data (count);
768786 std::iota (data.begin (), data.end (), 0 );
769787
770788 // Initialize different access orders
771- std::vector <std::uint32_t > indices (count);
789+ aligned_array <std::uint32_t > indices (count);
772790 if constexpr (access_order_ == access_order_t ::random) {
773791 std::random_device random_device;
774792 std::mt19937 generator (random_device ());
775793 std::uniform_int_distribution<std::uint32_t > uniform_distribution (0 , count - 1 );
776- std::generate_n (indices.begin (), indices.size (), [&] { return uniform_distribution (generator); });
794+ std::generate (indices.begin (), indices.end (), [&] { return uniform_distribution (generator); });
777795 }
778796 else { std::iota (indices.begin (), indices.end (), 0u ); }
779797
@@ -1928,13 +1946,6 @@ class strided_ptr {
19281946 // clang-format on
19291947};
19301948
1931- template <typename type_>
1932- std::unique_ptr<type_[], decltype (&std::free)> make_aligned_array (std::size_t size, std::size_t alignment) {
1933- type_ *raw_ptr = static_cast <type_ *>(std::aligned_alloc (alignment, sizeof (type_) * size));
1934- if (!raw_ptr) throw std::bad_alloc ();
1935- return std::unique_ptr<type_[], decltype (&std::free)>(raw_ptr, &std::free);
1936- }
1937-
19381949#if defined(__aarch64__)
19391950/* *
19401951 * @brief Helper derived from `__aarch64_sync_cache_range` in `libgcc`, used to
@@ -1959,8 +1970,8 @@ static void memory_access(bm::State &state) {
19591970 // memory accesses may suffer from the same issues. For split-loads, pad our
19601971 // buffer with an extra `cache_line_width` bytes of space.
19611972 std::size_t const buffer_size = typical_l2_size + cache_line_width;
1962- auto const buffer = make_aligned_array <std::byte>(buffer_size, cache_line_width);
1963- std::byte *const buffer_ptr = buffer.get ();
1973+ aligned_array <std::byte> buffer (buffer_size, cache_line_width);
1974+ std::byte *const buffer_ptr = buffer.begin ();
19641975
19651976 // Let's initialize a strided range using out `strided_ptr` template, but
19661977 // for `alignment_mode_t::unaligned_k` make sure that the scalar-of-interest in each
@@ -2050,16 +2061,16 @@ template <typename kernel_type_>
20502061static void spread_memory (bm::State &state, kernel_type_ kernel, std::size_t align = sizeof (spread_data_t )) {
20512062
20522063 std::size_t const size = static_cast <std::size_t >(state.range (0 ));
2053- auto indices = make_aligned_array <spread_index_t >(size, align);
2054- auto first = make_aligned_array <spread_data_t >(size, align);
2055- auto second = make_aligned_array <spread_data_t >(size, align);
2064+ aligned_array <spread_index_t > indices (size, align);
2065+ aligned_array <spread_data_t > first (size, align);
2066+ aligned_array <spread_data_t > second (size, align);
20562067
2057- std::iota (indices.get (), indices.get () + size, 0 );
2068+ std::iota (indices.begin (), indices.begin () + size, 0 );
20582069 std::random_device random_device;
20592070 std::mt19937 generator (random_device ());
2060- std::shuffle (indices.get (), indices.get () + size, generator);
2071+ std::shuffle (indices.begin (), indices.begin () + size, generator);
20612072
2062- for (auto _ : state) kernel (first.get (), indices.get (), second.get (), size);
2073+ for (auto _ : state) kernel (first.begin (), indices.begin (), second.begin (), size);
20632074}
20642075
20652076BENCHMARK_CAPTURE (spread_memory, gather_scalar, spread_gather_scalar)->Range(1 << 10 , 1 << 20 )->MinTime(5 );
@@ -2191,20 +2202,20 @@ static void cblas_tops(bm::State &state) {
21912202 int const lda = static_cast <int >(n), ldb = static_cast <int >(n), ldc = static_cast <int >(n);
21922203
21932204 // Allocate and initialize data
2194- std::vector <scalar_type_> a (n * n), b (n * n), c (n * n, 0 );
2205+ aligned_array <scalar_type_> a (n * n), b (n * n), c (n * n, 0 );
21952206 std::iota (a.begin (), a.end (), 0 );
21962207 std::iota (b.begin (), b.end (), 0 );
21972208
21982209 // BLAS defines GEMM routines as: alpha * a * b + beta * c
21992210 for (auto _ : state)
22002211 if constexpr (std::is_same_v<scalar_type_, float >)
22012212 cblas_sgemm (CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, //
2202- /* alpha: */ 1 , a.data (), lda, b.data (), ldb, //
2203- /* beta: */ 0 , c.data (), ldc);
2213+ /* alpha: */ 1 , a.begin (), lda, b.begin (), ldb, //
2214+ /* beta: */ 0 , c.begin (), ldc);
22042215 else
22052216 cblas_dgemm (CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, //
2206- /* alpha: */ 1 , a.data (), lda, b.data (), ldb, //
2207- /* beta: */ 0 , c.data (), ldc);
2217+ /* alpha: */ 1 , a.begin (), lda, b.begin (), ldb, //
2218+ /* beta: */ 0 , c.begin (), ldc);
22082219
22092220 std::size_t tops_per_cycle = n * n * (n /* multiplications */ + (n - 1 ) /* additions */ );
22102221 state.counters [" TOP" ] = bm::Counter (state.iterations () * tops_per_cycle, bm::Counter::kIsRate );
0 commit comments