Commit 1539624

Improve: Use aligned_array in more places

1 parent 53f9e71

1 file changed: less_slow.cpp (+47, -36 lines)
@@ -340,14 +340,36 @@ BENCHMARK(i32_addition_randomly_initialized)->Threads(physical_cores());
  */
 #include <algorithm> // `std::sort`
 #include <numeric>   // `std::iota`
-#include <vector>    // `std::vector`
+
+/**
+ *  @brief  A minimalistic `std::vector` replacement, wrapping an aligned
+ *          allocation similar to `std::unique_ptr`.
+ *  @see    https://stackoverflow.com/a/79363156/2766161
+ */
+template <typename type_>
+class aligned_array {
+
+    type_ *data_ = nullptr;
+    std::size_t size_ = 0;
+
+  public:
+    aligned_array(std::size_t size, std::size_t alignment = 64) : size_(size) {
+        data_ = static_cast<type_ *>(std::aligned_alloc(alignment, sizeof(type_) * size_));
+        if (!data_) throw std::bad_alloc();
+    }
+    ~aligned_array() noexcept { std::free(data_); }
+    type_ *begin() const noexcept { return data_; }
+    type_ *end() const noexcept { return data_ + size_; }
+    type_ &operator[](std::size_t index) noexcept { return data_[index]; }
+    type_ operator[](std::size_t index) const noexcept { return data_[index]; }
+};
 
 static void sorting(bm::State &state) {
 
     auto length = static_cast<std::size_t>(state.range(0));
     auto include_preprocessing = static_cast<bool>(state.range(1));
 
-    std::vector<std::uint32_t> array(length);
+    aligned_array<std::uint32_t> array(length);
     std::iota(array.begin(), array.end(), 1u);
 
     for (auto _ : state) {
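
A note on the new wrapper: unlike `std::vector`, `aligned_array` leaves its elements uninitialized, and as written it keeps the compiler-generated copy constructor, so copying an instance would double-free; the benchmarks below only ever construct it in place. Also, `std::aligned_alloc` formally expects the allocation size to be an integral multiple of the alignment, which the constructor doesn't enforce. A minimal usage sketch, assuming `<cstdlib>`, `<new>`, and `<numeric>` are in scope:

    aligned_array<std::uint32_t> array(1024);  // 64-byte aligned, elements uninitialized
    std::iota(array.begin(), array.end(), 1u); // fill explicitly: 1, 2, 3, ...
    std::sort(array.begin(), array.end());     // raw pointers are valid random-access iterators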
@@ -356,9 +378,7 @@ static void sorting(bm::State &state) {
         // Reverse order is the most classical worst case, but not the only one.
         std::reverse(array.begin(), array.end());
         if (!include_preprocessing) state.ResumeTiming();
-
         std::sort(array.begin(), array.end());
-        bm::DoNotOptimize(array.size());
     }
 
     if (!std::is_sorted(array.begin(), array.end())) state.SkipWithError("Array is not sorted!");
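
Why `bm::DoNotOptimize(array.size())` disappears here: the wrapper has no `size()` accessor, and the post-loop `std::is_sorted` check already forces the sorted data to stay observable. If an explicit optimization barrier is still wanted, the pointer serves the same purpose; a sketch, not part of this commit:

    std::sort(array.begin(), array.end());
    bm::DoNotOptimize(array.begin()); // publish the pointer instead of a `size()` call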
@@ -393,13 +413,12 @@ static void sorting_with_executors( //
     bm::State &state, execution_policy_ &&policy) {
 
     auto length = static_cast<std::size_t>(state.range(0));
-    std::vector<std::uint32_t> array(length);
+    aligned_array<std::uint32_t> array(length);
     std::iota(array.begin(), array.end(), 1u);
 
     for (auto _ : state) {
         std::reverse(policy, array.begin(), array.end());
         std::sort(policy, array.begin(), array.end());
-        bm::DoNotOptimize(array.size());
     }
 
     if (!std::is_sorted(array.begin(), array.end())) state.SkipWithError("Array is not sorted!");
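
The parallel overloads used here only require random-access iterators, and the raw pointers returned by `aligned_array::begin()` and `end()` qualify, so the `policy`-based calls compile unchanged. A sketch of how such a benchmark is typically registered with Google Benchmark, assuming `<execution>` is included and a parallel STL backend is linked; the exact ranges in `less_slow.cpp` may differ:

    BENCHMARK_CAPTURE(sorting_with_executors, par_unseq, std::execution::par_unseq)
        ->Range(1 << 20, 1 << 28);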
@@ -498,7 +517,7 @@ static void sorting_with_openmp(bm::State &state) {
         return offset < length ? offset : length;
     };
 
-    std::vector<std::uint32_t> array(length);
+    aligned_array<std::uint32_t> array(length);
     std::iota(array.begin(), array.end(), 1u);
 
     for (auto _ : state) {
@@ -528,8 +547,6 @@ static void sorting_with_openmp(bm::State &state) {
                 std::inplace_merge(array.begin() + start, array.begin() + mid, array.begin() + finish);
             }
         }
-
-        bm::DoNotOptimize(array.size());
     }
 
     if (!std::is_sorted(array.begin(), array.end())) state.SkipWithError("Array is not sorted!");
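
For context, the `std::inplace_merge` above is the tail of a bottom-up merge phase: independently sorted chunks are pairwise merged with doubling widths until the whole array is ordered. A minimal sketch of that pattern, with `chunk` as an assumed name for the initial sorted-run length, not the file's exact code:

    for (std::size_t width = chunk; width < length; width *= 2) {
    #pragma omp parallel for
        for (std::size_t start = 0; start < length; start += 2 * width) {
            std::size_t const mid = std::min(start + width, length);
            std::size_t const finish = std::min(start + 2 * width, length);
            std::inplace_merge(array.begin() + start, array.begin() + mid, array.begin() + finish);
        }
    }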
@@ -621,6 +638,7 @@ struct quick_sort_recurse {
  *  with additional bookkeeping. In our logic we never need to pop from the middle
  *  or from the front, so a `std::vector` is a better choice.
  */
+#include <vector> // `std::vector`
 
 template <typename element_type_>
 struct quick_sort_iterate {
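
The comment above motivates `std::vector` as a pop-from-the-back work stack. A minimal sketch of the pattern that `quick_sort_iterate` presumably implements, with `partition_hoare` as a hypothetical helper returning the split point, not the file's exact code:

    std::vector<std::pair<std::ptrdiff_t, std::ptrdiff_t>> ranges;
    ranges.emplace_back(first, last);
    while (!ranges.empty()) {
        auto [low, high] = ranges.back();
        ranges.pop_back(); // only ever pop from the back, in LIFO order
        if (low >= high) continue;
        std::ptrdiff_t pivot = partition_hoare(data, low, high); // hypothetical helper
        ranges.emplace_back(low, pivot);
        ranges.emplace_back(pivot + 1, high);
    }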
@@ -663,10 +681,10 @@ template <typename sorter_type_, std::size_t length_> //
 static void recursion_cost(bm::State &state) {
     using element_t = typename sorter_type_::element_t;
     sorter_type_ sorter;
-    std::vector<element_t> array(length_);
+    aligned_array<element_t> array(length_);
     for (auto _ : state) {
         for (std::size_t i = 0; i != length_; ++i) array[i] = length_ - i;
-        sorter(array.data(), 0, static_cast<std::ptrdiff_t>(length_ - 1));
+        sorter(array.begin(), 0, static_cast<std::ptrdiff_t>(length_ - 1));
     }
 
     if (!std::is_sorted(array.begin(), array.end())) state.SkipWithError("Array is not sorted!");
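
Note the call-site change from `array.data()` to `array.begin()`: the wrapper exposes only `begin`, `end`, and `operator[]`, which is also why the next hunks trade `.size()` calls for explicit counts. If a fuller `std::vector`-like surface were ever needed, two one-liners inside the class would restore it; a hypothetical extension, not part of this commit:

    type_ *data() const noexcept { return data_; }
    std::size_t size() const noexcept { return size_; }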
@@ -720,8 +738,8 @@ BENCHMARK_TEMPLATE(recursion_cost, iterative_sort_i32s, 4096);
  */
 static void branch_cost(bm::State &state) {
     auto count = static_cast<std::size_t>(state.range(0));
-    std::vector<std::int32_t> random_values(count);
-    std::generate_n(random_values.begin(), random_values.size(), &std::rand);
+    aligned_array<std::int32_t> random_values(count);
+    std::generate_n(random_values.begin(), count, &std::rand);
     std::int32_t variable = 0;
     std::size_t iteration = 0;
 
@@ -764,16 +782,16 @@ static void cache_misses_cost(bm::State &state) {
     auto count = static_cast<std::uint32_t>(state.range(0));
 
     // Populate with arbitrary data
-    std::vector<std::int32_t> data(count);
+    aligned_array<std::int32_t> data(count);
     std::iota(data.begin(), data.end(), 0);
 
     // Initialize different access orders
-    std::vector<std::uint32_t> indices(count);
+    aligned_array<std::uint32_t> indices(count);
     if constexpr (access_order_ == access_order_t::random) {
         std::random_device random_device;
         std::mt19937 generator(random_device());
         std::uniform_int_distribution<std::uint32_t> uniform_distribution(0, count - 1);
-        std::generate_n(indices.begin(), indices.size(), [&] { return uniform_distribution(generator); });
+        std::generate(indices.begin(), indices.end(), [&] { return uniform_distribution(generator); });
     }
     else { std::iota(indices.begin(), indices.end(), 0u); }
 
@@ -1928,13 +1946,6 @@ class strided_ptr {
     // clang-format on
 };
 
-template <typename type_>
-std::unique_ptr<type_[], decltype(&std::free)> make_aligned_array(std::size_t size, std::size_t alignment) {
-    type_ *raw_ptr = static_cast<type_ *>(std::aligned_alloc(alignment, sizeof(type_) * size));
-    if (!raw_ptr) throw std::bad_alloc();
-    return std::unique_ptr<type_[], decltype(&std::free)>(raw_ptr, &std::free);
-}
-
 #if defined(__aarch64__)
 /**
  *  @brief  Helper derived from `__aarch64_sync_cache_range` in `libgcc`, used to
@@ -1959,8 +1970,8 @@ static void memory_access(bm::State &state) {
     // memory accesses may suffer from the same issues. For split-loads, pad our
     // buffer with an extra `cache_line_width` bytes of space.
     std::size_t const buffer_size = typical_l2_size + cache_line_width;
-    auto const buffer = make_aligned_array<std::byte>(buffer_size, cache_line_width);
-    std::byte *const buffer_ptr = buffer.get();
+    aligned_array<std::byte> buffer(buffer_size, cache_line_width);
+    std::byte *const buffer_ptr = buffer.begin();
 
     // Let's initialize a strided range using our `strided_ptr` template, but
     // for `alignment_mode_t::unaligned_k` make sure that the scalar-of-interest in each
@@ -2050,16 +2061,16 @@ template <typename kernel_type_>
 static void spread_memory(bm::State &state, kernel_type_ kernel, std::size_t align = sizeof(spread_data_t)) {
 
     std::size_t const size = static_cast<std::size_t>(state.range(0));
-    auto indices = make_aligned_array<spread_index_t>(size, align);
-    auto first = make_aligned_array<spread_data_t>(size, align);
-    auto second = make_aligned_array<spread_data_t>(size, align);
+    aligned_array<spread_index_t> indices(size, align);
+    aligned_array<spread_data_t> first(size, align);
+    aligned_array<spread_data_t> second(size, align);
 
-    std::iota(indices.get(), indices.get() + size, 0);
+    std::iota(indices.begin(), indices.begin() + size, 0);
     std::random_device random_device;
     std::mt19937 generator(random_device());
-    std::shuffle(indices.get(), indices.get() + size, generator);
+    std::shuffle(indices.begin(), indices.begin() + size, generator);
 
-    for (auto _ : state) kernel(first.get(), indices.get(), second.get(), size);
+    for (auto _ : state) kernel(first.begin(), indices.begin(), second.begin(), size);
 }
 
 BENCHMARK_CAPTURE(spread_memory, gather_scalar, spread_gather_scalar)->Range(1 << 10, 1 << 20)->MinTime(5);
@@ -2191,20 +2202,20 @@ static void cblas_tops(bm::State &state) {
     int const lda = static_cast<int>(n), ldb = static_cast<int>(n), ldc = static_cast<int>(n);
 
     // Allocate and initialize data
-    std::vector<scalar_type_> a(n * n), b(n * n), c(n * n, 0);
+    aligned_array<scalar_type_> a(n * n), b(n * n), c(n * n, 0);
     std::iota(a.begin(), a.end(), 0);
     std::iota(b.begin(), b.end(), 0);
 
     // BLAS defines GEMM routines as: alpha * a * b + beta * c
     for (auto _ : state)
         if constexpr (std::is_same_v<scalar_type_, float>)
             cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, //
-                        /* alpha: */ 1, a.data(), lda, b.data(), ldb,       //
-                        /* beta: */ 0, c.data(), ldc);
+                        /* alpha: */ 1, a.begin(), lda, b.begin(), ldb,     //
+                        /* beta: */ 0, c.begin(), ldc);
         else
             cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, //
-                        /* alpha: */ 1, a.data(), lda, b.data(), ldb,       //
-                        /* beta: */ 0, c.data(), ldc);
+                        /* alpha: */ 1, a.begin(), lda, b.begin(), ldb,     //
+                        /* beta: */ 0, c.begin(), ldc);
 
     std::size_t tops_per_cycle = n * n * (n /* multiplications */ + (n - 1) /* additions */);
     state.counters["TOP"] = bm::Counter(state.iterations() * tops_per_cycle, bm::Counter::kIsRate);
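
Two closing observations on this hunk. First, the operation count: each GEMM performs n * n * (n + (n - 1)) = 2n^3 - n^2 scalar operations; a compile-time sanity check of that arithmetic, as a sketch:

    constexpr std::size_t gemm_ops(std::size_t n) { return n * n * (n + (n - 1)); } // == 2n^3 - n^2
    static_assert(gemm_ops(1024) == 2'146'435'072); // ~2.15 GOP per 1024x1024 GEMM

Second, a caveat: `c(n * n, 0)` used to pick `std::vector`'s fill constructor, but now resolves to `aligned_array`'s `(size, alignment)` constructor with an alignment of 0, which `std::aligned_alloc` rejects, so the allocation would likely throw at runtime; and `c` is no longer zero-initialized, which is harmless here only because BLAS ignores the input `C` when `beta == 0`.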
