Commit 05c81ac

Docs: Spelling and links
1 parent 78d5da2 commit 05c81ac

File tree: 6 files changed, +52 -27 lines


.vscode/settings.json

Lines changed: 10 additions & 1 deletion
@@ -92,5 +92,14 @@
         "cuchar": "cpp",
         "hash_set": "cpp",
         "latch": "cpp"
-    }
+    },
+    "cSpell.words": [
+        "ashvardanian",
+        "CCCL",
+        "CUDA",
+        "Kahan",
+        "shfl",
+        "SPIR",
+        "STREQUAL"
+    ]
 }

README.md

Lines changed: 8 additions & 4 deletions
@@ -2,17 +2,21 @@
 
 ![Parallel Reductions Benchmark](https://github.com/ashvardanian/ashvardanian/blob/master/repositories/ParallelReductionsBenchmark.jpg?raw=true)
 
-One of the canonical examples when designing parallel algorithms is implementing parallel tree-like reductions or its special case of accumulating a bunch of numbers located in a continuous block of memory.
+One of the canonical examples when designing parallel algorithms is implementing parallel tree-like reductions, which is a special case of accumulating a bunch of numbers located in a continuous block of memory.
 In modern C++, most developers would call `std::accumulate(array.begin(), array.end(), 0)`, and in Python, it's just a `sum(array)`.
 Implementing those operations with high utilization in many-core systems is surprisingly non-trivial and depends heavily on the hardware architecture.
+Moreover, on arrays with billions of elements, the default `float` error mounts, and the results become inaccurate unless a [Kahan-like scheme](https://en.wikipedia.org/wiki/Kahan_summation_algorithm) is used.
+
 This repository contains several educational examples showcasing the performance differences between different solutions:
 
-- AVX2 single-threaded, but SIMD-parallel code.
+- Single-threaded but SIMD-accelerated code:
+  - SSE, AVX, AVX-512 on x86.
+  - 🔜 NEON and SVE on Arm.
 - OpenMP `reduction` clause.
 - Thrust with its `thrust::reduce`.
 - CUDA kernels with warp-reductions.
 - OpenCL kernels, eight of them.
-- Parallel STL `<algorithm>'s in GCC with Intel oneTBB.
+- Parallel STL `<algorithm>` in GCC with Intel oneTBB.
 
 Previously, it also compared ArrayFire, Halide, and Vulkan queues for SPIR-V kernels and SyCL.
 Examples were collected from early 2010s until 2019 and later updated in 2022.
@@ -94,12 +98,12 @@ std::reduce<par, f64>/min_time:10.000/real_time 3921280 ns 3916897
 std::reduce<par_unseq, f32>/min_time:10.000/real_time 3884794 ns 3864061 ns 3644 bytes/s=276.396G/s error,%=0
 std::reduce<par_unseq, f64>/min_time:10.000/real_time 3889332 ns 3866968 ns 3585 bytes/s=276.074G/s error,%=100
 openmp<f32>/min_time:10.000/real_time 5061544 ns 5043250 ns 2407 bytes/s=212.137G/s error,%=65.5651u
+sse<f32aligned>@threads/min_time:10.000/real_time 5986350 ns 5193690 ns 2343 bytes/s=179.365G/s error,%=1.25021
 avx2<f32>/min_time:10.000/real_time 110796474 ns 110794861 ns 127 bytes/s=9.69112G/s error,%=50
 avx2<f32kahan>/min_time:10.000/real_time 134144762 ns 134137771 ns 105 bytes/s=8.00435G/s error,%=0
 avx2<f64>/min_time:10.000/real_time 115791797 ns 115790878 ns 121 bytes/s=9.27304G/s error,%=0
 avx2<f32aligned>@threads/min_time:10.000/real_time 5958283 ns 5041060 ns 2358 bytes/s=180.21G/s error,%=1.25033
 avx2<f64>@threads/min_time:10.000/real_time 5996481 ns 5123440 ns 2337 bytes/s=179.062G/s error,%=1.25001
-sse<f32aligned>@threads/min_time:10.000/real_time 5986350 ns 5193690 ns 2343 bytes/s=179.365G/s error,%=1.25021
 cub@cuda/min_time:10.000/real_time 356488 ns 356482 ns 39315 bytes/s=3.012T/s error,%=0
 warps@cuda/min_time:10.000/real_time 486387 ns 486377 ns 28788 bytes/s=2.20759T/s error,%=0
 thrust@cuda/min_time:10.000/real_time 500941 ns 500919 ns 27512 bytes/s=2.14345T/s error,%=0
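
The Kahan-compensated accumulation the README now links to only needs one extra float per accumulator. As a rough scalar illustration of the idea (not the repository's `avx2<f32kahan>` kernel, which presumably vectorizes the same recurrence):

```cpp
#include <cstddef>

// Minimal scalar Kahan summation: `compensation` carries the low-order bits
// that are rounded away when a small element is added to a large running sum.
inline float kahan_sum(float const *data, std::size_t count) {
    float sum = 0.f, compensation = 0.f;
    for (std::size_t i = 0; i != count; ++i) {
        float corrected = data[i] - compensation; // re-inject previously lost bits
        float next = sum + corrected;             // rounding happens here...
        compensation = (next - sum) - corrected;  // ...and is measured here
        sum = next;
    }
    return sum;
}
```

Aggressive floating-point flags like `-ffast-math` can legally optimize the compensation away, so Kahan-style kernels are normally built with strict FP semantics.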

reduce_bench.cpp

Lines changed: 6 additions & 3 deletions
@@ -90,22 +90,25 @@ int main(int argc, char **argv) {
     bm::RegisterBenchmark("std::accumulate<f64>", &make<stl_accumulate_gt<double>>)->MinTime(10)->UseRealTime();
     bm::RegisterBenchmark("std::reduce<par, f32>", &make<stl_par_reduce_gt<float>>)->MinTime(10)->UseRealTime();
     bm::RegisterBenchmark("std::reduce<par, f64>", &make<stl_par_reduce_gt<double>>)->MinTime(10)->UseRealTime();
-    bm::RegisterBenchmark("std::reduce<par_unseq, f32>", &make<stl_parunseq_reduce_gt<float>>)
+    bm::RegisterBenchmark("std::reduce<par_unseq, f32>", &make<stl_par_unseq_reduce_gt<float>>)
         ->MinTime(10)
         ->UseRealTime();
-    bm::RegisterBenchmark("std::reduce<par_unseq, f64>", &make<stl_parunseq_reduce_gt<double>>)
+    bm::RegisterBenchmark("std::reduce<par_unseq, f64>", &make<stl_par_unseq_reduce_gt<double>>)
         ->MinTime(10)
         ->UseRealTime();
     bm::RegisterBenchmark("openmp<f32>", &make<openmp_t>)->MinTime(10)->UseRealTime();
 
+    // x86 SSE
+#if defined(__SSE__)
+    bm::RegisterBenchmark("sse<f32aligned>@threads", &make<threads_gt<sse_f32aligned_t>>)->MinTime(10)->UseRealTime();
+#endif
     // x86 AVX2
 #if defined(__AVX2__)
     bm::RegisterBenchmark("avx2<f32>", &make<avx2_f32_t>)->MinTime(10)->UseRealTime();
     bm::RegisterBenchmark("avx2<f32kahan>", &make<avx2_f32kahan_t>)->MinTime(10)->UseRealTime();
     bm::RegisterBenchmark("avx2<f64>", &make<avx2_f64_t>)->MinTime(10)->UseRealTime();
     bm::RegisterBenchmark("avx2<f32aligned>@threads", &make<threads_gt<avx2_f32aligned_t>>)->MinTime(10)->UseRealTime();
     bm::RegisterBenchmark("avx2<f64>@threads", &make<threads_gt<avx2_f64_t>>)->MinTime(10)->UseRealTime();
-    bm::RegisterBenchmark("sse<f32aligned>@threads", &make<threads_gt<sse_f32aligned_t>>)->MinTime(10)->UseRealTime();
 #endif
     // x86 AVX-512
 #if defined(__AVX512F__)
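
For context, `bm` here is Google Benchmark's `benchmark` namespace, and each reduction functor is adapted into a timed routine. The `make<...>` helper itself is not part of this diff, so the adapter below is only a guess at its shape; `serial_t` and the dataset size are invented for illustration:

```cpp
#include <benchmark/benchmark.h>

#include <numeric>
#include <vector>

// Hypothetical adapter in the spirit of `make<...>`: builds the engine once,
// then times repeated calls to its `operator()`.
template <typename engine_at> void run(benchmark::State &state) {
    std::vector<float> data(1 << 20, 1.f); // stand-in dataset
    engine_at engine{data.data(), data.data() + data.size()};
    for (auto _ : state)
        benchmark::DoNotOptimize(engine());
}

// Trivial reference engine with the same two-pointer construction shape.
struct serial_t {
    float const *begin_, *end_;
    float operator()() const noexcept { return std::accumulate(begin_, end_, 0.f); }
};

int main(int argc, char **argv) {
    benchmark::RegisterBenchmark("serial<f32>", &run<serial_t>)->MinTime(10)->UseRealTime();
    benchmark::Initialize(&argc, argv);
    benchmark::RunSpecifiedBenchmarks();
    return 0;
}
```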

reduce_cpu.hpp

Lines changed: 15 additions & 8 deletions
@@ -62,7 +62,8 @@ template <typename accumulator_at = float> struct stl_accumulate_gt {
     accumulator_at operator()() const noexcept { return std::accumulate(begin_, end_, accumulator_at(0)); }
 };
 
-/// Computes the sum of a sequence of float values using parallel std::reduce with execution policy std::execution::par.
+/// Computes the sum of a sequence of float values using parallel `std::reduce` with execution
+/// policy @b `std::execution::par`.
 template <typename accumulator_at = float> struct stl_par_reduce_gt {
     float const *const begin_ = nullptr;
     float const *const end_ = nullptr;
@@ -72,9 +73,9 @@ template <typename accumulator_at = float> struct stl_par_reduce_gt {
     }
 };
 
-/// Computes the sum of a sequence of float values using parallel std::reduce with execution policy
-/// std::execution::par_unseq for non-blocking parallelism.
-template <typename accumulator_at = float> struct stl_parunseq_reduce_gt {
+/// Computes the sum of a sequence of float values using parallel `std::reduce` with execution
+/// policy @b `std::execution::par_unseq` for non-blocking parallelism.
+template <typename accumulator_at = float> struct stl_par_unseq_reduce_gt {
     float const *const begin_ = nullptr;
     float const *const end_ = nullptr;
 
@@ -83,6 +84,8 @@ template <typename accumulator_at = float> struct stl_parunseq_reduce_gt {
     }
 };
 
+#if defined(__SSE__)
+
 /// Computes the sum of a sequence of float values using SIMD @b SSE intrinsics,
 /// processing 128 bits of data on every logic thread.
 struct sse_f32aligned_t {
@@ -104,6 +107,8 @@
     }
 };
 
+#endif
+
 #if defined(__AVX2__)
 
 /// Reduces a __m256 vector to a single float by horizontal addition.
@@ -301,10 +306,12 @@ struct avx512_f32unrolled_t {
         fwd1 = _mm512_add_ps(fwd1, _mm512_castsi512_ps(_mm512_stream_load_si512((void *)(it_begin))));
 
         // Combine the accumulators
-        __m512 fwd = _mm512_add_ps(_mm512_add_ps(_mm512_add_ps(fwd0, fwd1), _mm512_add_ps(fwd2, fwd3)),
-                                   _mm512_add_ps(_mm512_add_ps(fwd4, fwd5), _mm512_add_ps(fwd5, fwd7)));
-        __m512 rev = _mm512_add_ps(_mm512_add_ps(_mm512_add_ps(rev0, rev1), _mm512_add_ps(rev2, rev3)),
-                                   _mm512_add_ps(_mm512_add_ps(rev4, rev5), _mm512_add_ps(rev5, rev7)));
+        __m512 fwd = _mm512_add_ps( //
+            _mm512_add_ps(_mm512_add_ps(fwd0, fwd1), _mm512_add_ps(fwd2, fwd3)),
+            _mm512_add_ps(_mm512_add_ps(fwd4, fwd5), _mm512_add_ps(fwd5, fwd7)));
+        __m512 rev = _mm512_add_ps( //
+            _mm512_add_ps(_mm512_add_ps(rev0, rev1), _mm512_add_ps(rev2, rev3)),
+            _mm512_add_ps(_mm512_add_ps(rev4, rev5), _mm512_add_ps(rev5, rev7)));
         __m512 acc = _mm512_add_ps(fwd, rev);
         float sum = _mm512_reduce_add_ps(acc);
         while (it_begin < it_end)
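
The body of `sse_f32aligned_t` sits outside these hunks. As a sketch of what a 128-bit SSE accumulation with a final horizontal fold generally looks like (assuming a 16-byte-aligned pointer, a length that is a multiple of 4, and a single thread; not the repository's exact kernel):

```cpp
#include <xmmintrin.h>

#include <cstddef>

// Four running partial sums live in one XMM register; a shuffle + movehl
// sequence folds them into a single float at the end.
inline float sse_sum_aligned(float const *data, std::size_t count) {
    __m128 running = _mm_setzero_ps();
    for (std::size_t i = 0; i != count; i += 4)
        running = _mm_add_ps(running, _mm_load_ps(data + i)); // aligned 128-bit load
    __m128 shuffled = _mm_shuffle_ps(running, running, _MM_SHUFFLE(2, 3, 0, 1));
    __m128 pairs = _mm_add_ps(running, shuffled);  // {a+b, a+b, c+d, c+d}
    __m128 high = _mm_movehl_ps(shuffled, pairs);  // bring c+d down to lane 0
    return _mm_cvtss_f32(_mm_add_ss(pairs, high)); // (a+b) + (c+d)
}
```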

reduce_cuda.hpp

Lines changed: 11 additions & 9 deletions
@@ -15,15 +15,15 @@ namespace ashvardanian::reduce {
 /// Base class for CUDA-based reductions.
 struct cuda_base_t {
     static constexpr int max_block_size_k = 1024;
-    static constexpr int threads = 512;
+    static constexpr int threads_k = 512;
 
     int blocks = max_block_size_k;
     thrust::device_vector<float> gpu_inputs;
     thrust::device_vector<float> gpu_partial_sums;
     thrust::host_vector<float> cpu_partial_sums;
 
     cuda_base_t(float const *b, float const *e)
-        : blocks(std::min<int>(((e - b) + threads - 1) / threads, max_block_size_k)), gpu_inputs(b, e),
+        : blocks(std::min<int>(((e - b) + threads_k - 1) / threads_k, max_block_size_k)), gpu_inputs(b, e),
           gpu_partial_sums(max_block_size_k), cpu_partial_sums(max_block_size_k) {}
 };
 
@@ -59,14 +59,14 @@ struct cuda_blocks_t : public cuda_base_t {
     float operator()() {
 
         // Accumulate partial results...
-        int shared_memory = threads * sizeof(float);
-        cu_reduce_blocks<<<blocks, threads, shared_memory>>>(gpu_inputs.data().get(), gpu_inputs.size(),
-                                                             gpu_partial_sums.data().get());
+        int shared_memory = threads_k * sizeof(float);
+        cu_reduce_blocks<<<blocks, threads_k, shared_memory>>>( //
+            gpu_inputs.data().get(), gpu_inputs.size(), gpu_partial_sums.data().get());
 
         // Then reduce them further to inputs single scalar
         shared_memory = max_block_size_k * sizeof(float);
-        cu_reduce_blocks<<<1, max_block_size_k, shared_memory>>>(gpu_partial_sums.data().get(), blocks,
-                                                                 gpu_partial_sums.data().get());
+        cu_reduce_blocks<<<1, max_block_size_k, shared_memory>>>( //
+            gpu_partial_sums.data().get(), blocks, gpu_partial_sums.data().get());
 
         // Sync all queues and fetch results
         cudaDeviceSynchronize();
@@ -128,10 +128,12 @@ struct cuda_warps_t : public cuda_base_t {
     float operator()() {
 
         // Accumulate partial results...
-        cu_reduce_warps<<<blocks, threads>>>(gpu_inputs.data().get(), gpu_inputs.size(), gpu_partial_sums.data().get());
+        cu_reduce_warps<<<blocks, threads_k>>>( //
+            gpu_inputs.data().get(), gpu_inputs.size(), gpu_partial_sums.data().get());
 
         // Then reduce them further to inputs single scalar
-        cu_reduce_warps<<<1, max_block_size_k>>>(gpu_partial_sums.data().get(), blocks, gpu_partial_sums.data().get());
+        cu_reduce_warps<<<1, max_block_size_k>>>( //
+            gpu_partial_sums.data().get(), blocks, gpu_partial_sums.data().get());
 
         // Sync all queues and fetch results
         cudaDeviceSynchronize();
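
The `cu_reduce_warps` kernel these launches call is not part of the diff. A generic warp-shuffle reduction, which is presumably what the `warps@cuda` benchmark and the new `shfl` dictionary entry refer to, typically looks like this (a sketch assuming a block size that is a multiple of 32 and a `result` zeroed before launch, not the repository's exact kernel):

```cuda
// Each thread folds a grid-strided slice into `partial`, each warp folds its
// 32 partials with `__shfl_down_sync`, and lane 0 of every warp publishes its
// result with one atomic.
__global__ void warp_reduce_sum(float const *inputs, int count, float *result) {
    float partial = 0.f;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += gridDim.x * blockDim.x)
        partial += inputs[i];

    for (int offset = warpSize / 2; offset > 0; offset /= 2)
        partial += __shfl_down_sync(0xFFFFFFFFu, partial, offset); // halve active lanes each step

    if ((threadIdx.x & (warpSize - 1)) == 0)
        atomicAdd(result, partial);
}
```

The launch shape would mirror the `<<<blocks, threads_k>>>` configuration above.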

reduce_opencl.cl

Lines changed: 2 additions & 2 deletions
@@ -1,11 +1,11 @@
 // Project: SandboxGPUs.
-// Author: Ashot Vardanian.
+// Author: Ash Vardanian.
 // Created: 04/09/2019.
 // Copyright: Check "License" file.
 //
 
 /**
- * Most of the algorithms here have follwong properties:
+ * Most of the algorithms here have following properties:
  * - takes log(n) steps for n input elements,
  * - uses n threads,
  * - only works for power-of-2 arrays.
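
The "log(n) steps, n threads, power-of-2 arrays" properties listed in that header describe the classic halving tree. Expressed as a CUDA-style kernel for comparison with the ones above (an analogue of the pattern, assuming the input exactly covers grid × block threads; not the repository's OpenCL code):

```cuda
// Shared-memory tree reduction: every step halves the number of active
// threads, so a power-of-2 block of n threads needs log2(n) steps and one
// barrier per step. Launch with `blockDim.x * sizeof(float)` dynamic shared memory.
__global__ void block_tree_reduce(float const *inputs, float *block_sums) {
    extern __shared__ float cache[];
    unsigned tid = threadIdx.x;
    cache[tid] = inputs[blockIdx.x * blockDim.x + tid];
    __syncthreads();

    for (unsigned stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride)
            cache[tid] += cache[tid + stride];
        __syncthreads();
    }
    if (tid == 0)
        block_sums[blockIdx.x] = cache[0]; // one partial sum per block
}
```

A second pass over `block_sums`, like the single-block `cu_reduce_blocks<<<1, max_block_size_k, shared_memory>>>` launch in the CUDA diff above, finishes the reduction.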
