Commit 05c81ac

Docs: Spelling and links
1 parent 78d5da2 commit 05c81ac

File tree: 6 files changed, +52 -27 lines


.vscode/settings.json

Lines changed: 10 additions & 1 deletion
@@ -92,5 +92,14 @@
         "cuchar": "cpp",
         "hash_set": "cpp",
         "latch": "cpp"
-    }
+    },
+    "cSpell.words": [
+        "ashvardanian",
+        "CCCL",
+        "CUDA",
+        "Kahan",
+        "shfl",
+        "SPIR",
+        "STREQUAL"
+    ]
 }

README.md

Lines changed: 8 additions & 4 deletions
@@ -2,17 +2,21 @@
 
 ![Parallel Reductions Benchmark](https://github.com/ashvardanian/ashvardanian/blob/master/repositories/ParallelReductionsBenchmark.jpg?raw=true)
 
-One of the canonical examples when designing parallel algorithms is implementing parallel tree-like reductions or its special case of accumulating a bunch of numbers located in a continuous block of memory.
+One of the canonical examples when designing parallel algorithms is implementing parallel tree-like reductions, which is a special case of accumulating a bunch of numbers located in a continuous block of memory.
 In modern C++, most developers would call `std::accumulate(array.begin(), array.end(), 0)`, and in Python, it's just a `sum(array)`.
 Implementing those operations with high utilization in many-core systems is surprisingly non-trivial and depends heavily on the hardware architecture.
+Moreover, on arrays with billions of elements, the default `float` error mounts, and the results become inaccurate unless a [Kahan-like scheme](https://en.wikipedia.org/wiki/Kahan_summation_algorithm) is used.
+
 This repository contains several educational examples showcasing the performance differences between different solutions:
 
-- AVX2 single-threaded, but SIMD-parallel code.
+- Single-threaded but SIMD-accelerated code:
+  - SSE, AVX, AVX-512 on x86.
+  - 🔜 NEON and SVE on Arm.
 - OpenMP `reduction` clause.
 - Thrust with its `thrust::reduce`.
 - CUDA kernels with warp-reductions.
 - OpenCL kernels, eight of them.
-- Parallel STL `<algorithm>'s in GCC with Intel oneTBB.
+- Parallel STL `<algorithm>` in GCC with Intel oneTBB.
 
 Previously, it also compared ArrayFire, Halide, and Vulkan queues for SPIR-V kernels and SyCL.
 Examples were collected from early 2010s until 2019 and later updated in 2022.
@@ -94,12 +98,12 @@ std::reduce<par, f64>/min_time:10.000/real_time 3921280 ns 3916897
 std::reduce<par_unseq, f32>/min_time:10.000/real_time 3884794 ns 3864061 ns 3644 bytes/s=276.396G/s error,%=0
 std::reduce<par_unseq, f64>/min_time:10.000/real_time 3889332 ns 3866968 ns 3585 bytes/s=276.074G/s error,%=100
 openmp<f32>/min_time:10.000/real_time 5061544 ns 5043250 ns 2407 bytes/s=212.137G/s error,%=65.5651u
+sse<f32aligned>@threads/min_time:10.000/real_time 5986350 ns 5193690 ns 2343 bytes/s=179.365G/s error,%=1.25021
 avx2<f32>/min_time:10.000/real_time 110796474 ns 110794861 ns 127 bytes/s=9.69112G/s error,%=50
 avx2<f32kahan>/min_time:10.000/real_time 134144762 ns 134137771 ns 105 bytes/s=8.00435G/s error,%=0
 avx2<f64>/min_time:10.000/real_time 115791797 ns 115790878 ns 121 bytes/s=9.27304G/s error,%=0
 avx2<f32aligned>@threads/min_time:10.000/real_time 5958283 ns 5041060 ns 2358 bytes/s=180.21G/s error,%=1.25033
 avx2<f64>@threads/min_time:10.000/real_time 5996481 ns 5123440 ns 2337 bytes/s=179.062G/s error,%=1.25001
-sse<f32aligned>@threads/min_time:10.000/real_time 5986350 ns 5193690 ns 2343 bytes/s=179.365G/s error,%=1.25021
 cub@cuda/min_time:10.000/real_time 356488 ns 356482 ns 39315 bytes/s=3.012T/s error,%=0
 warps@cuda/min_time:10.000/real_time 486387 ns 486377 ns 28788 bytes/s=2.20759T/s error,%=0
 thrust@cuda/min_time:10.000/real_time 500941 ns 500919 ns 27512 bytes/s=2.14345T/s error,%=0
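
The Kahan-compensated accumulation the README now links to only needs one extra float per accumulator. As a rough scalar illustration of the idea (not the repository's `avx2<f32kahan>` kernel, which presumably vectorizes the same recurrence):

```cpp
#include <cstddef>

// Minimal scalar Kahan summation: `compensation` carries the low-order bits
// that are rounded away when a small element is added to a large running sum.
inline float kahan_sum(float const *data, std::size_t count) {
    float sum = 0.f, compensation = 0.f;
    for (std::size_t i = 0; i != count; ++i) {
        float corrected = data[i] - compensation; // re-inject previously lost bits
        float next = sum + corrected;             // rounding happens here...
        compensation = (next - sum) - corrected;  // ...and is measured here
        sum = next;
    }
    return sum;
}
```

Aggressive floating-point flags like `-ffast-math` can legally optimize the compensation away, so Kahan-style kernels are normally built with strict FP semantics.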

reduce_bench.cpp

Lines changed: 6 additions & 3 deletions
@@ -90,22 +90,25 @@ int main(int argc, char **argv) {
     bm::RegisterBenchmark("std::accumulate<f64>", &make<stl_accumulate_gt<double>>)->MinTime(10)->UseRealTime();
     bm::RegisterBenchmark("std::reduce<par, f32>", &make<stl_par_reduce_gt<float>>)->MinTime(10)->UseRealTime();
     bm::RegisterBenchmark("std::reduce<par, f64>", &make<stl_par_reduce_gt<double>>)->MinTime(10)->UseRealTime();
-    bm::RegisterBenchmark("std::reduce<par_unseq, f32>", &make<stl_parunseq_reduce_gt<float>>)
+    bm::RegisterBenchmark("std::reduce<par_unseq, f32>", &make<stl_par_unseq_reduce_gt<float>>)
         ->MinTime(10)
         ->UseRealTime();
-    bm::RegisterBenchmark("std::reduce<par_unseq, f64>", &make<stl_parunseq_reduce_gt<double>>)
+    bm::RegisterBenchmark("std::reduce<par_unseq, f64>", &make<stl_par_unseq_reduce_gt<double>>)
         ->MinTime(10)
         ->UseRealTime();
     bm::RegisterBenchmark("openmp<f32>", &make<openmp_t>)->MinTime(10)->UseRealTime();
 
+    // x86 SSE
+#if defined(__SSE__)
+    bm::RegisterBenchmark("sse<f32aligned>@threads", &make<threads_gt<sse_f32aligned_t>>)->MinTime(10)->UseRealTime();
+#endif
     // x86 AVX2
 #if defined(__AVX2__)
     bm::RegisterBenchmark("avx2<f32>", &make<avx2_f32_t>)->MinTime(10)->UseRealTime();
     bm::RegisterBenchmark("avx2<f32kahan>", &make<avx2_f32kahan_t>)->MinTime(10)->UseRealTime();
     bm::RegisterBenchmark("avx2<f64>", &make<avx2_f64_t>)->MinTime(10)->UseRealTime();
     bm::RegisterBenchmark("avx2<f32aligned>@threads", &make<threads_gt<avx2_f32aligned_t>>)->MinTime(10)->UseRealTime();
     bm::RegisterBenchmark("avx2<f64>@threads", &make<threads_gt<avx2_f64_t>>)->MinTime(10)->UseRealTime();
-    bm::RegisterBenchmark("sse<f32aligned>@threads", &make<threads_gt<sse_f32aligned_t>>)->MinTime(10)->UseRealTime();
 #endif
     // x86 AVX-512
 #if defined(__AVX512F__)
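
For context, `bm` here is Google Benchmark's `benchmark` namespace, and each reduction functor is adapted into a timed routine. The `make<...>` helper itself is not part of this diff, so the adapter below is only a guess at its shape; `serial_t` and the dataset size are invented for illustration:

```cpp
#include <benchmark/benchmark.h>

#include <numeric>
#include <vector>

// Hypothetical adapter in the spirit of `make<...>`: builds the engine once,
// then times repeated calls to its `operator()`.
template <typename engine_at> void run(benchmark::State &state) {
    std::vector<float> data(1 << 20, 1.f); // stand-in dataset
    engine_at engine{data.data(), data.data() + data.size()};
    for (auto _ : state)
        benchmark::DoNotOptimize(engine());
}

// Trivial reference engine with the same two-pointer construction shape.
struct serial_t {
    float const *begin_, *end_;
    float operator()() const noexcept { return std::accumulate(begin_, end_, 0.f); }
};

int main(int argc, char **argv) {
    benchmark::RegisterBenchmark("serial<f32>", &run<serial_t>)->MinTime(10)->UseRealTime();
    benchmark::Initialize(&argc, argv);
    benchmark::RunSpecifiedBenchmarks();
    return 0;
}
```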

reduce_cpu.hpp

Lines changed: 15 additions & 8 deletions
@@ -62,7 +62,8 @@ template <typename accumulator_at = float> struct stl_accumulate_gt {
     accumulator_at operator()() const noexcept { return std::accumulate(begin_, end_, accumulator_at(0)); }
 };
 
-/// Computes the sum of a sequence of float values using parallel std::reduce with execution policy std::execution::par.
+/// Computes the sum of a sequence of float values using parallel `std::reduce` with execution
+/// policy @b `std::execution::par`.
 template <typename accumulator_at = float> struct stl_par_reduce_gt {
     float const *const begin_ = nullptr;
     float const *const end_ = nullptr;
@@ -72,9 +73,9 @@ template <typename accumulator_at = float> struct stl_par_reduce_gt {
     }
 };
 
-/// Computes the sum of a sequence of float values using parallel std::reduce with execution policy
-/// std::execution::par_unseq for non-blocking parallelism.
-template <typename accumulator_at = float> struct stl_parunseq_reduce_gt {
+/// Computes the sum of a sequence of float values using parallel `std::reduce` with execution
+/// policy @b `std::execution::par_unseq` for non-blocking parallelism.
+template <typename accumulator_at = float> struct stl_par_unseq_reduce_gt {
     float const *const begin_ = nullptr;
     float const *const end_ = nullptr;
 
@@ -83,6 +84,8 @@ template <typename accumulator_at = float> struct stl_parunseq_reduce_gt {
     }
 };
 
+#if defined(__SSE__)
+
 /// Computes the sum of a sequence of float values using SIMD @b SSE intrinsics,
 /// processing 128 bits of data on every logic thread.
 struct sse_f32aligned_t {
@@ -104,6 +107,8 @@
     }
 };
 
+#endif
+
 #if defined(__AVX2__)
 
 /// Reduces a __m256 vector to a single float by horizontal addition.
@@ -301,10 +306,12 @@ struct avx512_f32unrolled_t {
         fwd1 = _mm512_add_ps(fwd1, _mm512_castsi512_ps(_mm512_stream_load_si512((void *)(it_begin))));
 
         // Combine the accumulators
-        __m512 fwd = _mm512_add_ps(_mm512_add_ps(_mm512_add_ps(fwd0, fwd1), _mm512_add_ps(fwd2, fwd3)),
-                                   _mm512_add_ps(_mm512_add_ps(fwd4, fwd5), _mm512_add_ps(fwd5, fwd7)));
-        __m512 rev = _mm512_add_ps(_mm512_add_ps(_mm512_add_ps(rev0, rev1), _mm512_add_ps(rev2, rev3)),
-                                   _mm512_add_ps(_mm512_add_ps(rev4, rev5), _mm512_add_ps(rev5, rev7)));
+        __m512 fwd = _mm512_add_ps( //
+            _mm512_add_ps(_mm512_add_ps(fwd0, fwd1), _mm512_add_ps(fwd2, fwd3)),
+            _mm512_add_ps(_mm512_add_ps(fwd4, fwd5), _mm512_add_ps(fwd5, fwd7)));
+        __m512 rev = _mm512_add_ps( //
+            _mm512_add_ps(_mm512_add_ps(rev0, rev1), _mm512_add_ps(rev2, rev3)),
+            _mm512_add_ps(_mm512_add_ps(rev4, rev5), _mm512_add_ps(rev5, rev7)));
         __m512 acc = _mm512_add_ps(fwd, rev);
         float sum = _mm512_reduce_add_ps(acc);
         while (it_begin < it_end)
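
The body of `sse_f32aligned_t` sits outside these hunks. As a sketch of what a 128-bit SSE accumulation with a final horizontal fold generally looks like (assuming a 16-byte-aligned pointer, a length that is a multiple of 4, and a single thread; not the repository's exact kernel):

```cpp
#include <xmmintrin.h>

#include <cstddef>

// Four running partial sums live in one XMM register; a shuffle + movehl
// sequence folds them into a single float at the end.
inline float sse_sum_aligned(float const *data, std::size_t count) {
    __m128 running = _mm_setzero_ps();
    for (std::size_t i = 0; i != count; i += 4)
        running = _mm_add_ps(running, _mm_load_ps(data + i)); // aligned 128-bit load
    __m128 shuffled = _mm_shuffle_ps(running, running, _MM_SHUFFLE(2, 3, 0, 1));
    __m128 pairs = _mm_add_ps(running, shuffled);  // {a+b, a+b, c+d, c+d}
    __m128 high = _mm_movehl_ps(shuffled, pairs);  // bring c+d down to lane 0
    return _mm_cvtss_f32(_mm_add_ss(pairs, high)); // (a+b) + (c+d)
}
```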

reduce_cuda.hpp

Lines changed: 11 additions & 9 deletions
@@ -15,15 +15,15 @@ namespace ashvardanian::reduce {
 /// Base class for CUDA-based reductions.
 struct cuda_base_t {
     static constexpr int max_block_size_k = 1024;
-    static constexpr int threads = 512;
+    static constexpr int threads_k = 512;
 
     int blocks = max_block_size_k;
     thrust::device_vector<float> gpu_inputs;
     thrust::device_vector<float> gpu_partial_sums;
     thrust::host_vector<float> cpu_partial_sums;
 
     cuda_base_t(float const *b, float const *e)
-        : blocks(std::min<int>(((e - b) + threads - 1) / threads, max_block_size_k)), gpu_inputs(b, e),
+        : blocks(std::min<int>(((e - b) + threads_k - 1) / threads_k, max_block_size_k)), gpu_inputs(b, e),
           gpu_partial_sums(max_block_size_k), cpu_partial_sums(max_block_size_k) {}
 };
 
@@ -59,14 +59,14 @@ struct cuda_blocks_t : public cuda_base_t {
     float operator()() {
 
         // Accumulate partial results...
-        int shared_memory = threads * sizeof(float);
-        cu_reduce_blocks<<<blocks, threads, shared_memory>>>(gpu_inputs.data().get(), gpu_inputs.size(),
-                                                             gpu_partial_sums.data().get());
+        int shared_memory = threads_k * sizeof(float);
+        cu_reduce_blocks<<<blocks, threads_k, shared_memory>>>( //
+            gpu_inputs.data().get(), gpu_inputs.size(), gpu_partial_sums.data().get());
 
         // Then reduce them further to inputs single scalar
         shared_memory = max_block_size_k * sizeof(float);
-        cu_reduce_blocks<<<1, max_block_size_k, shared_memory>>>(gpu_partial_sums.data().get(), blocks,
-                                                                 gpu_partial_sums.data().get());
+        cu_reduce_blocks<<<1, max_block_size_k, shared_memory>>>( //
+            gpu_partial_sums.data().get(), blocks, gpu_partial_sums.data().get());
 
         // Sync all queues and fetch results
         cudaDeviceSynchronize();
@@ -128,10 +128,12 @@ struct cuda_warps_t : public cuda_base_t {
     float operator()() {
 
         // Accumulate partial results...
-        cu_reduce_warps<<<blocks, threads>>>(gpu_inputs.data().get(), gpu_inputs.size(), gpu_partial_sums.data().get());
+        cu_reduce_warps<<<blocks, threads_k>>>( //
+            gpu_inputs.data().get(), gpu_inputs.size(), gpu_partial_sums.data().get());
 
         // Then reduce them further to inputs single scalar
-        cu_reduce_warps<<<1, max_block_size_k>>>(gpu_partial_sums.data().get(), blocks, gpu_partial_sums.data().get());
+        cu_reduce_warps<<<1, max_block_size_k>>>( //
+            gpu_partial_sums.data().get(), blocks, gpu_partial_sums.data().get());
 
         // Sync all queues and fetch results
         cudaDeviceSynchronize();
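
The `cu_reduce_warps` kernel these launches call is not part of the diff. A generic warp-shuffle reduction, which is presumably what the `warps@cuda` benchmark and the new `shfl` dictionary entry refer to, typically looks like this (a sketch assuming a block size that is a multiple of 32 and a `result` zeroed before launch, not the repository's exact kernel):

```cuda
// Each thread folds a grid-strided slice into `partial`, each warp folds its
// 32 partials with `__shfl_down_sync`, and lane 0 of every warp publishes its
// result with one atomic.
__global__ void warp_reduce_sum(float const *inputs, int count, float *result) {
    float partial = 0.f;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += gridDim.x * blockDim.x)
        partial += inputs[i];

    for (int offset = warpSize / 2; offset > 0; offset /= 2)
        partial += __shfl_down_sync(0xFFFFFFFFu, partial, offset); // halve active lanes each step

    if ((threadIdx.x & (warpSize - 1)) == 0)
        atomicAdd(result, partial);
}
```

The launch shape would mirror the `<<<blocks, threads_k>>>` configuration above.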

reduce_opencl.cl

Lines changed: 2 additions & 2 deletions
@@ -1,11 +1,11 @@
 // Project: SandboxGPUs.
-// Author: Ashot Vardanian.
+// Author: Ash Vardanian.
 // Created: 04/09/2019.
 // Copyright: Check "License" file.
 //
 
 /**
- * Most of the algorithms here have follwong properties:
+ * Most of the algorithms here have following properties:
  * - takes log(n) steps for n input elements,
  * - uses n threads,
  * - only works for power-of-2 arrays.
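
The "log(n) steps, n threads, power-of-2 arrays" properties listed in that header describe the classic halving tree. Expressed as a CUDA-style kernel for comparison with the ones above (an analogue of the pattern, assuming the input exactly covers grid × block threads; not the repository's OpenCL code):

```cuda
// Shared-memory tree reduction: every step halves the number of active
// threads, so a power-of-2 block of n threads needs log2(n) steps and one
// barrier per step. Launch with `blockDim.x * sizeof(float)` dynamic shared memory.
__global__ void block_tree_reduce(float const *inputs, float *block_sums) {
    extern __shared__ float cache[];
    unsigned tid = threadIdx.x;
    cache[tid] = inputs[blockIdx.x * blockDim.x + tid];
    __syncthreads();

    for (unsigned stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride)
            cache[tid] += cache[tid + stride];
        __syncthreads();
    }
    if (tid == 0)
        block_sums[blockIdx.x] = cache[0]; // one partial sum per block
}
```

A second pass over `block_sums`, like the single-block `cu_reduce_blocks<<<1, max_block_size_k, shared_memory>>>` launch in the CUDA diff above, finishes the reduction.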
