One of the canonical examples when designing parallel algorithms is implementing parallel tree-like reductions, which is a special case of accumulating a bunch of numbers located in a contiguous block of memory.
In modern C++, most developers would call `std::accumulate(array.begin(), array.end(), 0)`, and in Python, it's just a `sum(array)`.
Implementing those operations with high utilization in many-core systems is surprisingly non-trivial and depends heavily on the hardware architecture.
This repository contains several educational examples showcasing the performance differences between various solutions:
- Single-threaded but SIMD-accelerated code:
  - SSE, AVX, AVX-512 on x86.
  - 🔜 NEON and SVE on Arm.
- OpenMP `reduction` clause.
- Thrust with its `thrust::reduce`.
- CUB with its `cub::DeviceReduce::Sum`.
- CUDA kernels with and w/out [warp-primitives](https://developer.nvidia.com/blog/using-cuda-warp-level-primitives/).
- CUDA kernels with [Tensor-Core](https://www.nvidia.com/en-gb/data-center/tensor-cores/) acceleration.
- [BLAS](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) and cuBLAS strided vector and matrix routines.
- OpenCL kernels, eight of them.
- Parallel STL `<algorithm>` in GCC with Intel oneTBB.
Notably:
- on arrays with billions of elements, the default `float` error mounts, and the results become inaccurate unless a [Kahan-like scheme](https://en.wikipedia.org/wiki/Kahan_summation_algorithm) is used.
- to minimize the overhead of [Translation Lookaside Buffer](https://en.wikipedia.org/wiki/Translation_lookaside_buffer) (TLB) misses, the arrays are aligned to the OS page size and are allocated in [huge pages on Linux](https://wiki.debian.org/Hugepages), if possible.
- to reduce the memory access latency on many-core [Non-Uniform Memory Access](https://en.wikipedia.org/wiki/Non-uniform_memory_access) (NUMA) systems, `libnuma` and `pthread` help maximize data affinity.
- to "hide" latency on wide CPU registers (like `ZMM`), expensive Assembly instructions executed on different [CPU ports](https://easyperf.net/blog/2018/03/21/port-contention#utilizing-full-capacity-of-the-load-instructions) are interleaved.
---
The examples in this repository were originally written in the early 2010s and updated in 2019, 2022, and 2025.
Previously, it also included ArrayFire, Halide, and Vulkan queues for SPIR-V kernels and SyCL.
- [Lecture Slides](https://drive.google.com/file/d/16AicAl99t3ZZFnza04Wnw_Vuem0w8lc7/view?usp=sharing) from 2019.
- [CppRussia Talk](https://youtu.be/AA4RI6o0h1U) in Russia in 2019.