Add: fork_union parallel version

ashvardanian · ashvardanian · commit c40b7f387f91 · 2025-05-03T20:30:02.000Z
This is a work-in-progress, lightweight thread-pool variant targeting OpenMP-like use cases. It doesn't yet match OpenMP performance on small inputs and is still a long way from our goal #7.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -88,6 +88,13 @@ FetchContent_Declare(
 )
 FetchContent_MakeAvailable(fmt)
 
+FetchContent_Declare(
+    fork_union
+    GIT_REPOSITORY https://github.com/ashvardanian/fork_union.git
+    GIT_TAG main # NOTE(review): presumably a branch name, so builds track upstream; consider pinning a commit
+)
+FetchContent_MakeAvailable(fork_union) # Exposes the `fork_union` target linked into `reduce_bench`
+
 # Fetch GBenchmark and suppress internal tests.
 # https://github.com/google/benchmark/blob/main/docs/user_guide.md#using-register-benchmark
 FetchContent_Declare(
@@ -152,7 +159,7 @@ set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2")
 set(CMAKE_GCC_FLAGS "${CMAKE_GCC_FLAGS} -march=native -fopenmp")
 
 add_executable(reduce_bench reduce_bench.cpp)
-target_link_libraries(reduce_bench PRIVATE benchmark::benchmark fmt::fmt Threads::Threads BLAS::BLAS)
+target_link_libraries(reduce_bench PRIVATE benchmark::benchmark fmt::fmt fork_union Threads::Threads BLAS::BLAS)
 
 if (USE_INTEL_TBB)
     target_link_libraries(reduce_bench PRIVATE TBB::tbb)
diff --git a/README.md b/README.md
@@ -195,7 +195,7 @@ Observations:
 - 370 GB/s can be reached in dual-socket DDR5 setups with 12 channel memory.
 - Using Kahan-like schemes is 3x slower than pure `float` and 2x slower than `double`.
 
-One of the interesting observations is the effect of latency hiding, interleaving the operations executing on different ports of the same CPU.
+One of the interesting observations is the effect of [latency hiding, interleaving the operations executing on different ports of the same CPU](https://ashvardanian.com/posts/cpu-ports).
 It is evident when benchmarking AVX-512 kernels on very small arrays:
 
 ```sh
diff --git a/reduce_bench.cpp b/reduce_bench.cpp
@@ -188,9 +188,9 @@ std::size_t alignment_ram_page() {
  *  2. `std::aligned_alloc` aligned to the system page size with optional @b `madvise(MADV_HUGEPAGE)`.
  *  If NUMA is available (libNUMA on Linux), memory is distributed across NUMA nodes.
  *
- *  @param elements Number of float elements to allocate.
- *  @return dataset_t A dataset wrapper holding the pointer and type of allocation.
- *  @throws std::bad_alloc if allocation fails.
+ *  @param[in] needed_elements Number of float elements to allocate.
+ *  @return `dataset_t` A dataset wrapper holding the pointer and type of allocation.
+ *  @throws `std::bad_alloc` if allocation fails.
  *
  *  @see NUMA docs: https://man7.org/linux/man-pages/man3/numa.3.html
  *  @see MMAP docs: https://man7.org/linux/man-pages/man2/mmap.2.html
@@ -336,7 +336,11 @@ int main(int argc, char **argv) {
     register_("unrolled/f64", unrolled_gt<double> {}, dataset);
     register_("std::accumulate/f32", stl_accumulate_gt<float> {}, dataset);
     register_("std::accumulate/f64", stl_accumulate_gt<double> {}, dataset);
+    register_("serial/f32/av::fork_union", fork_union_gt<unrolled_gt<float>> {}, dataset);
+    register_("serial/f64/av::fork_union", fork_union_gt<unrolled_gt<double>> {}, dataset);
+#if defined(_OPENMP)
     register_("serial/f32/openmp", openmp_t {}, dataset);
+#endif // defined(_OPENMP)
 
     //! BLAS struggles with zero-strided arguments!
     //! register_("blas/f32", blas_dot_t {}, dataset);
@@ -375,11 +379,17 @@ int main(int argc, char **argv) {
     // Arm NEON
 #if defined(__ARM_NEON)
     register_("neon/f32", neon_f32_t {}, dataset);
+    register_("neon/f32/av::fork_union", fork_union_gt<neon_f32_t> {}, dataset);
+    register_("neon/f32/std::threads", threads_gt<neon_f32_t> {}, dataset);
+    register_("neon/f32/openmp", openmp_gt<neon_f32_t> {}, dataset);
 #endif
 
 // Arm SVE
 #if defined(__ARM_FEATURE_SVE)
     register_("sve/f32", sve_f32_t {}, dataset);
+    register_("sve/f32/av::fork_union", fork_union_gt<sve_f32_t> {}, dataset);
+    register_("sve/f32/std::threads", threads_gt<sve_f32_t> {}, dataset);
+    register_("sve/f32/openmp", openmp_gt<sve_f32_t> {}, dataset);
 #endif // defined(__ARM_FEATURE_SVE__)
 
     // CUDA
diff --git a/reduce_cpu.hpp b/reduce_cpu.hpp
@@ -26,6 +26,8 @@
 #include <arm_sve.h> // ARM SVE intrinsics
 #endif
 
+#include <fork_union.hpp>
+
 namespace ashvardanian {
 
 /**
@@ -604,7 +606,7 @@ class openmp_gt {
 #endif
 
 /**
- *  @brief Computes the sum of a sequence of float values using @b std::thread on-CPU
+ *  @brief Computes the sum of a sequence of float values using @b `std::thread` on-CPU
  *         multi-core reductions acceleration.
  *  @see   https://en.cppreference.com/w/cpp/thread/thread
  */
@@ -661,6 +663,37 @@ class threads_gt {
     }
 };
 
+/**
+ *  @brief Computes the sum of a sequence of float values by splitting it into per-thread
+ *         slices, each reduced with `serial_at`, on a reusable fixed-size thread pool.
+ *  @see   https://github.com/ashvardanian/fork_union
+ */
+template <typename serial_at = stl_accumulate_gt<float>>
+class fork_union_gt {
+    using pool_t = ::ashvardanian::fork_union_t;
+    float const *const begin_ = nullptr;
+    float const *const end_ = nullptr;
+    pool_t pool_;
+    std::vector<double> sums_; // One partial sum per pool thread, indexed by `thread_index`
+
+  public:
+    fork_union_gt() = default;
+    fork_union_gt(float const *b, float const *e) : begin_(b), end_(e) {
+        auto const worker_count = total_cores();
+        if (!pool_.try_spawn(worker_count)) throw std::runtime_error("Failed to fork threads");
+        sums_.resize(worker_count);
+    }
+
+    double operator()() {
+        auto const reduce_slice = [this](pool_t::task_t head, std::size_t count) noexcept {
+            float const *const first = begin_ + head.task_index;
+            sums_[head.thread_index] = serial_at {first, first + count}();
+        };
+        pool_.for_each_slice(static_cast<std::size_t>(end_ - begin_), reduce_slice);
+        return std::accumulate(sums_.begin(), sums_.end(), 0.0);
+    }
+};
+
 #pragma endregion - Multicore
 
 } // namespace ashvardanian