1010#include < numeric> // `std::accumulate`, `std::reduce`
1111#include < thread> // `std::thread`
1212
13+ #if defined(_OPENMP)
1314#include < omp.h> // `omp_set_num_threads`
15+ #endif
1416
1517#if defined(__AVX2__) || defined(__AVX512F__)
1618#include < immintrin.h> // x86 intrinsics
@@ -538,6 +540,8 @@ class sve_f32_t {
538540
539541#pragma region - Multicore
540542
543+ #if defined(_OPENMP)
544+
541545/* *
542546 * @brief Computes the sum of a sequence of float values using @b OpenMP on-CPU
543547 * for multi-core reductions acceleration.
@@ -563,6 +567,42 @@ class openmp_t {
563567 }
564568};
565569
570+ /* *
571+ * @brief Computes the sum of a sequence of float values using @b OpenMP on-CPU
572+ * for multi-core parallelism, combined with the given @b SIMD vectorization.
573+ * @see https://pages.tacc.utexas.edu/~eijkhout/pcse/html/omp-reduction.html
574+ */
575+ template <typename serial_at = stl_accumulate_gt<float >>
576+ class openmp_gt {
577+ float const *const begin_ = nullptr ;
578+ float const *const end_ = nullptr ;
579+ std::size_t const total_cores_ = 0 ;
580+ std::vector<double > sums_;
581+
582+ public:
583+ openmp_gt () = default ;
584+ openmp_gt (float const *b, float const *e) : begin_(b), end_(e), total_cores_(total_cores()), sums_(total_cores_) {
585+ omp_set_dynamic (0 );
586+ omp_set_num_threads (total_cores_);
587+ }
588+
589+ double operator ()() {
590+ auto const input_size = static_cast <std::size_t >(end_ - begin_);
591+ auto const chunk_size = input_size / total_cores_;
592+ #pragma omp parallel
593+ {
594+ std::size_t const thread_id = static_cast <std::size_t >(omp_get_thread_num ());
595+ std::size_t const start = thread_id * chunk_size;
596+ std::size_t const stop = std::min (start + chunk_size, input_size);
597+ double local_sum = serial_at {begin_ + start, begin_ + stop}();
598+ sums_[thread_id] = local_sum;
599+ }
600+ return std::accumulate (sums_.begin (), sums_.end (), 0.0 );
601+ }
602+ };
603+
604+ #endif
605+
566606/* *
567607 * @brief Computes the sum of a sequence of float values using @b std::thread on-CPU
568608 * multi-core reductions acceleration.
0 commit comments