Skip to content

Commit cce8be4

Browse files
committed
Add: Generic OpenMP pool
1 parent 5e73225 commit cce8be4

File tree

2 files changed

+41
-1
lines changed

2 files changed

+41
-1
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ This repository contains several educational examples showcasing the performance
99

1010
- Single-threaded but SIMD-accelerated code:
1111
- SSE, AVX, AVX-512 on x86.
12-
- 🔜 NEON and SVE on Arm.
12+
- NEON and SVE on Arm.
1313
- OpenMP `reduction` clause.
1414
- Thrust with its `thrust::reduce`.
1515
- CUB with its `cub::DeviceReduce::Sum`.

reduce_cpu.hpp

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010
#include <numeric> // `std::accumulate`, `std::reduce`
1111
#include <thread> // `std::thread`
1212

13+
#if defined(_OPENMP)
1314
#include <omp.h> // `omp_set_num_threads`
15+
#endif
1416

1517
#if defined(__AVX2__) || defined(__AVX512F__)
1618
#include <immintrin.h> // x86 intrinsics
@@ -538,6 +540,8 @@ class sve_f32_t {
538540

539541
#pragma region - Multicore
540542

543+
#if defined(_OPENMP)
544+
541545
/**
542546
* @brief Computes the sum of a sequence of float values using @b OpenMP on-CPU
543547
* for multi-core reduction acceleration.
@@ -563,6 +567,42 @@ class openmp_t {
563567
}
564568
};
565569

570+
/**
571+
* @brief Computes the sum of a sequence of float values using @b OpenMP on-CPU
572+
* for multi-core parallelism, combined with the given @b SIMD vectorization.
573+
* @see https://pages.tacc.utexas.edu/~eijkhout/pcse/html/omp-reduction.html
574+
*/
575+
template <typename serial_at = stl_accumulate_gt<float>>
576+
class openmp_gt {
577+
float const *const begin_ = nullptr;
578+
float const *const end_ = nullptr;
579+
std::size_t const total_cores_ = 0;
580+
std::vector<double> sums_;
581+
582+
public:
583+
openmp_gt() = default;
584+
openmp_gt(float const *b, float const *e) : begin_(b), end_(e), total_cores_(total_cores()), sums_(total_cores_) {
585+
omp_set_dynamic(0);
586+
omp_set_num_threads(total_cores_);
587+
}
588+
589+
double operator()() {
590+
auto const input_size = static_cast<std::size_t>(end_ - begin_);
591+
auto const chunk_size = input_size / total_cores_;
592+
#pragma omp parallel
593+
{
594+
std::size_t const thread_id = static_cast<std::size_t>(omp_get_thread_num());
595+
std::size_t const start = thread_id * chunk_size;
596+
std::size_t const stop = std::min(start + chunk_size, input_size);
597+
double local_sum = serial_at {begin_ + start, begin_ + stop}();
598+
sums_[thread_id] = local_sum;
599+
}
600+
return std::accumulate(sums_.begin(), sums_.end(), 0.0);
601+
}
602+
};
603+
604+
#endif
605+
566606
/**
567607
* @brief Computes the sum of a sequence of float values using @b std::thread on-CPU
568608
* for multi-core reduction acceleration.

0 commit comments

Comments
 (0)