@@ -29,7 +29,7 @@ static const auto pi = 3.1415926535897932384626433832795;
29
29
static const auto seed = 7777 ;
30
30
31
31
// Default Number of 2D points
32
- static const auto n_samples = 120000000 ;
32
+ static const auto n_samples = 120'000'000 ;
33
33
34
34
double estimate_pi (sycl::queue& q, size_t n_points) {
35
35
double estimated_pi; // Estimated value of Pi
@@ -48,37 +48,30 @@ double estimate_pi(sycl::queue& q, size_t n_points) {
48
48
mkl::rng::generate (distr, engine, n_points * 2 , rng_buf);
49
49
50
50
// Step 2. Count points under curve (x^2 + y^2 <= 1.0f)
51
- size_t wg_size = std::min (q.get_device ().get_info <sycl::info::device::max_work_group_size>(), n_points);
52
- size_t max_compute_units = q.get_device ().get_info <sycl::info::device::max_compute_units>();
53
- size_t wg_num = (n_points > wg_size * max_compute_units) ? max_compute_units : 1 ;
54
-
55
- size_t count_per_thread = n_points / (wg_size * wg_num);
56
-
57
- std::vector<size_t > count (wg_num);
51
+ size_t count_per_thread = 32 ;
58
52
59
53
{
60
- sycl::buffer<size_t , 1 > count_buf (count) ;
54
+ sycl::buffer<size_t > count_buf{ &n_under_curve , 1 } ;
61
55
62
56
q.submit ([&] (sycl::handler& h) {
63
57
auto rng_acc = rng_buf.template get_access <sycl::access::mode::read>(h);
64
- auto count_acc = count_buf.template get_access <sycl::access::mode::write>(h);
65
- h.parallel_for (sycl::nd_range<1 >(wg_size * wg_num, wg_size),
66
- [=](sycl::nd_item<1 > item) {
67
- sycl::vec<float , 2 > r;
68
- size_t count = 0 ;
69
- for (int i = 0 ; i < count_per_thread; i++) {
70
- r.load (i + item.get_global_linear_id () * count_per_thread, rng_acc.template get_multi_ptr <sycl::access::decorated::yes>());
71
- if (sycl::length (r) <= 1 .0f ) {
72
- count += 1 ;
58
+ auto reductor = sycl::reduction (count_buf, h, size_t (0 ), std::plus<size_t >());
59
+
60
+ h.parallel_for (sycl::range<1 >(n_points / count_per_thread), reductor,
61
+ [=](sycl::item<1 > item, auto & sum) {
62
+ sycl::vec<float , 2 > r;
63
+ size_t count = 0 ;
64
+ for (int i = 0 ; i < count_per_thread; i++) {
65
+ r.load (i + item.get_id (0 ) * count_per_thread, rng_acc.template get_multi_ptr <sycl::access::decorated::yes>());
66
+ if (sycl::length (r) <= 1 .0f ) {
67
+ count++;
68
+ }
73
69
}
74
- }
75
- count_acc[item.get_group_linear_id ()] = sycl::reduce_over_group (item.get_group (), count, std::plus<size_t >());
70
+ sum += count;
76
71
});
77
72
});
78
73
}
79
74
80
- n_under_curve = std::accumulate (count.begin (), count.end (), 0 );
81
-
82
75
// Step 3. Calculate approximated value of Pi
83
76
estimated_pi = n_under_curve / ((double )n_points) * 4.0 ;
84
77
return estimated_pi;
@@ -132,7 +125,7 @@ int main(int argc, char ** argv) {
132
125
std::cout << " Absolute error = " << abs_error << std::endl;
133
126
std::cout << std::endl;
134
127
135
- if (abs_error > 1.0e-3 ) {
128
+ if (abs_error > 1.0e-4 ) {
136
129
std::cout << " TEST FAILED" << std::endl;
137
130
return 1 ;
138
131
}
0 commit comments