@@ -29,11 +29,10 @@ static const auto pi = 3.1415926535897932384626433832795;
29
29
static const auto seed = 7777 ;
30
30
31
31
// Default Number of 2D points
32
- static const auto n_samples = 120000000 ;
32
+ static const auto n_samples = 120'000'000 ;
33
33
34
34
double estimate_pi (sycl::queue& q, size_t n_points) {
35
35
double estimated_pi; // Estimated value of Pi
36
- size_t n_under_curve = 0 ; // Number of points fallen under the curve
37
36
38
37
// Step 1. Generate n_points * 2 random numbers
39
38
// 1.1. Generator initialization
@@ -42,47 +41,35 @@ double estimate_pi(sycl::queue& q, size_t n_points) {
42
41
// Create an object of distribution (by default float, a = 0.0f, b = 1.0f)
43
42
mkl::rng::uniform distr;
44
43
45
- float * rng_ptr = sycl::malloc_device <float >(n_points * 2 , q);
44
+ float * rng_ptr = sycl::malloc_shared <float >(n_points * 2 , q);
46
45
47
46
// 1.2. Random number generation
48
47
auto event = mkl::rng::generate (distr, engine, n_points * 2 , rng_ptr);
49
48
50
49
// Step 2. Count points under curve (x ^ 2 + y ^ 2 < 1.0f)
51
- size_t wg_size = std::min (q.get_device ().get_info <sycl::info::device::max_work_group_size>(), n_points);
52
- size_t max_compute_units = q.get_device ().get_info <sycl::info::device::max_compute_units>();
53
- size_t wg_num = (n_points > wg_size * max_compute_units) ? max_compute_units : 1 ;
54
-
55
- size_t count_per_thread = n_points / (wg_size * wg_num);
56
-
57
- size_t * count_ptr = sycl::malloc_shared<size_t >(wg_num, q);
58
-
59
- // Make sure, that generation is finished
60
- event.wait_and_throw ();
61
-
62
- event = q.submit ([&] (sycl::handler& h) {
63
- h.parallel_for (sycl::nd_range<1 >(wg_size * wg_num, wg_size),
64
- [=](sycl::nd_item<1 > item) {
65
- sycl::vec<float , 2 > r;
66
- size_t count = 0 ;
67
- for (int i = 0 ; i < count_per_thread; i++) {
68
- r.load (i + item.get_global_linear_id () * count_per_thread, sycl::global_ptr<float >(rng_ptr));
69
- if (sycl::length (r) <= 1 .0f ) {
70
- count += 1 ;
71
- }
72
- }
73
- count_ptr[item.get_group_linear_id ()] = sycl::reduce_over_group (item.get_group (), count, std::plus<size_t >());
74
- });
75
- });
76
-
77
- event.wait_and_throw ();
78
-
79
- n_under_curve = std::accumulate (count_ptr, count_ptr + wg_num, 0 );
50
+ constexpr size_t count_per_thread = 32 ;
51
+ size_t *n_under_curve = sycl::malloc_host<size_t >(1 , q); // Number of points fallen under the curve
52
+ *n_under_curve = 0 ;
53
+ auto reductor = sycl::reduction (n_under_curve, size_t (0 ), std::plus<size_t >{});
54
+
55
+ q.parallel_for (sycl::range<1 >(n_points / count_per_thread), event, reductor,
56
+ [=](sycl::item<1 > item, auto & sum) {
57
+ sycl::vec<float , 2 > r;
58
+ size_t count = 0 ;
59
+ for (int i = 0 ; i < count_per_thread; i++) {
60
+ r.load (i + item.get_id (0 ) * count_per_thread, sycl::global_ptr<float >(rng_ptr));
61
+ if (sycl::length (r) <= 1 .0f ) {
62
+ count++;
63
+ }
64
+ }
65
+ sum += count;
66
+ }).wait_and_throw ();
80
67
81
68
// Step 3. Calculate approximated value of Pi
82
- estimated_pi = n_under_curve / ((double )n_points) * 4.0 ;
69
+ estimated_pi = * n_under_curve / ((double )n_points) * 4.0 ;
83
70
84
71
sycl::free (rng_ptr, q);
85
- sycl::free (count_ptr , q);
72
+ sycl::free (n_under_curve , q);
86
73
87
74
return estimated_pi;
88
75
@@ -135,7 +122,7 @@ int main(int argc, char ** argv) {
135
122
std::cout << " Absolute error = " << abs_error << std::endl;
136
123
std::cout << std::endl;
137
124
138
- if (abs_error > 1.0e-3 ) {
125
+ if (abs_error > 1.0e-4 ) {
139
126
std::cout << " TEST FAILED" << std::endl;
140
127
return 1 ;
141
128
}
0 commit comments