Skip to content

Commit 663a250

Browse files
authored
Merge pull request #2523 from andreyfe1/pi_monte_carlo_fix
[oneMKL] Reworked Monte Carlo Pi Sample to fix verification
2 parents a862868 + 80ca34c commit 663a250

File tree

3 files changed

+41
-61
lines changed

3 files changed

+41
-61
lines changed

Libraries/oneMKL/monte_carlo_pi/mc_pi.cpp

Lines changed: 16 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ static const auto pi = 3.1415926535897932384626433832795;
2929
static const auto seed = 7777;
3030

3131
// Default Number of 2D points
32-
static const auto n_samples = 120000000;
32+
static const auto n_samples = 120'000'000;
3333

3434
double estimate_pi(sycl::queue& q, size_t n_points) {
3535
double estimated_pi; // Estimated value of Pi
@@ -48,37 +48,30 @@ double estimate_pi(sycl::queue& q, size_t n_points) {
4848
mkl::rng::generate(distr, engine, n_points * 2, rng_buf);
4949

5050
// Step 2. Count points under curve (x ^ 2 + y ^ 2 < 1.0f)
51-
size_t wg_size = std::min(q.get_device().get_info<sycl::info::device::max_work_group_size>(), n_points);
52-
size_t max_compute_units = q.get_device().get_info<sycl::info::device::max_compute_units>();
53-
size_t wg_num = (n_points > wg_size * max_compute_units) ? max_compute_units : 1;
54-
55-
size_t count_per_thread = n_points / (wg_size * wg_num);
56-
57-
std::vector<size_t> count(wg_num);
51+
constexpr size_t count_per_thread = 32;
5852

5953
{
60-
sycl::buffer<size_t, 1> count_buf(count);
54+
sycl::buffer<size_t> count_buf{ &n_under_curve, 1 };
6155

6256
q.submit([&] (sycl::handler& h) {
6357
auto rng_acc = rng_buf.template get_access<sycl::access::mode::read>(h);
64-
auto count_acc = count_buf.template get_access<sycl::access::mode::write>(h);
65-
h.parallel_for(sycl::nd_range<1>(wg_size * wg_num, wg_size),
66-
[=](sycl::nd_item<1> item) {
67-
sycl::vec<float, 2> r;
68-
size_t count = 0;
69-
for(int i = 0; i < count_per_thread; i++) {
70-
r.load(i + item.get_global_linear_id() * count_per_thread, rng_acc.template get_multi_ptr<sycl::access::decorated::yes>());
71-
if(sycl::length(r) <= 1.0f) {
72-
count += 1;
58+
auto reductor = sycl::reduction(count_buf, h, size_t(0), std::plus<size_t>());
59+
60+
h.parallel_for(sycl::range<1>(n_points / count_per_thread), reductor,
61+
[=](sycl::item<1> item, auto& sum) {
62+
sycl::vec<float, 2> r;
63+
size_t count = 0;
64+
for(int i = 0; i < count_per_thread; i++) {
65+
r.load(i + item.get_id(0) * count_per_thread, rng_acc.template get_multi_ptr<sycl::access::decorated::yes>());
66+
if(sycl::length(r) <= 1.0f) {
67+
count++;
68+
}
7369
}
74-
}
75-
count_acc[item.get_group_linear_id()] = sycl::reduce_over_group(item.get_group(), count, std::plus<size_t>());
70+
sum += count;
7671
});
7772
});
7873
}
7974

80-
n_under_curve = std::accumulate(count.begin(), count.end(), 0);
81-
8275
// Step 3. Calculate approximated value of Pi
8376
estimated_pi = n_under_curve / ((double)n_points) * 4.0;
8477
return estimated_pi;
@@ -132,7 +125,7 @@ int main(int argc, char ** argv) {
132125
std::cout << "Absolute error = " << abs_error << std::endl;
133126
std::cout << std::endl;
134127

135-
if(abs_error > 1.0e-3) {
128+
if(abs_error > 1.0e-4) {
136129
std::cout << "TEST FAILED" << std::endl;
137130
return 1;
138131
}

Libraries/oneMKL/monte_carlo_pi/mc_pi_device_api.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,13 @@ static const auto pi = 3.1415926535897932384626433832795;
2828
static const auto seed = 7777;
2929

3030
// Default Number of 2D points
31-
static const auto n_samples = 120000000;
31+
static const auto n_samples = 120'000'000;
3232

3333
double estimate_pi(sycl::queue& q, size_t n_points) {
3434
double estimated_pi; // Estimated value of Pi
3535
size_t n_under_curve = 0; // Number of points fallen under the curve
3636

37-
size_t count_per_thread = 32;
37+
constexpr size_t count_per_thread = 32;
3838

3939
constexpr size_t vec_size = 2;
4040

@@ -62,7 +62,7 @@ double estimate_pi(sycl::queue& q, size_t n_points) {
6262
r = mkl::rng::device::generate(distr, engine);
6363
// Step 2. Increment counter if point is under curve (x ^ 2 + y ^ 2 < 1.0f)
6464
if(sycl::length(r) <= 1.0f) {
65-
count += 1;
65+
count++;
6666
}
6767
}
6868
atomic_counter.fetch_add(count);

Libraries/oneMKL/monte_carlo_pi/mc_pi_usm.cpp

Lines changed: 22 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,10 @@ static const auto pi = 3.1415926535897932384626433832795;
2929
static const auto seed = 7777;
3030

3131
// Default Number of 2D points
32-
static const auto n_samples = 120000000;
32+
static const auto n_samples = 120'000'000;
3333

3434
double estimate_pi(sycl::queue& q, size_t n_points) {
3535
double estimated_pi; // Estimated value of Pi
36-
size_t n_under_curve = 0; // Number of points fallen under the curve
3736

3837
// Step 1. Generate n_points * 2 random numbers
3938
// 1.1. Generator initialization
@@ -42,47 +41,35 @@ double estimate_pi(sycl::queue& q, size_t n_points) {
4241
// Create an object of distribution (by default float, a = 0.0f, b = 1.0f)
4342
mkl::rng::uniform distr;
4443

45-
float* rng_ptr = sycl::malloc_device<float>(n_points * 2, q);
44+
float* rng_ptr = sycl::malloc_shared<float>(n_points * 2, q);
4645

4746
// 1.2. Random number generation
4847
auto event = mkl::rng::generate(distr, engine, n_points * 2, rng_ptr);
4948

5049
// Step 2. Count points under curve (x ^ 2 + y ^ 2 < 1.0f)
51-
size_t wg_size = std::min(q.get_device().get_info<sycl::info::device::max_work_group_size>(), n_points);
52-
size_t max_compute_units = q.get_device().get_info<sycl::info::device::max_compute_units>();
53-
size_t wg_num = (n_points > wg_size * max_compute_units) ? max_compute_units : 1;
54-
55-
size_t count_per_thread = n_points / (wg_size * wg_num);
56-
57-
size_t* count_ptr = sycl::malloc_shared<size_t>(wg_num, q);
58-
59-
// Make sure, that generation is finished
60-
event.wait_and_throw();
61-
62-
event = q.submit([&] (sycl::handler& h) {
63-
h.parallel_for(sycl::nd_range<1>(wg_size * wg_num, wg_size),
64-
[=](sycl::nd_item<1> item) {
65-
sycl::vec<float, 2> r;
66-
size_t count = 0;
67-
for(int i = 0; i < count_per_thread; i++) {
68-
r.load(i + item.get_global_linear_id() * count_per_thread, sycl::global_ptr<float>(rng_ptr));
69-
if(sycl::length(r) <= 1.0f) {
70-
count += 1;
71-
}
72-
}
73-
count_ptr[item.get_group_linear_id()] = sycl::reduce_over_group(item.get_group(), count, std::plus<size_t>());
74-
});
75-
});
76-
77-
event.wait_and_throw();
78-
79-
n_under_curve = std::accumulate(count_ptr, count_ptr + wg_num, 0);
50+
constexpr size_t count_per_thread = 32;
51+
size_t *n_under_curve = sycl::malloc_host<size_t>(1, q); // Number of points fallen under the curve
52+
*n_under_curve = 0;
53+
auto reductor = sycl::reduction(n_under_curve, size_t(0), std::plus<size_t>{});
54+
55+
q.parallel_for(sycl::range<1>(n_points / count_per_thread), event, reductor,
56+
[=](sycl::item<1> item, auto& sum) {
57+
sycl::vec<float, 2> r;
58+
size_t count = 0;
59+
for(int i = 0; i < count_per_thread; i++) {
60+
r.load(i + item.get_id(0) * count_per_thread, sycl::global_ptr<float>(rng_ptr));
61+
if(sycl::length(r) <= 1.0f) {
62+
count++;
63+
}
64+
}
65+
sum += count;
66+
}).wait_and_throw();
8067

8168
// Step 3. Calculate approximated value of Pi
82-
estimated_pi = n_under_curve / ((double)n_points) * 4.0;
69+
estimated_pi = *n_under_curve / ((double)n_points) * 4.0;
8370

8471
sycl::free(rng_ptr, q);
85-
sycl::free(count_ptr, q);
72+
sycl::free(n_under_curve, q);
8673

8774
return estimated_pi;
8875

@@ -135,7 +122,7 @@ int main(int argc, char ** argv) {
135122
std::cout << "Absolute error = " << abs_error << std::endl;
136123
std::cout << std::endl;
137124

138-
if(abs_error > 1.0e-3) {
125+
if(abs_error > 1.0e-4) {
139126
std::cout << "TEST FAILED" << std::endl;
140127
return 1;
141128
}

0 commit comments

Comments
 (0)