Skip to content

Commit 80163a0

Browse files
committed
reworked monte carlo pi
1 parent a862868 commit 80163a0

File tree

3 files changed

+35
-57
lines changed

3 files changed

+35
-57
lines changed

Libraries/oneMKL/monte_carlo_pi/mc_pi.cpp

Lines changed: 16 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ static const auto pi = 3.1415926535897932384626433832795;
2929
static const auto seed = 7777;
3030

3131
// Default Number of 2D points
32-
static const auto n_samples = 120000000;
32+
static const auto n_samples = 120'000'000;
3333

3434
double estimate_pi(sycl::queue& q, size_t n_points) {
3535
double estimated_pi; // Estimated value of Pi
@@ -48,37 +48,30 @@ double estimate_pi(sycl::queue& q, size_t n_points) {
4848
mkl::rng::generate(distr, engine, n_points * 2, rng_buf);
4949

5050
// Step 2. Count points under curve (x ^ 2 + y ^ 2 < 1.0f)
51-
size_t wg_size = std::min(q.get_device().get_info<sycl::info::device::max_work_group_size>(), n_points);
52-
size_t max_compute_units = q.get_device().get_info<sycl::info::device::max_compute_units>();
53-
size_t wg_num = (n_points > wg_size * max_compute_units) ? max_compute_units : 1;
54-
55-
size_t count_per_thread = n_points / (wg_size * wg_num);
56-
57-
std::vector<size_t> count(wg_num);
51+
size_t count_per_thread = 32;
5852

5953
{
60-
sycl::buffer<size_t, 1> count_buf(count);
54+
sycl::buffer<size_t> count_buf{ &n_under_curve, 1 };
6155

6256
q.submit([&] (sycl::handler& h) {
6357
auto rng_acc = rng_buf.template get_access<sycl::access::mode::read>(h);
64-
auto count_acc = count_buf.template get_access<sycl::access::mode::write>(h);
65-
h.parallel_for(sycl::nd_range<1>(wg_size * wg_num, wg_size),
66-
[=](sycl::nd_item<1> item) {
67-
sycl::vec<float, 2> r;
68-
size_t count = 0;
69-
for(int i = 0; i < count_per_thread; i++) {
70-
r.load(i + item.get_global_linear_id() * count_per_thread, rng_acc.template get_multi_ptr<sycl::access::decorated::yes>());
71-
if(sycl::length(r) <= 1.0f) {
72-
count += 1;
58+
auto reductor = sycl::reduction(count_buf, h, size_t(0), std::plus<size_t>());
59+
60+
h.parallel_for(sycl::range<1>(n_points / count_per_thread), reductor,
61+
[=](sycl::item<1> item, auto& sum) {
62+
sycl::vec<float, 2> r;
63+
size_t count = 0;
64+
for(int i = 0; i < count_per_thread; i++) {
65+
r.load(i + item.get_id(0) * count_per_thread, rng_acc.template get_multi_ptr<sycl::access::decorated::yes>());
66+
if(sycl::length(r) <= 1.0f) {
67+
count++;
68+
}
7369
}
74-
}
75-
count_acc[item.get_group_linear_id()] = sycl::reduce_over_group(item.get_group(), count, std::plus<size_t>());
70+
sum += count;
7671
});
7772
});
7873
}
7974

80-
n_under_curve = std::accumulate(count.begin(), count.end(), 0);
81-
8275
// Step 3. Calculate approximated value of Pi
8376
estimated_pi = n_under_curve / ((double)n_points) * 4.0;
8477
return estimated_pi;
@@ -132,7 +125,7 @@ int main(int argc, char ** argv) {
132125
std::cout << "Absolute error = " << abs_error << std::endl;
133126
std::cout << std::endl;
134127

135-
if(abs_error > 1.0e-3) {
128+
if(abs_error > 1.0e-4) {
136129
std::cout << "TEST FAILED" << std::endl;
137130
return 1;
138131
}

Libraries/oneMKL/monte_carlo_pi/mc_pi_device_api.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ static const auto pi = 3.1415926535897932384626433832795;
2828
static const auto seed = 7777;
2929

3030
// Default Number of 2D points
31-
static const auto n_samples = 120000000;
31+
static const auto n_samples = 120'000'000;
3232

3333
double estimate_pi(sycl::queue& q, size_t n_points) {
3434
double estimated_pi; // Estimated value of Pi
@@ -62,7 +62,7 @@ double estimate_pi(sycl::queue& q, size_t n_points) {
6262
r = mkl::rng::device::generate(distr, engine);
6363
// Step 2. Increment counter if point is under curve (x ^ 2 + y ^ 2 < 1.0f)
6464
if(sycl::length(r) <= 1.0f) {
65-
count += 1;
65+
count++;
6666
}
6767
}
6868
atomic_counter.fetch_add(count);

Libraries/oneMKL/monte_carlo_pi/mc_pi_usm.cpp

Lines changed: 17 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ static const auto pi = 3.1415926535897932384626433832795;
2929
static const auto seed = 7777;
3030

3131
// Default Number of 2D points
32-
static const auto n_samples = 120000000;
32+
static const auto n_samples = 120'000'000;
3333

3434
double estimate_pi(sycl::queue& q, size_t n_points) {
3535
double estimated_pi; // Estimated value of Pi
@@ -48,41 +48,26 @@ double estimate_pi(sycl::queue& q, size_t n_points) {
4848
auto event = mkl::rng::generate(distr, engine, n_points * 2, rng_ptr);
4949

5050
// Step 2. Count points under curve (x ^ 2 + y ^ 2 < 1.0f)
51-
size_t wg_size = std::min(q.get_device().get_info<sycl::info::device::max_work_group_size>(), n_points);
52-
size_t max_compute_units = q.get_device().get_info<sycl::info::device::max_compute_units>();
53-
size_t wg_num = (n_points > wg_size * max_compute_units) ? max_compute_units : 1;
54-
55-
size_t count_per_thread = n_points / (wg_size * wg_num);
56-
57-
size_t* count_ptr = sycl::malloc_shared<size_t>(wg_num, q);
58-
59-
// Make sure, that generation is finished
60-
event.wait_and_throw();
61-
62-
event = q.submit([&] (sycl::handler& h) {
63-
h.parallel_for(sycl::nd_range<1>(wg_size * wg_num, wg_size),
64-
[=](sycl::nd_item<1> item) {
65-
sycl::vec<float, 2> r;
66-
size_t count = 0;
67-
for(int i = 0; i < count_per_thread; i++) {
68-
r.load(i + item.get_global_linear_id() * count_per_thread, sycl::global_ptr<float>(rng_ptr));
69-
if(sycl::length(r) <= 1.0f) {
70-
count += 1;
71-
}
72-
}
73-
count_ptr[item.get_group_linear_id()] = sycl::reduce_over_group(item.get_group(), count, std::plus<size_t>());
74-
});
75-
});
76-
77-
event.wait_and_throw();
78-
79-
n_under_curve = std::accumulate(count_ptr, count_ptr + wg_num, 0);
51+
size_t count_per_thread = 32;
52+
53+
auto reductor = sycl::reduction(&n_under_curve, size_t(0), std::plus<size_t>{});
54+
q.parallel_for(sycl::range<1>(n_points / count_per_thread), event, reductor,
55+
[=](sycl::item<1> item, auto& sum) {
56+
sycl::vec<float, 2> r;
57+
size_t count = 0;
58+
for(int i = 0; i < count_per_thread; i++) {
59+
r.load(i + item.get_id(0) * count_per_thread, sycl::global_ptr<float>(rng_ptr));
60+
if(sycl::length(r) <= 1.0f) {
61+
count++;
62+
}
63+
}
64+
sum += count;
65+
}).wait_and_throw();
8066

8167
// Step 3. Calculate approximated value of Pi
8268
estimated_pi = n_under_curve / ((double)n_points) * 4.0;
8369

8470
sycl::free(rng_ptr, q);
85-
sycl::free(count_ptr, q);
8671

8772
return estimated_pi;
8873

@@ -135,7 +120,7 @@ int main(int argc, char ** argv) {
135120
std::cout << "Absolute error = " << abs_error << std::endl;
136121
std::cout << std::endl;
137122

138-
if(abs_error > 1.0e-3) {
123+
if(abs_error > 1.0e-4) {
139124
std::cout << "TEST FAILED" << std::endl;
140125
return 1;
141126
}

0 commit comments

Comments
 (0)