Commit 1bdf0ee

[Libraries/MPI]: Fixed omp decomposition in jacobian samples
Parent: 605cd2f

File tree: 5 files changed, +34 -28 lines

Libraries/MPI/jacobian_solver/src/02_jacobian_device_mpi_one-sided_gpu_aware/mpi3_onesided_jacobian_gpu_openmp.c (8 additions, 5 deletions)

@@ -47,8 +47,8 @@ int main(int argc, char *argv[])
     /* Timestamp start time to measure overall execution time */
     BEGIN_PROFILING
     /* Main computation loop partly offloaded to the device:
-     * "#pragma omp target data clauser" map the data to the device memory for a following code region*/
-#pragma omp target data map(to: Niter, my_subarray, win[0:2], NormIteration) use_device_ptr(b1,b2)
+     * "#pragma omp target data" maps the data to the device memory for a following code region*/
+#pragma omp target data map(to: my_subarray) use_device_ptr(b1,b2)
     {
         for (int passed_iters = 0; passed_iters < Niter; passed_iters += iterations_batch) {
             /* Perfrom a batch of iterations before checking norm */
@@ -60,11 +60,13 @@ int main(int argc, char *argv[])
             MPI_Win current_win = win[(i + 1) % 2];

             /* Offload compute loop to the device:
-             * "#pragma omp target teams distribute parallel for" offloads the loop to the device
+             * "#pragma omp target" offloads the code to the device
+             * "#pragma omp parallel for" parallelizes the loop on the device using single team
              *
              * NOTE: For simplification and unification across samples we use single team
              * to avoid extra syncronization across teams in the future */
-#pragma omp target teams distribute parallel for is_device_ptr(in, out) num_teams(1)
+#pragma omp target is_device_ptr(in, out) thread_limit(1024)
+#pragma omp parallel loop
             /* Calculate values on borders to initiate communications early */
             for (int column = 0; column < my_subarray.x_size; ++column) {
                 RECALCULATE_POINT(out, in, column, 0, row_size);
@@ -87,7 +89,8 @@ int main(int argc, char *argv[])
             }

             /* Offload compute loop to the device */
-#pragma omp target teams distribute parallel for is_device_ptr(in, out) collapse(2) num_teams(1)
+#pragma omp target is_device_ptr(in, out) thread_limit(1024)
+#pragma omp parallel loop collapse(2)
             /* Recalculate internal points in parallel with communication */
             for (int row = 1; row < my_subarray.y_size - 1; ++row) {
                 for (int column = 0; column < my_subarray.x_size; ++column) {
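Note: this change splits the combined "target teams distribute parallel for ... num_teams(1)" construct into a plain "target" region (one implicit team, its size bounded by thread_limit rather than the team count by num_teams) followed by "parallel loop". A minimal self-contained sketch of the resulting pattern, assuming an OpenMP 5.x offload compiler; buffer names and sizes are illustrative, not taken from the sample:

#include <stdio.h>

#define N 1024

int main(void)
{
    static double a[N], b[N];
    for (int i = 0; i < N; ++i) { a[i] = (double)i; b[i] = 0.0; }

    /* Keep both buffers resident on the device for the whole region */
    #pragma omp target data map(tofrom: a, b)
    {
        /* One implicit team on the device, capped at 1024 threads;
         * "parallel loop" then spreads the iterations across that team */
        #pragma omp target thread_limit(1024)
        #pragma omp parallel loop
        for (int i = 1; i < N - 1; ++i)
            b[i] = 0.5 * (a[i - 1] + a[i + 1]); /* Jacobi-style update */
    }

    printf("b[1] = %f\n", b[1]);
    return 0;
}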

Libraries/MPI/jacobian_solver/src/03_jacobian_device_mpi_one-sided_device_initiated/mpi3_onesided_jacobian_gpu_omp_device_initiated.c (6 additions, 6 deletions)

@@ -54,16 +54,16 @@ int main(int argc, char *argv[])
     /* Timestamp start time to measure overall execution time */
     BEGIN_PROFILING
     /* Main computation loop offloaded to the device:
-     * "#pragma omp target data clauser" map the data to the device memory for a following code region*/
-#pragma omp target data map(to: Niter, my_subarray, win[0:2], NormIteration) use_device_ptr(b1, b2)
+     * "#pragma omp target data" maps the data to the device memory for a following code region*/
+#pragma omp target data map(to: iterations_batch, my_subarray, win[0:2]) use_device_ptr(b1, b2)
     {
         for (int passed_iters = 0; passed_iters < Niter; passed_iters += iterations_batch) {
             /* Offload compute loop to the device:
-             * "#pragma omp target teams" start a target region with a single team
+             * "#pragma omp target" start a target region with a single team
              *
              * NOTE: For simplification and unification across samples we use single team
              * to avoid extra syncronization across teams in the future */
-#pragma omp target teams num_teams(1)
+#pragma omp target thread_limit(1024)
             {
                 for (int k = 0; k < iterations_batch; ++k)
                 {
@@ -73,7 +73,7 @@ int main(int argc, char *argv[])
                     MPI_Win current_win = win[(i + 1) % 2];

                     /* Start parallel loop on the device, to accelerate a calculation */
-#pragma omp parallel for simd
+#pragma omp parallel loop
                     /* Calculate values on borders to initiate communications early */
                     for (int column = 0; column < my_subarray.x_size; column ++) {
                         RECALCULATE_POINT(out, in, column, 0, row_size);
@@ -102,7 +102,7 @@ int main(int argc, char *argv[])


                     /* Start parallel loop on the device, to accelerate a calculation */
-#pragma omp parallel for simd collapse(2)
+#pragma omp parallel loop collapse(2)
                     /* Recalculate internal points in parallel with communication */
                     for (int row = 1; row < my_subarray.y_size - 1; ++row) {
                         for (int column = 0; column < my_subarray.x_size; ++column) {
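Note: in this device-initiated variant the whole batch of sweeps lives inside a single "target" region, so control never returns to the host between iterations; the serial outer loop runs on one device thread and each sweep fans out through "parallel loop". A hedged sketch of that control flow (no MPI; names and sizes are illustrative, not from the sample):

#include <stdio.h>

#define N     1024
#define BATCH 100

int main(void)
{
    static double a[N], b[N];
    for (int i = 0; i < N; ++i) { a[i] = (double)i; b[i] = (double)i; }

    #pragma omp target data map(tofrom: a, b)
    {
        /* One target region hosts the entire batch of sweeps */
        #pragma omp target thread_limit(1024)
        {
            for (int k = 0; k < BATCH; ++k) {
                double *in  = (k % 2) ? b : a;  /* swap buffer roles each sweep */
                double *out = (k % 2) ? a : b;

                #pragma omp parallel loop
                for (int i = 1; i < N - 1; ++i)
                    out[i] = 0.5 * (in[i - 1] + in[i + 1]);
            }
        }
    }

    printf("a[%d] = %f\n", N / 2, a[N / 2]);
    return 0;
}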

Libraries/MPI/jacobian_solver/src/04_jacobian_device_mpi_one-sided_device_initiated_notify/mpi3_onesided_jacobian_gpu_omp_device_initiated_notify.c (9 additions, 8 deletions)

@@ -48,8 +48,8 @@ int main(int argc, char *argv[])
     InitSubarryAndWindows(&my_subarray, buffs, win, "device", true);

     /* Enable notification counters */
-    MPI_Win_notify_attach(win[0], 1, MPI_INFO_NULL);
-    MPI_Win_notify_attach(win[1], 1, MPI_INFO_NULL);
+    MPI_Win_notify_set_num(win[0], MPI_INFO_NULL, 1);
+    MPI_Win_notify_set_num(win[1], MPI_INFO_NULL, 1);
     /* Start RMA exposure epoch */
     MPI_Win_lock_all(0, win[0]);
     MPI_Win_lock_all(0, win[1]);
@@ -67,16 +67,16 @@ int main(int argc, char *argv[])
     /* Timestamp start time to measure overall execution time */
     BEGIN_PROFILING
     /* Main computation loop offloaded to the device:
-     * "#pragma omp target data clauser" map the data to the device memory for a following code region*/
-#pragma omp target data map(to: Niter, my_subarray, win[0:2], NormIteration, iter_counter_step) use_device_ptr(b1, b2)
+     * "#pragma omp target data" maps the data to the device memory for a following code region*/
+#pragma omp target data map(to: my_subarray, win[0:2], iterations_batch, iter_counter_step) use_device_ptr(b1, b2)
     {
         for (int passed_iters = 0; passed_iters < Niter; passed_iters += iterations_batch) {
             /* Offload compute loop to the device:
              * "#pragma omp target teams" start a target region with a single team
              *
              * NOTE: For simplification and unification across samples we use single team
              * to avoid extra syncronization across teams in the future */
-#pragma omp target teams num_teams(1)
+#pragma omp target thread_limit(1024)
             {
                 for (int k = 0; k < iterations_batch; ++k)
                 {
@@ -94,14 +94,15 @@ int main(int argc, char *argv[])
                      * To be completely standard compliant, application should check memory model
                      * and call MPI_Win_sync(prev_win) in case of MPI_WIN_SEPARATE mode after notification has been recieved.
                      * Although, IntelMPI uses MPI_WIN_UNIFIED memory model, so this call could be omitted.
-                     */
+                      */
                     MPI_Count c = 0;
+                    MPI_Win_flush_local_all(current_win);
                     while (c < (iter_counter_step*i)) {
                         MPI_Win_notify_get_value(prev_win, 0, &c);
                     }

                     /* Start parallel loop on the device, to accelerate a calculation */
-#pragma omp parallel for simd
+#pragma omp parallel for
                     /* Calculate values on borders to initiate communications early */
                     for (int column = 0; column < my_subarray.x_size; column ++) {
                         RECALCULATE_POINT(out, in, column, 0, row_size);
@@ -133,7 +134,7 @@ int main(int argc, char *argv[])


                     /* Start parallel loop on the device, to accelerate a calculation */
-#pragma omp parallel for simd collapse(2)
+#pragma omp parallel for collapse(2)
                     /* Recalculate internal points in parallel with communication */
                     for (int row = 1; row < my_subarray.y_size - 1; ++row) {
                         for (int column = 0; column < my_subarray.x_size; ++column) {
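Note: the added MPI_Win_flush_local_all locally completes this rank's outstanding RMA on the outgoing window before it starts polling the notification counter of the incoming one, so its own halo puts are already in flight when it blocks. An annotated restatement of the diff's receiver-side wait, using the sample's own names (not a standalone program; MPI_Win_notify_* is mapped by the mpix_compat.h shim shown below):

MPI_Count c = 0;

/* Locally complete this rank's outstanding puts on the outgoing window */
MPI_Win_flush_local_all(current_win);

/* Spin on notification counter 0 of the incoming window until the peer
 * has signalled the expected number of puts for iteration i */
while (c < (iter_counter_step * i))
    MPI_Win_notify_get_value(prev_win, 0, &c);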

Libraries/MPI/jacobian_solver/src/04_jacobian_device_mpi_one-sided_device_initiated_notify/mpi3_onesided_jacobian_gpu_sycl_device_initiated_notify.cpp (4 additions, 3 deletions)

@@ -35,7 +35,7 @@ int main(int argc, char *argv[])
         fprintf(stderr, "MPI_THREAD_MULTIPLE is required for this sample\n");
         MPI_Abort(MPI_COMM_WORLD, -1);
     }
-
+
     /* Initialize subarray owned by current process
      * and create RMA-windows for MPI-3 one-sided communications.
      * - For this sample, we use GPU memory for buffers and windows.
@@ -56,8 +56,8 @@ int main(int argc, char *argv[])
 #endif

     /* Enable notification counters */
-    MPI_Win_notify_attach(win[0], 1, MPI_INFO_NULL);
-    MPI_Win_notify_attach(win[1], 1, MPI_INFO_NULL);
+    MPI_Win_notify_set_num(win[0], MPI_INFO_NULL, 1);
+    MPI_Win_notify_set_num(win[1], MPI_INFO_NULL, 1);
     /* Start RMA exposure epoch */
     MPI_Win_lock_all(0, win[0]);
     MPI_Win_lock_all(0, win[1]);
@@ -141,6 +141,7 @@ int main(int argc, char *argv[])
                 item.barrier(sycl::access::fence_space::global_space);
                 if (id == 0) {
                     MPI_Count c = 0;
+                    MPI_Win_flush_all(current_win);
                     /* Wait till the moment counter would reach expected value */
                     while (c < c_expected) MPI_Win_notify_get_value(current_win, 0, &c);
                     /* Reset counter value to 0 */
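Note: the SYCL variant adds MPI_Win_flush_all where the OpenMP variant above adds MPI_Win_flush_local_all; both precede the same counter-polling loop but guarantee different completion levels. A short host-side sketch of the distinction (win is an illustrative window; standard MPI-3 semantics):

/* Local completion only: this rank's buffers may be reused, but the data
 * is not guaranteed to be visible at the targets yet */
MPI_Win_flush_local_all(win);

/* Remote completion: all outstanding RMA operations on win are also
 * complete at their targets */
MPI_Win_flush_all(win);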

Libraries/MPI/jacobian_solver/src/04_jacobian_device_mpi_one-sided_device_initiated_notify/mpix_compat.h (7 additions, 6 deletions)

@@ -9,12 +9,13 @@
 #define MPI_ERR_INVALID_NOTIFICATION MPI_ERR_OTHER

 /* int MPI_Win_notify_attach(MPI_Win win, int notification_num, MPI_Info info); */
-#define MPI_Win_notify_attach(win, notification_num, info) \
-    MPIX_Win_create_notify(win, notification_num)
-
-/* int MPI_Win_notify_detach(MPI_Win win); */
-#define MPI_Win_notify_detach(win) \
-    MPIX_Win_free_notify(win)
+static inline int MPI_Win_notify_set_num(MPI_Win win, MPI_Info info, int notification_num)
+{
+    if (notification_num == 0) {
+        return MPIX_Win_free_notify(win);
+    }
+    return MPIX_Win_create_notify(win, notification_num);
+}

 /* int MPI_Win_notify_get_value(MPI_Win win, int notification_idx, MPI_Count *value) */
 #define MPI_Win_notify_get_value(win, notification_idx, value) \
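Note: with this shim, call sites migrate mechanically to the new single entry point, and a notification count of 0 takes over the detach role that MPI_Win_notify_detach used to play. A hypothetical before/after usage sketch (win is an existing RMA window):

/* Enable one notification counter on the window ... */
MPI_Win_notify_set_num(win, MPI_INFO_NULL, 1);  /* was MPI_Win_notify_attach(win, 1, MPI_INFO_NULL) */

/* ... and later tear it down by setting the count back to 0 */
MPI_Win_notify_set_num(win, MPI_INFO_NULL, 0);  /* was MPI_Win_notify_detach(win) */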
