Commit 1bdf0ee

[Libraries/MPI]: Fixed omp decomposition in jacobian samples
Parent: 605cd2f

File tree: 5 files changed, +34 -28 lines

Libraries/MPI/jacobian_solver/src/02_jacobian_device_mpi_one-sided_gpu_aware/mpi3_onesided_jacobian_gpu_openmp.c (8 additions, 5 deletions)

@@ -47,8 +47,8 @@ int main(int argc, char *argv[])
     /* Timestamp start time to measure overall execution time */
     BEGIN_PROFILING
     /* Main computation loop partly offloaded to the device:
-     * "#pragma omp target data clauser" map the data to the device memory for a following code region*/
-#pragma omp target data map(to: Niter, my_subarray, win[0:2], NormIteration) use_device_ptr(b1,b2)
+     * "#pragma omp target data" maps the data to the device memory for a following code region*/
+#pragma omp target data map(to: my_subarray) use_device_ptr(b1,b2)
     {
         for (int passed_iters = 0; passed_iters < Niter; passed_iters += iterations_batch) {
             /* Perfrom a batch of iterations before checking norm */
@@ -60,11 +60,13 @@ int main(int argc, char *argv[])
             MPI_Win current_win = win[(i + 1) % 2];

             /* Offload compute loop to the device:
-             * "#pragma omp target teams distribute parallel for" offloads the loop to the device
+             * "#pragma omp target" offloads the code to the device
+             * "#pragma omp parallel for" parallelizes the loop on the device using single team
              *
              * NOTE: For simplification and unification across samples we use single team
              * to avoid extra syncronization across teams in the future */
-#pragma omp target teams distribute parallel for is_device_ptr(in, out) num_teams(1)
+#pragma omp target is_device_ptr(in, out) thread_limit(1024)
+#pragma omp parallel loop
             /* Calculate values on borders to initiate communications early */
             for (int column = 0; column < my_subarray.x_size; ++column) {
                 RECALCULATE_POINT(out, in, column, 0, row_size);
@@ -87,7 +89,8 @@ int main(int argc, char *argv[])
             }

             /* Offload compute loop to the device */
-#pragma omp target teams distribute parallel for is_device_ptr(in, out) collapse(2) num_teams(1)
+#pragma omp target is_device_ptr(in, out) thread_limit(1024)
+#pragma omp parallel loop collapse(2)
             /* Recalculate internal points in parallel with communication */
             for (int row = 1; row < my_subarray.y_size - 1; ++row) {
                 for (int column = 0; column < my_subarray.x_size; ++column) {
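Note: this change splits the combined "target teams distribute parallel for ... num_teams(1)" construct into a plain "target" region (one implicit team, its size bounded by thread_limit rather than the team count by num_teams) followed by "parallel loop". A minimal self-contained sketch of the resulting pattern, assuming an OpenMP 5.x offload compiler; buffer names and sizes are illustrative, not taken from the sample:

#include <stdio.h>

#define N 1024

int main(void)
{
    static double a[N], b[N];
    for (int i = 0; i < N; ++i) { a[i] = (double)i; b[i] = 0.0; }

    /* Keep both buffers resident on the device for the whole region */
    #pragma omp target data map(tofrom: a, b)
    {
        /* One implicit team on the device, capped at 1024 threads;
         * "parallel loop" then spreads the iterations across that team */
        #pragma omp target thread_limit(1024)
        #pragma omp parallel loop
        for (int i = 1; i < N - 1; ++i)
            b[i] = 0.5 * (a[i - 1] + a[i + 1]); /* Jacobi-style update */
    }

    printf("b[1] = %f\n", b[1]);
    return 0;
}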

Libraries/MPI/jacobian_solver/src/03_jacobian_device_mpi_one-sided_device_initiated/mpi3_onesided_jacobian_gpu_omp_device_initiated.c (6 additions, 6 deletions)

@@ -54,16 +54,16 @@ int main(int argc, char *argv[])
     /* Timestamp start time to measure overall execution time */
     BEGIN_PROFILING
     /* Main computation loop offloaded to the device:
-     * "#pragma omp target data clauser" map the data to the device memory for a following code region*/
-#pragma omp target data map(to: Niter, my_subarray, win[0:2], NormIteration) use_device_ptr(b1, b2)
+     * "#pragma omp target data" maps the data to the device memory for a following code region*/
+#pragma omp target data map(to: iterations_batch, my_subarray, win[0:2]) use_device_ptr(b1, b2)
     {
         for (int passed_iters = 0; passed_iters < Niter; passed_iters += iterations_batch) {
             /* Offload compute loop to the device:
-             * "#pragma omp target teams" start a target region with a single team
+             * "#pragma omp target" start a target region with a single team
              *
              * NOTE: For simplification and unification across samples we use single team
              * to avoid extra syncronization across teams in the future */
-#pragma omp target teams num_teams(1)
+#pragma omp target thread_limit(1024)
             {
                 for (int k = 0; k < iterations_batch; ++k)
                 {
@@ -73,7 +73,7 @@ int main(int argc, char *argv[])
                     MPI_Win current_win = win[(i + 1) % 2];

                     /* Start parallel loop on the device, to accelerate a calculation */
-#pragma omp parallel for simd
+#pragma omp parallel loop
                     /* Calculate values on borders to initiate communications early */
                     for (int column = 0; column < my_subarray.x_size; column ++) {
                         RECALCULATE_POINT(out, in, column, 0, row_size);
@@ -102,7 +102,7 @@ int main(int argc, char *argv[])


                     /* Start parallel loop on the device, to accelerate a calculation */
-#pragma omp parallel for simd collapse(2)
+#pragma omp parallel loop collapse(2)
                     /* Recalculate internal points in parallel with communication */
                     for (int row = 1; row < my_subarray.y_size - 1; ++row) {
                         for (int column = 0; column < my_subarray.x_size; ++column) {
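Note: in this device-initiated variant the whole batch of sweeps lives inside a single "target" region, so control never returns to the host between iterations; the serial outer loop runs on one device thread and each sweep fans out through "parallel loop". A hedged sketch of that control flow (no MPI; names and sizes are illustrative, not from the sample):

#include <stdio.h>

#define N     1024
#define BATCH 100

int main(void)
{
    static double a[N], b[N];
    for (int i = 0; i < N; ++i) { a[i] = (double)i; b[i] = (double)i; }

    #pragma omp target data map(tofrom: a, b)
    {
        /* One target region hosts the entire batch of sweeps */
        #pragma omp target thread_limit(1024)
        {
            for (int k = 0; k < BATCH; ++k) {
                double *in  = (k % 2) ? b : a;  /* swap buffer roles each sweep */
                double *out = (k % 2) ? a : b;

                #pragma omp parallel loop
                for (int i = 1; i < N - 1; ++i)
                    out[i] = 0.5 * (in[i - 1] + in[i + 1]);
            }
        }
    }

    printf("a[%d] = %f\n", N / 2, a[N / 2]);
    return 0;
}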

Libraries/MPI/jacobian_solver/src/04_jacobian_device_mpi_one-sided_device_initiated_notify/mpi3_onesided_jacobian_gpu_omp_device_initiated_notify.c (9 additions, 8 deletions)

@@ -48,8 +48,8 @@ int main(int argc, char *argv[])
     InitSubarryAndWindows(&my_subarray, buffs, win, "device", true);

     /* Enable notification counters */
-    MPI_Win_notify_attach(win[0], 1, MPI_INFO_NULL);
-    MPI_Win_notify_attach(win[1], 1, MPI_INFO_NULL);
+    MPI_Win_notify_set_num(win[0], MPI_INFO_NULL, 1);
+    MPI_Win_notify_set_num(win[1], MPI_INFO_NULL, 1);
     /* Start RMA exposure epoch */
     MPI_Win_lock_all(0, win[0]);
     MPI_Win_lock_all(0, win[1]);
@@ -67,16 +67,16 @@ int main(int argc, char *argv[])
     /* Timestamp start time to measure overall execution time */
     BEGIN_PROFILING
     /* Main computation loop offloaded to the device:
-     * "#pragma omp target data clauser" map the data to the device memory for a following code region*/
-#pragma omp target data map(to: Niter, my_subarray, win[0:2], NormIteration, iter_counter_step) use_device_ptr(b1, b2)
+     * "#pragma omp target data" maps the data to the device memory for a following code region*/
+#pragma omp target data map(to: my_subarray, win[0:2], iterations_batch, iter_counter_step) use_device_ptr(b1, b2)
     {
         for (int passed_iters = 0; passed_iters < Niter; passed_iters += iterations_batch) {
             /* Offload compute loop to the device:
              * "#pragma omp target teams" start a target region with a single team
              *
              * NOTE: For simplification and unification across samples we use single team
              * to avoid extra syncronization across teams in the future */
-#pragma omp target teams num_teams(1)
+#pragma omp target thread_limit(1024)
             {
                 for (int k = 0; k < iterations_batch; ++k)
                 {
@@ -94,14 +94,15 @@ int main(int argc, char *argv[])
                      * To be completely standard compliant, application should check memory model
                      * and call MPI_Win_sync(prev_win) in case of MPI_WIN_SEPARATE mode after notification has been recieved.
                      * Although, IntelMPI uses MPI_WIN_UNIFIED memory model, so this call could be omitted.
-                     */
+                      */
                     MPI_Count c = 0;
+                    MPI_Win_flush_local_all(current_win);
                     while (c < (iter_counter_step*i)) {
                         MPI_Win_notify_get_value(prev_win, 0, &c);
                     }

                     /* Start parallel loop on the device, to accelerate a calculation */
-#pragma omp parallel for simd
+#pragma omp parallel for
                     /* Calculate values on borders to initiate communications early */
                     for (int column = 0; column < my_subarray.x_size; column ++) {
                         RECALCULATE_POINT(out, in, column, 0, row_size);
@@ -133,7 +134,7 @@ int main(int argc, char *argv[])


                     /* Start parallel loop on the device, to accelerate a calculation */
-#pragma omp parallel for simd collapse(2)
+#pragma omp parallel for collapse(2)
                     /* Recalculate internal points in parallel with communication */
                     for (int row = 1; row < my_subarray.y_size - 1; ++row) {
                         for (int column = 0; column < my_subarray.x_size; ++column) {
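Note: the added MPI_Win_flush_local_all locally completes this rank's outstanding RMA on the outgoing window before it starts polling the notification counter of the incoming one, so its own halo puts are already in flight when it blocks. An annotated restatement of the diff's receiver-side wait, using the sample's own names (not a standalone program; MPI_Win_notify_* is mapped by the mpix_compat.h shim shown below):

MPI_Count c = 0;

/* Locally complete this rank's outstanding puts on the outgoing window */
MPI_Win_flush_local_all(current_win);

/* Spin on notification counter 0 of the incoming window until the peer
 * has signalled the expected number of puts for iteration i */
while (c < (iter_counter_step * i))
    MPI_Win_notify_get_value(prev_win, 0, &c);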

Libraries/MPI/jacobian_solver/src/04_jacobian_device_mpi_one-sided_device_initiated_notify/mpi3_onesided_jacobian_gpu_sycl_device_initiated_notify.cpp (4 additions, 3 deletions)

@@ -35,7 +35,7 @@ int main(int argc, char *argv[])
         fprintf(stderr, "MPI_THREAD_MULTIPLE is required for this sample\n");
         MPI_Abort(MPI_COMM_WORLD, -1);
     }
-
+
     /* Initialize subarray owned by current process
      * and create RMA-windows for MPI-3 one-sided communications.
      * - For this sample, we use GPU memory for buffers and windows.
@@ -56,8 +56,8 @@ int main(int argc, char *argv[])
 #endif

     /* Enable notification counters */
-    MPI_Win_notify_attach(win[0], 1, MPI_INFO_NULL);
-    MPI_Win_notify_attach(win[1], 1, MPI_INFO_NULL);
+    MPI_Win_notify_set_num(win[0], MPI_INFO_NULL, 1);
+    MPI_Win_notify_set_num(win[1], MPI_INFO_NULL, 1);
     /* Start RMA exposure epoch */
     MPI_Win_lock_all(0, win[0]);
     MPI_Win_lock_all(0, win[1]);
@@ -141,6 +141,7 @@ int main(int argc, char *argv[])
                 item.barrier(sycl::access::fence_space::global_space);
                 if (id == 0) {
                     MPI_Count c = 0;
+                    MPI_Win_flush_all(current_win);
                     /* Wait till the moment counter would reach expected value */
                     while (c < c_expected) MPI_Win_notify_get_value(current_win, 0, &c);
                     /* Reset counter value to 0 */
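Note: the SYCL variant adds MPI_Win_flush_all where the OpenMP variant above adds MPI_Win_flush_local_all; both precede the same counter-polling loop but guarantee different completion levels. A short host-side sketch of the distinction (win is an illustrative window; standard MPI-3 semantics):

/* Local completion only: this rank's buffers may be reused, but the data
 * is not guaranteed to be visible at the targets yet */
MPI_Win_flush_local_all(win);

/* Remote completion: all outstanding RMA operations on win are also
 * complete at their targets */
MPI_Win_flush_all(win);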

Libraries/MPI/jacobian_solver/src/04_jacobian_device_mpi_one-sided_device_initiated_notify/mpix_compat.h (7 additions, 6 deletions)

@@ -9,12 +9,13 @@
 #define MPI_ERR_INVALID_NOTIFICATION MPI_ERR_OTHER

 /* int MPI_Win_notify_attach(MPI_Win win, int notification_num, MPI_Info info); */
-#define MPI_Win_notify_attach(win, notification_num, info) \
-    MPIX_Win_create_notify(win, notification_num)
-
-/* int MPI_Win_notify_detach(MPI_Win win); */
-#define MPI_Win_notify_detach(win) \
-    MPIX_Win_free_notify(win)
+static inline int MPI_Win_notify_set_num(MPI_Win win, MPI_Info info, int notification_num)
+{
+    if (notification_num == 0) {
+        return MPIX_Win_free_notify(win);
+    }
+    return MPIX_Win_create_notify(win, notification_num);
+}

 /* int MPI_Win_notify_get_value(MPI_Win win, int notification_idx, MPI_Count *value) */
 #define MPI_Win_notify_get_value(win, notification_idx, value) \
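Note: with this shim, call sites migrate mechanically to the new single entry point, and a notification count of 0 takes over the detach role that MPI_Win_notify_detach used to play. A hypothetical before/after usage sketch (win is an existing RMA window):

/* Enable one notification counter on the window ... */
MPI_Win_notify_set_num(win, MPI_INFO_NULL, 1);  /* was MPI_Win_notify_attach(win, 1, MPI_INFO_NULL) */

/* ... and later tear it down by setting the count back to 0 */
MPI_Win_notify_set_num(win, MPI_INFO_NULL, 0);  /* was MPI_Win_notify_detach(win) */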
