|
7 | 7 | #include <omp.h> |
8 | 8 | #endif |
9 | 9 |
|
| 10 | +#if defined(GPU_OPENACC) |
| 11 | + #include <openacc.h> |
| 12 | +#endif |
| 13 | + |
10 | 14 | // use single (float) or double precision |
11 | 15 | // according to the value passed in the compilation cmd |
12 | 16 | #if defined(FLOAT) |
@@ -75,6 +79,31 @@ double forward(f_type *u, f_type *velocity, f_type *damp, |
75 | 79 | #pragma omp target enter data map(to: receivers[:shot_record_size]) |
76 | 80 | #endif |
77 | 81 |
|
| 82 | + #ifdef GPU_OPENACC |
| 83 | + |
| 84 | + // select the device |
| 85 | + #ifdef DEVICEID |
| 86 | + acc_init(acc_device_nvidia); |
| 87 | + acc_set_device_num(DEVICEID, acc_device_nvidia); |
| 88 | + #endif |
| 89 | + |
| 90 | + size_t shot_record_size = wavelet_size * num_receivers; |
| 91 | + size_t u_size = num_snapshots * domain_size; |
| 92 | + |
| 93 | + #pragma acc enter data copyin(u[:u_size]) |
| 94 | + #pragma acc enter data copyin(velocity[:domain_size]) |
| 95 | + #pragma acc enter data copyin(damp[:domain_size]) |
| 96 | + #pragma acc enter data copyin(coeff[:stencil_radius+1]) |
| 97 | + #pragma acc enter data copyin(src_points_interval[:src_points_interval_size]) |
| 98 | + #pragma acc enter data copyin(src_points_values[:src_points_values_size]) |
| 99 | + #pragma acc enter data copyin(src_points_values_offset[:num_sources]) |
| 100 | + #pragma acc enter data copyin(rec_points_interval[:rec_points_interval_size]) |
| 101 | + #pragma acc enter data copyin(rec_points_values[:rec_points_values_size]) |
| 102 | + #pragma acc enter data copyin(rec_points_values_offset[:num_receivers]) |
| 103 | + #pragma acc enter data copyin(wavelet[:wavelet_size * wavelet_count]) |
| 104 | + #pragma acc enter data copyin(receivers[:shot_record_size]) |
| 105 | + #endif |
| 106 | + |
78 | 107 | // wavefield modeling |
79 | 108 | for(size_t n = begin_timestep; n <= end_timestep; n++) { |
80 | 109 |
|
@@ -104,6 +133,10 @@ double forward(f_type *u, f_type *velocity, f_type *damp, |
104 | 133 | #pragma omp target teams distribute parallel for collapse(2) |
105 | 134 | #endif |
106 | 135 |
|
| 136 | + #ifdef GPU_OPENACC |
| 137 | + #pragma acc parallel loop collapse(2) present(coeff,damp,u,velocity) |
| 138 | + #endif |
| 139 | + |
107 | 140 | for(size_t i = stencil_radius; i < nz - stencil_radius; i++) { |
108 | 141 | for(size_t j = stencil_radius; j < nx - stencil_radius; j++) { |
109 | 142 | // index of the current point in the grid |
@@ -151,6 +184,10 @@ double forward(f_type *u, f_type *velocity, f_type *damp, |
151 | 184 | #pragma omp target teams distribute parallel for |
152 | 185 | #endif |
153 | 186 |
|
| 187 | + #ifdef GPU_OPENACC |
| 188 | + #pragma acc parallel loop present(src_points_interval,src_points_values,src_points_values_offset,u,velocity,wavelet) |
| 189 | + #endif |
| 190 | + |
154 | 191 | // for each source |
155 | 192 | for(size_t src = 0; src < num_sources; src++){ |
156 | 193 |
|
@@ -201,6 +238,10 @@ double forward(f_type *u, f_type *velocity, f_type *damp, |
201 | 238 | #if defined(CPU_OPENMP) || defined(GPU_OPENMP) |
202 | 239 | #pragma omp atomic |
203 | 240 | #endif |
| 241 | + |
| 242 | + #ifdef GPU_OPENACC |
| 243 | + #pragma acc atomic update |
| 244 | + #endif |
204 | 245 | u[next_snapshot] += value; |
205 | 246 |
|
206 | 247 | kws_index_x++; |
@@ -230,6 +271,11 @@ double forward(f_type *u, f_type *velocity, f_type *damp, |
230 | 271 | #ifdef GPU_OPENMP |
231 | 272 | #pragma omp target teams distribute parallel for |
232 | 273 | #endif |
| 274 | + |
| 275 | + #ifdef GPU_OPENACC |
| 276 | + #pragma acc parallel loop present(u) |
| 277 | + #endif |
| 278 | + |
233 | 279 | for(size_t i = stencil_radius; i < nz - stencil_radius; i++){ |
234 | 280 |
|
235 | 281 | // null dirichlet on the left |
@@ -274,6 +320,11 @@ double forward(f_type *u, f_type *velocity, f_type *damp, |
274 | 320 | #ifdef GPU_OPENMP |
275 | 321 | #pragma omp target teams distribute parallel for |
276 | 322 | #endif |
| 323 | + |
| 324 | + #ifdef GPU_OPENACC |
| 325 | + #pragma acc parallel loop present(u) |
| 326 | + #endif |
| 327 | + |
277 | 328 | for(size_t j = stencil_radius; j < nx - stencil_radius; j++){ |
278 | 329 |
|
279 | 330 | // null dirichlet on the top |
@@ -322,6 +373,10 @@ double forward(f_type *u, f_type *velocity, f_type *damp, |
322 | 373 | #pragma omp target teams distribute parallel for |
323 | 374 | #endif |
324 | 375 |
|
| 376 | + #ifdef GPU_OPENACC |
| 377 | + #pragma acc parallel loop present(rec_points_interval,rec_points_values,rec_points_values_offset,u,receivers) |
| 378 | + #endif |
| 379 | + |
325 | 380 | // for each receiver |
326 | 381 | for(size_t rec = 0; rec < num_receivers; rec++){ |
327 | 382 |
|
@@ -391,7 +446,11 @@ double forward(f_type *u, f_type *velocity, f_type *damp, |
391 | 446 | #endif |
392 | 447 |
|
393 | 448 | #ifdef GPU_OPENMP |
394 | | - #pragma omp target teams distribute parallel for |
| 449 | + #pragma omp target teams distribute parallel for collapse(2) |
| 450 | + #endif |
| 451 | + |
| 452 | + #ifdef GPU_OPENACC |
| 453 | + #pragma acc parallel loop collapse(2) present(u) |
395 | 454 | #endif |
396 | 455 |
|
397 | 456 | // exchange of values required |
@@ -435,6 +494,25 @@ double forward(f_type *u, f_type *velocity, f_type *damp, |
435 | 494 | #pragma omp target exit data map(delete: wavelet[:wavelet_size * wavelet_count]) |
436 | 495 | #endif |
437 | 496 |
|
| 497 | + #ifdef GPU_OPENACC |
| 498 | + #pragma acc exit data copyout(receivers[:shot_record_size]) |
| 499 | + #pragma acc exit data copyout(u[:u_size]) |
| 500 | + |
| 501 | + #pragma acc exit data delete(receivers[:shot_record_size]) |
| 502 | + #pragma acc exit data delete(u[:u_size]) |
| 503 | + |
| 504 | + #pragma acc exit data delete(velocity[:domain_size]) |
| 505 | + #pragma acc exit data delete(damp[:domain_size]) |
| 506 | + #pragma acc exit data delete(coeff[:stencil_radius+1]) |
| 507 | + #pragma acc exit data delete(src_points_interval[:src_points_interval_size]) |
| 508 | + #pragma acc exit data delete(src_points_values[:src_points_values_size]) |
| 509 | + #pragma acc exit data delete(src_points_values_offset[:num_sources]) |
| 510 | + #pragma acc exit data delete(rec_points_interval[:rec_points_interval_size]) |
| 511 | + #pragma acc exit data delete(rec_points_values[:rec_points_values_size]) |
| 512 | + #pragma acc exit data delete(rec_points_values_offset[:num_receivers]) |
| 513 | + #pragma acc exit data delete(wavelet[:wavelet_size * wavelet_count]) |
| 514 | + #endif |
| 515 | + |
438 | 516 | // get the end time |
439 | 517 | gettimeofday(&time_end, NULL); |
440 | 518 |
|
|
0 commit comments