Skip to content

Commit 1736fa8

Browse files
committed
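Fix OpenACC performance issue: mark the inner stencil, source, receiver and null-Neumann loops as `#pragma acc loop seq` (under GPU_OPENACC)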
1 parent 8a20f26 commit 1736fa8

File tree

4 files changed

+132
-0
lines changed

4 files changed

+132
-0
lines changed

simwave/kernel/backend/c_code/forward/constant_density/2d/wave.c

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,9 @@ double forward(f_type *u, f_type *velocity, f_type *damp,
153153
f_type sum_z = coeff[0] * u[current_snapshot];
154154

155155
// radius of the stencil
156+
#ifdef GPU_OPENACC
157+
#pragma acc loop seq
158+
#endif
156159
for(size_t ir = 1; ir <= stencil_radius; ir++){
157160
//neighbors in the horizontal direction
158161
sum_x += coeff[ir] * (u[current_snapshot + ir] + u[current_snapshot - ir]);
@@ -221,10 +224,16 @@ double forward(f_type *u, f_type *velocity, f_type *damp,
221224
size_t kws_index_z = offset_src_kws_index_z;
222225

223226
// for each source point in the Z axis
227+
#ifdef GPU_OPENACC
228+
#pragma acc loop seq
229+
#endif
224230
for(size_t i = src_z_begin; i <= src_z_end; i++){
225231
size_t kws_index_x = offset_src_kws_index_z + src_z_num_points;
226232

227233
// for each source point in the X axis
234+
#ifdef GPU_OPENACC
235+
#pragma acc loop seq
236+
#endif
228237
for(size_t j = src_x_begin; j <= src_x_end; j++){
229238

230239
f_type kws = src_points_values[kws_index_z] * src_points_values[kws_index_x];
@@ -287,6 +296,9 @@ double forward(f_type *u, f_type *velocity, f_type *damp,
287296

288297
// null neumann on the left
289298
if(x_before == 2){
299+
#ifdef GPU_OPENACC
300+
#pragma acc loop seq
301+
#endif
290302
for(size_t ir = 1; ir <= stencil_radius; ir++){
291303
size_t domain_offset = i * nx + stencil_radius;
292304
size_t next_snapshot = next_t * domain_size + domain_offset;
@@ -303,6 +315,9 @@ double forward(f_type *u, f_type *velocity, f_type *damp,
303315

304316
// null neumann on the right
305317
if(x_after == 2){
318+
#ifdef GPU_OPENACC
319+
#pragma acc loop seq
320+
#endif
306321
for(size_t ir = 1; ir <= stencil_radius; ir++){
307322
size_t domain_offset = i * nx + (nx - stencil_radius - 1);
308323
size_t next_snapshot = next_t * domain_size + domain_offset;
@@ -336,6 +351,9 @@ double forward(f_type *u, f_type *velocity, f_type *damp,
336351

337352
// null neumann on the top
338353
if(z_before == 2){
354+
#ifdef GPU_OPENACC
355+
#pragma acc loop seq
356+
#endif
339357
for(size_t ir = 1; ir <= stencil_radius; ir++){
340358
size_t domain_offset = stencil_radius * nx + j;
341359
size_t next_snapshot = next_t * domain_size + domain_offset;
@@ -352,6 +370,9 @@ double forward(f_type *u, f_type *velocity, f_type *damp,
352370

353371
// null neumann on the bottom
354372
if(z_after == 2){
373+
#ifdef GPU_OPENACC
374+
#pragma acc loop seq
375+
#endif
355376
for(size_t ir = 1; ir <= stencil_radius; ir++){
356377
size_t domain_offset = (nz - stencil_radius - 1) * nx + j;
357378
size_t next_snapshot = next_t * domain_size + domain_offset;
@@ -404,10 +425,16 @@ double forward(f_type *u, f_type *velocity, f_type *damp,
404425
size_t kws_index_z = offset_rec_kws_index_z;
405426

406427
// for each receiver point in the Z axis
428+
#ifdef GPU_OPENACC
429+
#pragma acc loop seq
430+
#endif
407431
for(size_t i = rec_z_begin; i <= rec_z_end; i++){
408432
size_t kws_index_x = offset_rec_kws_index_z + rec_z_num_points;
409433

410434
// for each receiver point in the X axis
435+
#ifdef GPU_OPENACC
436+
#pragma acc loop seq
437+
#endif
411438
for(size_t j = rec_x_begin; j <= rec_x_end; j++){
412439

413440
f_type kws = rec_points_values[kws_index_z] * rec_points_values[kws_index_x];

simwave/kernel/backend/c_code/forward/constant_density/3d/wave.c

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,9 @@ double forward(f_type *u, f_type *velocity, f_type *damp,
157157
f_type sum_z = coeff[0] * u[current_snapshot];
158158

159159
// radius of the stencil
160+
#ifdef GPU_OPENACC
161+
#pragma acc loop seq
162+
#endif
160163
for(size_t ir = 1; ir <= stencil_radius; ir++){
161164
//neighbors in the Y direction
162165
sum_y += coeff[ir] * (u[current_snapshot + ir] + u[current_snapshot - ir]);
@@ -235,15 +238,24 @@ double forward(f_type *u, f_type *velocity, f_type *damp,
235238
size_t kws_index_z = offset_src_kws_index_z;
236239

237240
// for each source point in the Z axis
241+
#ifdef GPU_OPENACC
242+
#pragma acc loop seq
243+
#endif
238244
for(size_t i = src_z_begin; i <= src_z_end; i++){
239245
size_t kws_index_x = offset_src_kws_index_z + src_z_num_points;
240246

241247
// for each source point in the X axis
248+
#ifdef GPU_OPENACC
249+
#pragma acc loop seq
250+
#endif
242251
for(size_t j = src_x_begin; j <= src_x_end; j++){
243252

244253
size_t kws_index_y = offset_src_kws_index_z + src_z_num_points + src_x_num_points;
245254

246255
// for each source point in the Y axis
256+
#ifdef GPU_OPENACC
257+
#pragma acc loop seq
258+
#endif
247259
for(size_t k = src_y_begin; k <= src_y_end; k++){
248260

249261
f_type kws = src_points_values[kws_index_z] * src_points_values[kws_index_x] * src_points_values[kws_index_y];
@@ -311,6 +323,9 @@ double forward(f_type *u, f_type *velocity, f_type *damp,
311323

312324
// null neumann on the left
313325
if(y_before == 2){
326+
#ifdef GPU_OPENACC
327+
#pragma acc loop seq
328+
#endif
314329
for(size_t ir = 1; ir <= stencil_radius; ir++){
315330
size_t domain_offset = (i * nx + j) * ny + stencil_radius;
316331
size_t next_snapshot = next_t * domain_size + domain_offset;
@@ -327,6 +342,9 @@ double forward(f_type *u, f_type *velocity, f_type *damp,
327342

328343
// null neumann on the right
329344
if(y_after == 2){
345+
#ifdef GPU_OPENACC
346+
#pragma acc loop seq
347+
#endif
330348
for(size_t ir = 1; ir <= stencil_radius; ir++){
331349
size_t domain_offset = (i * nx + j) * ny + (ny - stencil_radius - 1);
332350
size_t next_snapshot = next_t * domain_size + domain_offset;
@@ -362,6 +380,9 @@ double forward(f_type *u, f_type *velocity, f_type *damp,
362380

363381
// null neumann on the front
364382
if(x_before == 2){
383+
#ifdef GPU_OPENACC
384+
#pragma acc loop seq
385+
#endif
365386
for(size_t ir = 1; ir <= stencil_radius; ir++){
366387
size_t domain_offset = (i * nx + stencil_radius) * ny + k;
367388
size_t next_snapshot = next_t * domain_size + domain_offset;
@@ -378,6 +399,9 @@ double forward(f_type *u, f_type *velocity, f_type *damp,
378399

379400
// null neumann on the back
380401
if(x_after == 2){
402+
#ifdef GPU_OPENACC
403+
#pragma acc loop seq
404+
#endif
381405
for(size_t ir = 1; ir <= stencil_radius; ir++){
382406
size_t domain_offset = (i * nx + (nx - stencil_radius - 1)) * ny + k;
383407
size_t next_snapshot = next_t * domain_size + domain_offset;
@@ -413,6 +437,9 @@ double forward(f_type *u, f_type *velocity, f_type *damp,
413437

414438
// null neumann on the top
415439
if(z_before == 2){
440+
#ifdef GPU_OPENACC
441+
#pragma acc loop seq
442+
#endif
416443
for(size_t ir = 1; ir <= stencil_radius; ir++){
417444
size_t domain_offset = (stencil_radius * nx + j) * ny + k;
418445
size_t next_snapshot = next_t * domain_size + domain_offset;
@@ -429,6 +456,9 @@ double forward(f_type *u, f_type *velocity, f_type *damp,
429456

430457
// null neumann on the bottom
431458
if(z_after == 2){
459+
#ifdef GPU_OPENACC
460+
#pragma acc loop seq
461+
#endif
432462
for(size_t ir = 1; ir <= stencil_radius; ir++){
433463
size_t domain_offset = ((nz - stencil_radius - 1) * nx + j) * ny + k;
434464
size_t next_snapshot = next_t * domain_size + domain_offset;
@@ -487,15 +517,24 @@ double forward(f_type *u, f_type *velocity, f_type *damp,
487517
size_t kws_index_z = offset_rec_kws_index_z;
488518

489519
// for each receiver point in the Z axis
520+
#ifdef GPU_OPENACC
521+
#pragma acc loop seq
522+
#endif
490523
for(size_t i = rec_z_begin; i <= rec_z_end; i++){
491524
size_t kws_index_x = offset_rec_kws_index_z + rec_z_num_points;
492525

493526
// for each receiver point in the X axis
527+
#ifdef GPU_OPENACC
528+
#pragma acc loop seq
529+
#endif
494530
for(size_t j = rec_x_begin; j <= rec_x_end; j++){
495531

496532
size_t kws_index_y = offset_rec_kws_index_z + rec_z_num_points + rec_x_num_points;
497533

498534
// for each receiver point in the Y axis
535+
#ifdef GPU_OPENACC
536+
#pragma acc loop seq
537+
#endif
499538
for(size_t k = rec_y_begin; k <= rec_y_end; k++){
500539

501540
f_type kws = rec_points_values[kws_index_z] * rec_points_values[kws_index_x] * rec_points_values[kws_index_y];

simwave/kernel/backend/c_code/forward/variable_density/2d/wave.c

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,9 @@ double forward(f_type *u, f_type *velocity, f_type *density, f_type *damp,
166166
f_type fd_density_z = 0.0;
167167

168168
// radius of the stencil
169+
#ifdef GPU_OPENACC
170+
#pragma acc loop seq
171+
#endif
169172
for(size_t ir = 1; ir <= stencil_radius; ir++){
170173
//neighbors in the horizontal direction
171174
sd_pressure_x += coeff_order2[ir] * (u[current_snapshot + ir] + u[current_snapshot - ir]);
@@ -244,10 +247,16 @@ double forward(f_type *u, f_type *velocity, f_type *density, f_type *damp,
244247
size_t kws_index_z = offset_src_kws_index_z;
245248

246249
// for each source point in the Z axis
250+
#ifdef GPU_OPENACC
251+
#pragma acc loop seq
252+
#endif
247253
for(size_t i = src_z_begin; i <= src_z_end; i++){
248254
size_t kws_index_x = offset_src_kws_index_z + src_z_num_points;
249255

250256
// for each source point in the X axis
257+
#ifdef GPU_OPENACC
258+
#pragma acc loop seq
259+
#endif
251260
for(size_t j = src_x_begin; j <= src_x_end; j++){
252261

253262
f_type kws = src_points_values[kws_index_z] * src_points_values[kws_index_x];
@@ -309,6 +318,9 @@ double forward(f_type *u, f_type *velocity, f_type *density, f_type *damp,
309318

310319
// null neumann on the left
311320
if(x_before == 2){
321+
#ifdef GPU_OPENACC
322+
#pragma acc loop seq
323+
#endif
312324
for(size_t ir = 1; ir <= stencil_radius; ir++){
313325
size_t domain_offset = i * nx + stencil_radius;
314326
size_t next_snapshot = next_t * domain_size + domain_offset;
@@ -325,6 +337,9 @@ double forward(f_type *u, f_type *velocity, f_type *density, f_type *damp,
325337

326338
// null neumann on the right
327339
if(x_after == 2){
340+
#ifdef GPU_OPENACC
341+
#pragma acc loop seq
342+
#endif
328343
for(size_t ir = 1; ir <= stencil_radius; ir++){
329344
size_t domain_offset = i * nx + (nx - stencil_radius - 1);
330345
size_t next_snapshot = next_t * domain_size + domain_offset;
@@ -358,6 +373,9 @@ double forward(f_type *u, f_type *velocity, f_type *density, f_type *damp,
358373

359374
// null neumann on the top
360375
if(z_before == 2){
376+
#ifdef GPU_OPENACC
377+
#pragma acc loop seq
378+
#endif
361379
for(size_t ir = 1; ir <= stencil_radius; ir++){
362380
size_t domain_offset = stencil_radius * nx + j;
363381
size_t next_snapshot = next_t * domain_size + domain_offset;
@@ -374,6 +392,9 @@ double forward(f_type *u, f_type *velocity, f_type *density, f_type *damp,
374392

375393
// null neumann on the bottom
376394
if(z_after == 2){
395+
#ifdef GPU_OPENACC
396+
#pragma acc loop seq
397+
#endif
377398
for(size_t ir = 1; ir <= stencil_radius; ir++){
378399
size_t domain_offset = (nz - stencil_radius - 1) * nx + j;
379400
size_t next_snapshot = next_t * domain_size + domain_offset;
@@ -426,10 +447,16 @@ double forward(f_type *u, f_type *velocity, f_type *density, f_type *damp,
426447
size_t kws_index_z = offset_rec_kws_index_z;
427448

428449
// for each receiver point in the Z axis
450+
#ifdef GPU_OPENACC
451+
#pragma acc loop seq
452+
#endif
429453
for(size_t i = rec_z_begin; i <= rec_z_end; i++){
430454
size_t kws_index_x = offset_rec_kws_index_z + rec_z_num_points;
431455

432456
// for each receiver point in the X axis
457+
#ifdef GPU_OPENACC
458+
#pragma acc loop seq
459+
#endif
433460
for(size_t j = rec_x_begin; j <= rec_x_end; j++){
434461

435462
f_type kws = rec_points_values[kws_index_z] * rec_points_values[kws_index_x];

0 commit comments

Comments
 (0)