Skip to content

Commit 8d56dc5

Browse files
committed
add OpenACC support
1 parent 81d3031 commit 8d56dc5

File tree

9 files changed

+479
-30
lines changed

9 files changed

+479
-30
lines changed

examples/acoustic_2D.py

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,43 @@
55
import numpy as np
66

77

8+
# available language options:
9+
# c (sequential)
10+
# cpu_openmp (parallel CPU)
11+
# gpu_openmp (GPU)
12+
# gpu_openacc (GPU)
13+
compiler_options = {
14+
'c': {
15+
'cc': 'gcc',
16+
'language': 'c',
17+
'cflags': '-O3 -fPIC -ffast-math -Wall -std=c99 -shared'
18+
},
19+
'cpu_openmp': {
20+
'cc': 'gcc',
21+
'language': 'cpu_openmp',
22+
'cflags': '-O3 -fPIC -ffast-math -Wall -std=c99 -shared -fopenmp'
23+
},
24+
'gpu_openmp': {
25+
'cc': 'clang',
26+
'language': 'gpu_openmp',
27+
'cflags': '-O3 -fPIC -ffast-math -fopenmp \
28+
-fopenmp-targets=nvptx64-nvidia-cuda \
29+
-Xopenmp-target -march=sm_75'
30+
},
31+
'gpu_openacc': {
32+
'cc': 'pgcc',
33+
'language': 'gpu_openacc',
34+
'cflags': '-O3 -fPIC -acc:gpu -gpu=pinned -mp'
35+
},
36+
}
37+
38+
selected_compiler = compiler_options['c']
39+
840
# set compiler options
9-
# available language options: c (sequential) or cpu_openmp (parallel CPU)
1041
compiler = Compiler(
11-
cc='gcc',
12-
language='cpu_openmp',
13-
cflags='-O3 -fPIC -ffast-math -Wall -std=c99 -shared'
14-
# cflags='-O3 -fPIC -ffast-math -fopenmp \
15-
# -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_75'
42+
cc=selected_compiler['cc'],
43+
language=selected_compiler['language'],
44+
cflags=selected_compiler['cflags']
1645
)
1746

1847
# Velocity model

examples/acoustic_2D_density.py

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,43 @@
55
import numpy as np
66

77

8+
# available language options:
9+
# c (sequential)
10+
# cpu_openmp (parallel CPU)
11+
# gpu_openmp (GPU)
12+
# gpu_openacc (GPU)
13+
compiler_options = {
14+
'c': {
15+
'cc': 'gcc',
16+
'language': 'c',
17+
'cflags': '-O3 -fPIC -ffast-math -Wall -std=c99 -shared'
18+
},
19+
'cpu_openmp': {
20+
'cc': 'gcc',
21+
'language': 'cpu_openmp',
22+
'cflags': '-O3 -fPIC -ffast-math -Wall -std=c99 -shared -fopenmp'
23+
},
24+
'gpu_openmp': {
25+
'cc': 'clang',
26+
'language': 'gpu_openmp',
27+
'cflags': '-O3 -fPIC -ffast-math -fopenmp \
28+
-fopenmp-targets=nvptx64-nvidia-cuda \
29+
-Xopenmp-target -march=sm_75'
30+
},
31+
'gpu_openacc': {
32+
'cc': 'pgcc',
33+
'language': 'gpu_openacc',
34+
'cflags': '-O3 -fPIC -acc:gpu -gpu=pinned -mp'
35+
},
36+
}
37+
38+
selected_compiler = compiler_options['c']
39+
840
# set compiler options
9-
# available language options: c (sequential) or cpu_openmp (parallel CPU)
1041
compiler = Compiler(
11-
cc='gcc',
12-
language='cpu_openmp',
13-
cflags='-O3 -fPIC -ffast-math -Wall -std=c99 -shared'
42+
cc=selected_compiler['cc'],
43+
language=selected_compiler['language'],
44+
cflags=selected_compiler['cflags']
1445
)
1546

1647
# Velocity model

examples/acoustic_3D.py

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,43 @@
44
)
55
import numpy as np
66

7+
# available language options:
8+
# c (sequential)
9+
# cpu_openmp (parallel CPU)
10+
# gpu_openmp (GPU)
11+
# gpu_openacc (GPU)
12+
compiler_options = {
13+
'c': {
14+
'cc': 'gcc',
15+
'language': 'c',
16+
'cflags': '-O3 -fPIC -ffast-math -Wall -std=c99 -shared'
17+
},
18+
'cpu_openmp': {
19+
'cc': 'gcc',
20+
'language': 'cpu_openmp',
21+
'cflags': '-O3 -fPIC -ffast-math -Wall -std=c99 -shared -fopenmp'
22+
},
23+
'gpu_openmp': {
24+
'cc': 'clang',
25+
'language': 'gpu_openmp',
26+
'cflags': '-O3 -fPIC -ffast-math -fopenmp \
27+
-fopenmp-targets=nvptx64-nvidia-cuda \
28+
-Xopenmp-target -march=sm_75'
29+
},
30+
'gpu_openacc': {
31+
'cc': 'pgcc',
32+
'language': 'gpu_openacc',
33+
'cflags': '-O3 -fPIC -acc:gpu -gpu=pinned -mp -DDEVICEID=2'
34+
},
35+
}
36+
37+
selected_compiler = compiler_options['c']
738

839
# set compiler options
9-
# available language options: c (sequential) or cpu_openmp (parallel CPU)
1040
compiler = Compiler(
11-
cc='gcc',
12-
language='cpu_openmp',
13-
cflags='-O3 -fPIC -ffast-math -Wall -std=c99 -shared'
14-
# cflags='-O3 -fPIC -ffast-math -fopenmp \
15-
# -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_75'
41+
cc=selected_compiler['cc'],
42+
language=selected_compiler['language'],
43+
cflags=selected_compiler['cflags']
1644
)
1745

1846
# Velocity model

examples/acoustic_3D_density.py

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,43 @@
55
import numpy as np
66

77

8+
# available language options:
9+
# c (sequential)
10+
# cpu_openmp (parallel CPU)
11+
# gpu_openmp (GPU)
12+
# gpu_openacc (GPU)
13+
compiler_options = {
14+
'c': {
15+
'cc': 'gcc',
16+
'language': 'c',
17+
'cflags': '-O3 -fPIC -ffast-math -Wall -std=c99 -shared'
18+
},
19+
'cpu_openmp': {
20+
'cc': 'gcc',
21+
'language': 'cpu_openmp',
22+
'cflags': '-O3 -fPIC -ffast-math -Wall -std=c99 -shared -fopenmp'
23+
},
24+
'gpu_openmp': {
25+
'cc': 'clang',
26+
'language': 'gpu_openmp',
27+
'cflags': '-O3 -fPIC -ffast-math -fopenmp \
28+
-fopenmp-targets=nvptx64-nvidia-cuda \
29+
-Xopenmp-target -march=sm_75'
30+
},
31+
'gpu_openacc': {
32+
'cc': 'pgcc',
33+
'language': 'gpu_openacc',
34+
'cflags': '-O3 -fPIC -acc:gpu -gpu=pinned -mp'
35+
},
36+
}
37+
38+
selected_compiler = compiler_options['c']
39+
840
# set compiler options
9-
# available language options: c (sequential) or cpu_openmp (parallel CPU)
1041
compiler = Compiler(
11-
cc='gcc',
12-
language='cpu_openmp',
13-
cflags='-O3 -fPIC -ffast-math -Wall -std=c99 -shared'
14-
# cflags='-O3 -fPIC -ffast-math -fopenmp \
15-
# -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_75'
42+
cc=selected_compiler['cc'],
43+
language=selected_compiler['language'],
44+
cflags=selected_compiler['cflags']
1645
)
1746

1847
# Velocity model

simwave/kernel/backend/c_code/forward/constant_density/2d/wave.c

Lines changed: 79 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@
77
#include <omp.h>
88
#endif
99

10+
#if defined(GPU_OPENACC)
11+
#include <openacc.h>
12+
#endif
13+
1014
// use single (float) or double precision
1115
// according to the value passed in the compilation cmd
1216
#if defined(FLOAT)
@@ -75,6 +79,31 @@ double forward(f_type *u, f_type *velocity, f_type *damp,
7579
#pragma omp target enter data map(to: receivers[:shot_record_size])
7680
#endif
7781

82+
#ifdef GPU_OPENACC
83+
84+
// select the device
85+
#ifdef DEVICEID
86+
acc_init(acc_device_nvidia);
87+
acc_set_device_num(DEVICEID, acc_device_nvidia);
88+
#endif
89+
90+
size_t shot_record_size = wavelet_size * num_receivers;
91+
size_t u_size = num_snapshots * domain_size;
92+
93+
#pragma acc enter data copyin(u[:u_size])
94+
#pragma acc enter data copyin(velocity[:domain_size])
95+
#pragma acc enter data copyin(damp[:domain_size])
96+
#pragma acc enter data copyin(coeff[:stencil_radius+1])
97+
#pragma acc enter data copyin(src_points_interval[:src_points_interval_size])
98+
#pragma acc enter data copyin(src_points_values[:src_points_values_size])
99+
#pragma acc enter data copyin(src_points_values_offset[:num_sources])
100+
#pragma acc enter data copyin(rec_points_interval[:rec_points_interval_size])
101+
#pragma acc enter data copyin(rec_points_values[:rec_points_values_size])
102+
#pragma acc enter data copyin(rec_points_values_offset[:num_receivers])
103+
#pragma acc enter data copyin(wavelet[:wavelet_size * wavelet_count])
104+
#pragma acc enter data copyin(receivers[:shot_record_size])
105+
#endif
106+
78107
// wavefield modeling
79108
for(size_t n = begin_timestep; n <= end_timestep; n++) {
80109

@@ -104,6 +133,10 @@ double forward(f_type *u, f_type *velocity, f_type *damp,
104133
#pragma omp target teams distribute parallel for collapse(2)
105134
#endif
106135

136+
#ifdef GPU_OPENACC
137+
#pragma acc parallel loop collapse(2) present(coeff,damp,u,velocity)
138+
#endif
139+
107140
for(size_t i = stencil_radius; i < nz - stencil_radius; i++) {
108141
for(size_t j = stencil_radius; j < nx - stencil_radius; j++) {
109142
// index of the current point in the grid
@@ -151,6 +184,10 @@ double forward(f_type *u, f_type *velocity, f_type *damp,
151184
#pragma omp target teams distribute parallel for
152185
#endif
153186

187+
#ifdef GPU_OPENACC
188+
#pragma acc parallel loop present(src_points_interval,src_points_values,src_points_values_offset,u,velocity,wavelet)
189+
#endif
190+
154191
// for each source
155192
for(size_t src = 0; src < num_sources; src++){
156193

@@ -201,6 +238,10 @@ double forward(f_type *u, f_type *velocity, f_type *damp,
201238
#if defined(CPU_OPENMP) || defined(GPU_OPENMP)
202239
#pragma omp atomic
203240
#endif
241+
242+
#ifdef GPU_OPENACC
243+
#pragma acc atomic update
244+
#endif
204245
u[next_snapshot] += value;
205246

206247
kws_index_x++;
@@ -230,6 +271,11 @@ double forward(f_type *u, f_type *velocity, f_type *damp,
230271
#ifdef GPU_OPENMP
231272
#pragma omp target teams distribute parallel for
232273
#endif
274+
275+
#ifdef GPU_OPENACC
276+
#pragma acc parallel loop present(u)
277+
#endif
278+
233279
for(size_t i = stencil_radius; i < nz - stencil_radius; i++){
234280

235281
// null dirichlet on the left
@@ -274,6 +320,11 @@ double forward(f_type *u, f_type *velocity, f_type *damp,
274320
#ifdef GPU_OPENMP
275321
#pragma omp target teams distribute parallel for
276322
#endif
323+
324+
#ifdef GPU_OPENACC
325+
#pragma acc parallel loop present(u)
326+
#endif
327+
277328
for(size_t j = stencil_radius; j < nx - stencil_radius; j++){
278329

279330
// null dirichlet on the top
@@ -322,6 +373,10 @@ double forward(f_type *u, f_type *velocity, f_type *damp,
322373
#pragma omp target teams distribute parallel for
323374
#endif
324375

376+
#ifdef GPU_OPENACC
377+
#pragma acc parallel loop present(rec_points_interval,rec_points_values,rec_points_values_offset,u,receivers)
378+
#endif
379+
325380
// for each receiver
326381
for(size_t rec = 0; rec < num_receivers; rec++){
327382

@@ -391,7 +446,11 @@ double forward(f_type *u, f_type *velocity, f_type *damp,
391446
#endif
392447

393448
#ifdef GPU_OPENMP
394-
#pragma omp target teams distribute parallel for
449+
#pragma omp target teams distribute parallel for collapse(2)
450+
#endif
451+
452+
#ifdef GPU_OPENACC
453+
#pragma acc parallel loop collapse(2) present(u)
395454
#endif
396455

397456
// exchange of values required
@@ -435,6 +494,25 @@ double forward(f_type *u, f_type *velocity, f_type *damp,
435494
#pragma omp target exit data map(delete: wavelet[:wavelet_size * wavelet_count])
436495
#endif
437496

497+
#ifdef GPU_OPENACC
498+
#pragma acc exit data copyout(receivers[:shot_record_size])
499+
#pragma acc exit data copyout(u[:u_size])
500+
501+
#pragma acc exit data delete(receivers[:shot_record_size])
502+
#pragma acc exit data delete(u[:u_size])
503+
504+
#pragma acc exit data delete(velocity[:domain_size])
505+
#pragma acc exit data delete(damp[:domain_size])
506+
#pragma acc exit data delete(coeff[:stencil_radius+1])
507+
#pragma acc exit data delete(src_points_interval[:src_points_interval_size])
508+
#pragma acc exit data delete(src_points_values[:src_points_values_size])
509+
#pragma acc exit data delete(src_points_values_offset[:num_sources])
510+
#pragma acc exit data delete(rec_points_interval[:rec_points_interval_size])
511+
#pragma acc exit data delete(rec_points_values[:rec_points_values_size])
512+
#pragma acc exit data delete(rec_points_values_offset[:num_receivers])
513+
#pragma acc exit data delete(wavelet[:wavelet_size * wavelet_count])
514+
#endif
515+
438516
// get the end time
439517
gettimeofday(&time_end, NULL);
440518

0 commit comments

Comments
 (0)