Skip to content

Commit 854a8f9

Browse files
author
Damian Rouson
committed
Commiting initial co_dot exercises as integration tests.
1 parent 7e7815c commit 854a8f9

File tree

4 files changed

+533
-0
lines changed

4 files changed

+533
-0
lines changed
Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
module accelerated_module
2+
use iso_c_binding, only : c_double,c_float,c_int
3+
implicit none
4+
5+
private
6+
public :: co_dot_accelerated
7+
public :: co_dot_unaccelerated
8+
public :: co_dot_manually_accelerated
9+
public :: co_dot_mapped_manually_accelerated
10+
public :: CUDA,OpenACC,OpenMP
11+
public :: walltime
12+
13+
! Explicit interfaces for procedures that wrap accelerated kernels
14+
interface
15+
16+
! This is the wrapper a programmer would have to write today to manually accelerate calculations
17+
subroutine manual_cudaDot(a,b,partial_dot,n,img) bind(C, name="manual_cudaDot")
18+
use iso_c_binding, only : c_float,c_int
19+
real(c_float) :: a(*),b(*)
20+
real(c_float) :: partial_dot
21+
integer(c_int),value :: n
22+
integer(c_int),value :: img
23+
end subroutine
24+
25+
subroutine manual_mapped_cudaDot(a,b,partial_dot,n,img) bind(C, name="manual_mapped_cudaDot")
26+
use iso_c_binding, only : c_float,c_int
27+
real(c_float) :: a(*),b(*)
28+
real(c_float) :: partial_dot
29+
integer(c_int),value :: n
30+
integer(c_int),value :: img
31+
end subroutine
32+
33+
! This wrapper exploits the OpenCoarrays acceleration support and is therefore simpler
34+
subroutine cudaDot(a,b,partial_dot,n) bind(C, name="cudaDot")
35+
use iso_c_binding, only : c_float,c_int
36+
real(c_float) :: a(*),b(*)
37+
real(c_float) :: partial_dot
38+
integer(c_int),value :: n
39+
end subroutine
40+
41+
function WALLTIME() bind(C, name = "WALLTIME")
42+
use iso_fortran_env
43+
real(real64) :: WALLTIME
44+
end function WALLTIME
45+
46+
end interface
47+
48+
enum, bind(C)
49+
enumerator CUDA,OpenACC,OpenMP
50+
end enum
51+
52+
contains
53+
54+
! This parallel collective dot product uses no acceleration
55+
subroutine co_dot_unaccelerated(x,y,x_dot_y)
56+
real(c_float), intent(in) :: x(:),y(:)
57+
real(c_float), intent(out) :: x_dot_y
58+
x_dot_y = dot_product(x,y) ! Call Fortran intrinsic dot product on the local data
59+
call co_sum(x_dot_y) ! Call Fortarn 2015 collective sum
60+
end subroutine
61+
62+
! This parallel collective dot product uses manual acceleration
63+
subroutine co_dot_manually_accelerated(x,y,x_dot_y)
64+
real(c_float), intent(in) :: x(:),y(:)
65+
real(c_float), intent(out) :: x_dot_y
66+
call manual_cudaDot(x,y,x_dot_y,size(x),this_image()-1)
67+
call co_sum(x_dot_y) ! Call Fortarn 2015 collective sum
68+
end subroutine
69+
70+
subroutine co_dot_mapped_manually_accelerated(x,y,x_dot_y)
71+
real(c_float), intent(in) :: x(:),y(:)
72+
real(c_float), intent(out) :: x_dot_y
73+
call manual_mapped_cudaDot(x,y,x_dot_y,size(x),this_image()-1)
74+
call co_sum(x_dot_y) ! Call Fortarn 2015 collective sum
75+
end subroutine
76+
77+
! Exploit the OpenCoarrays support for a accelerated dot products
78+
! using any one of several acceleration APIs: OpenACC, CUDA, OpenMP 4.0, etc.
79+
! On heterogeneous platforms, the API choice can vary in space (e.g., from one image/node to the
80+
! next) or in time (e.g., based on dynamic detection of the hardware or network behavior).
81+
subroutine co_dot_accelerated(x,y,x_dot_y,API)
82+
real, accelerated, intent(in) :: x(:)[*],y(:)[*] ! These are only coarrays to facilitate marking them as accelerated
83+
real, accelerated, intent(out) :: x_dot_y[*] ! This is only a coarray to facilitate marking it as accelerated
84+
integer(c_int), intent(in) :: API
85+
select case(API)
86+
case(CUDA)
87+
call cudaDot(x,y,x_dot_y,size(x)) ! Accelerated reduction on local data
88+
case(OpenMP)
89+
error stop "OpenMP acceleration not yet implemented."
90+
case(OpenACC)
91+
error stop "OpenACC acceleration not yet implemented."
92+
case default
93+
error stop "Invalid acceleration API choice."
94+
end select
95+
call co_sum(x_dot_y) ! Fortran 2015 coarray collective
96+
end subroutine
97+
98+
end module
99+
100+
program cu_dot_test
101+
use iso_c_binding, only : c_double,c_float,c_int
102+
implicit none
103+
104+
! Unaccelerated variables
105+
real(c_float), allocatable :: a(:),b(:)
106+
real(c_float) :: dot
107+
real(c_double) :: t_start, t_end
108+
109+
! Compiler/library-accelerated variables
110+
real(c_float), allocatable, accelerated :: a_acc(:)[:], b_acc(:)[:]
111+
real(c_float), accelerated :: dot_acc[*]
112+
113+
! Manually accelerated variables
114+
real(c_float), allocatable :: a_man(:)[:], b_man(:)[:]
115+
real(c_float) :: dot_man[*]
116+
117+
integer(c_int),parameter :: n = 99900000
118+
integer(c_int) :: n_local,np,me
119+
120+
np = num_images()
121+
me = this_image()
122+
123+
if (mod(n,np)/=0) error stop "n is not evenly divisible by num_images()"
124+
n_local = n/np
125+
126+
call initialize_all_variables
127+
sync all
128+
129+
block
130+
! use accelerated_module, only : co_dot_accelerated,co_dot_unaccelerated,co_dot_manually_accelerated,CUDA,walltime,co_dot_mapped_manually_accelerated
131+
use accelerated_module
132+
133+
!Parallel execution
134+
t_start = walltime()
135+
call co_dot_accelerated(a_acc,b_acc,dot_acc,CUDA)
136+
t_end = walltime()
137+
if(me==1) print *, 'Accelerated dot_prod',dot_acc,'time:',t_end-t_start
138+
139+
sync all
140+
141+
t_start = walltime()
142+
call co_dot_manually_accelerated(a_man,b_man,dot_man)
143+
t_end = walltime()
144+
if(me==1) print *, 'Manually accelerated dot_prod',dot_man,'time:',t_end-t_start
145+
146+
sync all
147+
148+
!Serial execution
149+
t_start = walltime()
150+
call co_dot_unaccelerated(a_man,b_man,dot)
151+
t_end = walltime()
152+
if(me==1) print *, 'Serial result',dot,'time:',t_end-t_start
153+
154+
sync all
155+
156+
t_start = walltime()
157+
call co_dot_mapped_manually_accelerated(a_man,b_man,dot)
158+
t_end = walltime()
159+
if(me==1) print *, 'Manually mapped',dot,'time:',t_end-t_start
160+
end block
161+
162+
contains
163+
164+
subroutine initialize_all_variables()
165+
integer(c_int) :: i
166+
allocate(a_acc(n_local)[*],b_acc(n_local)[*])
167+
allocate(a_man(n_local)[*],b_man(n_local)[*])
168+
169+
if(me == 1) then
170+
! Initialize the local unaccelerated data on every image
171+
b = [(1.,i=1,n)]
172+
! For even n, a is orthogonal to b
173+
a = [((-1.)**i,i=1,n)]
174+
! Scatter a and b to a_cc and b_cc
175+
do i=1,np
176+
a_acc(1:n_local)[i] = a(n_local*(i-1)+1:n_local*i)
177+
a_man(1:n_local)[i] = a(n_local*(i-1)+1:n_local*i)
178+
b_acc(1:n_local)[i] = b(n_local*(i-1)+1:n_local*i)
179+
b_man(1:n_local)[i] = b(n_local*(i-1)+1:n_local*i)
180+
enddo
181+
endif
182+
end subroutine
183+
184+
end program

src/tests/integration/gpu/co_dot.f90

Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
module accelerated_module
2+
use iso_c_binding, only : c_double,c_float,c_int
3+
implicit none
4+
5+
private
6+
public :: co_dot_accelerated
7+
public :: co_dot_unaccelerated
8+
public :: co_dot_manually_accelerated
9+
public :: co_dot_mapped_manually_accelerated
10+
public :: CUDA,OpenACC,OpenMP
11+
public :: walltime
12+
13+
! Explicit interfaces for procedures that wrap accelerated kernels
14+
interface
15+
16+
! This is the wrapper a programmer would have to write today to manually accelerate calculations
17+
subroutine manual_cudaDot(a,b,partial_dot,n,img) bind(C, name="manual_cudaDot")
18+
use iso_c_binding, only : c_float,c_int
19+
real(c_float) :: a(*),b(*)
20+
real(c_float) :: partial_dot
21+
integer(c_int),value :: n
22+
integer(c_int),value :: img
23+
end subroutine
24+
25+
subroutine manual_mapped_cudaDot(a,b,partial_dot,n,img) bind(C, name="manual_mapped_cudaDot")
26+
use iso_c_binding, only : c_float,c_int
27+
real(c_float) :: a(*),b(*)
28+
real(c_float) :: partial_dot
29+
integer(c_int),value :: n
30+
integer(c_int),value :: img
31+
end subroutine
32+
33+
! This wrapper exploits the OpenCoarrays acceleration support and is therefore simpler
34+
subroutine cudaDot(a,b,partial_dot,n) bind(C, name="cudaDot")
35+
use iso_c_binding, only : c_float,c_int
36+
real(c_float) :: a(*),b(*)
37+
real(c_float) :: partial_dot
38+
integer(c_int),value :: n
39+
end subroutine
40+
41+
function WALLTIME() bind(C, name = "WALLTIME")
42+
use iso_fortran_env
43+
real(real64) :: WALLTIME
44+
end function WALLTIME
45+
46+
end interface
47+
48+
enum, bind(C)
49+
enumerator CUDA,OpenACC,OpenMP
50+
end enum
51+
52+
contains
53+
54+
! This parallel collective dot product uses no acceleration
55+
subroutine co_dot_unaccelerated(x,y,x_dot_y)
56+
real(c_float), intent(in) :: x(:),y(:)
57+
real(c_float), intent(out) :: x_dot_y
58+
x_dot_y = dot_product(x,y) ! Call Fortran intrinsic dot product on the local data
59+
call co_sum(x_dot_y) ! Call Fortarn 2015 collective sum
60+
end subroutine
61+
62+
! This parallel collective dot product uses manual acceleration
63+
subroutine co_dot_manually_accelerated(x,y,x_dot_y)
64+
real(c_float), intent(in) :: x(:),y(:)
65+
real(c_float), intent(out) :: x_dot_y
66+
call manual_cudaDot(x,y,x_dot_y,size(x),this_image()-1)
67+
call co_sum(x_dot_y) ! Call Fortarn 2015 collective sum
68+
end subroutine
69+
70+
subroutine co_dot_mapped_manually_accelerated(x,y,x_dot_y)
71+
real(c_float), intent(in) :: x(:),y(:)
72+
real(c_float), intent(out) :: x_dot_y
73+
call manual_mapped_cudaDot(x,y,x_dot_y,size(x),this_image()-1)
74+
call co_sum(x_dot_y) ! Call Fortarn 2015 collective sum
75+
end subroutine
76+
77+
! Exploit the OpenCoarrays support for a accelerated dot products
78+
! using any one of several acceleration APIs: OpenACC, CUDA, OpenMP 4.0, etc.
79+
! On heterogeneous platforms, the API choice can vary in space (e.g., from one image/node to the
80+
! next) or in time (e.g., based on dynamic detection of the hardware or network behavior).
81+
subroutine co_dot_accelerated(x,y,x_dot_y,API)
82+
real, intent(in) :: x(:),y(:)
83+
real, intent(out) :: x_dot_y
84+
integer(c_int), intent(in) :: API
85+
select case(API)
86+
case(CUDA)
87+
call cudaDot(x,y,x_dot_y,size(x)) ! Accelerated reduction on local data
88+
case(OpenMP)
89+
error stop "OpenMP acceleration not yet implemented."
90+
case(OpenACC)
91+
error stop "OpenACC acceleration not yet implemented."
92+
case default
93+
error stop "Invalid acceleration API choice."
94+
end select
95+
call co_sum(x_dot_y) ! Fortran 2015 coarray collective
96+
end subroutine
97+
98+
end module
99+
100+
program cu_dot_test
101+
use iso_c_binding, only : c_double,c_float,c_int
102+
implicit none
103+
104+
! Unaccelerated variables
105+
real(c_float), allocatable :: a(:),b(:)
106+
real(c_float) :: dot
107+
real(c_double) :: t_start, t_end
108+
109+
! Compiler/library-accelerated variables
110+
real(c_float), allocatable :: a_acc(:)[:], b_acc(:)[:]
111+
real(c_float) :: dot_acc[*]
112+
113+
! Manually accelerated variables
114+
real(c_float), allocatable :: a_man(:)[:], b_man(:)[:]
115+
real(c_float) :: dot_man[*]
116+
117+
integer(c_int),parameter :: n = 99900000
118+
integer(c_int) :: n_local,np,me
119+
120+
np = num_images()
121+
me = this_image()
122+
123+
if (mod(n,np)/=0) error stop "n is not evenly divisible by num_images()"
124+
n_local = n/np
125+
126+
call initialize_all_variables
127+
sync all
128+
129+
block
130+
! use accelerated_module, only : co_dot_accelerated,co_dot_unaccelerated,co_dot_manually_accelerated,CUDA,walltime,co_dot_mapped_manually_accelerated
131+
use accelerated_module
132+
133+
!Parallel execution
134+
t_start = walltime()
135+
call co_dot_accelerated(a_acc,b_acc,dot_acc,CUDA)
136+
t_end = walltime()
137+
if(me==1) print *, 'Accelerated dot_prod',dot_acc,'time:',t_end-t_start
138+
139+
sync all
140+
141+
t_start = walltime()
142+
call co_dot_manually_accelerated(a_man,b_man,dot_man)
143+
t_end = walltime()
144+
if(me==1) print *, 'Manually accelerated dot_prod',dot_man,'time:',t_end-t_start
145+
146+
sync all
147+
148+
!Serial execution
149+
t_start = walltime()
150+
call co_dot_unaccelerated(a_man,b_man,dot)
151+
t_end = walltime()
152+
if(me==1) print *, 'Serial result',dot,'time:',t_end-t_start
153+
154+
sync all
155+
156+
t_start = walltime()
157+
call co_dot_mapped_manually_accelerated(a_man,b_man,dot)
158+
t_end = walltime()
159+
if(me==1) print *, 'Manually mapped',dot,'time:',t_end-t_start
160+
end block
161+
162+
contains
163+
164+
subroutine initialize_all_variables()
165+
integer(c_int) :: i
166+
call accelerated_allocate(a_acc(n_local)[*],b_acc(n_local)[*])
167+
call accelerated_allocate(a_man(n_local)[*],b_man(n_local)[*])
168+
169+
if(me == 1) then
170+
! Initialize the local unaccelerated data on every image
171+
b = [(1.,i=1,n)]
172+
! For even n, a is orthogonal to b
173+
a = [((-1.)**i,i=1,n)]
174+
! Scatter a and b to a_cc and b_cc
175+
do i=1,np
176+
a_acc(1:n_local)[i] = a(n_local*(i-1)+1:n_local*i)
177+
a_man(1:n_local)[i] = a(n_local*(i-1)+1:n_local*i)
178+
b_acc(1:n_local)[i] = b(n_local*(i-1)+1:n_local*i)
179+
b_man(1:n_local)[i] = b(n_local*(i-1)+1:n_local*i)
180+
enddo
181+
endif
182+
end subroutine
183+
184+
end program

0 commit comments

Comments
 (0)