|
| 1 | +module accelerated_module |
| 2 | + use iso_c_binding, only : c_double,c_float,c_int |
| 3 | + implicit none |
| 4 | + |
| 5 | + private |
| 6 | + public :: co_dot_accelerated |
| 7 | + public :: co_dot_unaccelerated |
| 8 | + public :: co_dot_manually_accelerated |
| 9 | + public :: co_dot_mapped_manually_accelerated |
| 10 | + public :: CUDA,OpenACC,OpenMP |
| 11 | + public :: walltime |
| 12 | + |
| 13 | + ! Explicit interfaces for procedures that wrap accelerated kernels |
| 14 | + interface |
| 15 | + |
| 16 | + ! This is the wrapper a programmer would have to write today to manually accelerate calculations |
| 17 | + subroutine manual_cudaDot(a,b,partial_dot,n,img) bind(C, name="manual_cudaDot") |
| 18 | + use iso_c_binding, only : c_float,c_int |
| 19 | + real(c_float) :: a(*),b(*) |
| 20 | + real(c_float) :: partial_dot |
| 21 | + integer(c_int),value :: n |
| 22 | + integer(c_int),value :: img |
| 23 | + end subroutine |
| 24 | + |
| 25 | + subroutine manual_mapped_cudaDot(a,b,partial_dot,n,img) bind(C, name="manual_mapped_cudaDot") |
| 26 | + use iso_c_binding, only : c_float,c_int |
| 27 | + real(c_float) :: a(*),b(*) |
| 28 | + real(c_float) :: partial_dot |
| 29 | + integer(c_int),value :: n |
| 30 | + integer(c_int),value :: img |
| 31 | + end subroutine |
| 32 | + |
| 33 | + ! This wrapper exploits the OpenCoarrays acceleration support and is therefore simpler |
| 34 | + subroutine cudaDot(a,b,partial_dot,n) bind(C, name="cudaDot") |
| 35 | + use iso_c_binding, only : c_float,c_int |
| 36 | + real(c_float) :: a(*),b(*) |
| 37 | + real(c_float) :: partial_dot |
| 38 | + integer(c_int),value :: n |
| 39 | + end subroutine |
| 40 | + |
| 41 | + function WALLTIME() bind(C, name = "WALLTIME") |
| 42 | + use iso_fortran_env |
| 43 | + real(real64) :: WALLTIME |
| 44 | + end function WALLTIME |
| 45 | + |
| 46 | + end interface |
| 47 | + |
| 48 | + enum, bind(C) |
| 49 | + enumerator CUDA,OpenACC,OpenMP |
| 50 | + end enum |
| 51 | + |
| 52 | +contains |
| 53 | + |
| 54 | + ! This parallel collective dot product uses no acceleration |
| 55 | + subroutine co_dot_unaccelerated(x,y,x_dot_y) |
| 56 | + real(c_float), intent(in) :: x(:),y(:) |
| 57 | + real(c_float), intent(out) :: x_dot_y |
| 58 | + x_dot_y = dot_product(x,y) ! Call Fortran intrinsic dot product on the local data |
| 59 | + call co_sum(x_dot_y) ! Call Fortarn 2015 collective sum |
| 60 | + end subroutine |
| 61 | + |
| 62 | + ! This parallel collective dot product uses manual acceleration |
| 63 | + subroutine co_dot_manually_accelerated(x,y,x_dot_y) |
| 64 | + real(c_float), intent(in) :: x(:),y(:) |
| 65 | + real(c_float), intent(out) :: x_dot_y |
| 66 | + call manual_cudaDot(x,y,x_dot_y,size(x),this_image()-1) |
| 67 | + call co_sum(x_dot_y) ! Call Fortarn 2015 collective sum |
| 68 | + end subroutine |
| 69 | + |
| 70 | + subroutine co_dot_mapped_manually_accelerated(x,y,x_dot_y) |
| 71 | + real(c_float), intent(in) :: x(:),y(:) |
| 72 | + real(c_float), intent(out) :: x_dot_y |
| 73 | + call manual_mapped_cudaDot(x,y,x_dot_y,size(x),this_image()-1) |
| 74 | + call co_sum(x_dot_y) ! Call Fortarn 2015 collective sum |
| 75 | + end subroutine |
| 76 | + |
| 77 | + ! Exploit the OpenCoarrays support for a accelerated dot products |
| 78 | + ! using any one of several acceleration APIs: OpenACC, CUDA, OpenMP 4.0, etc. |
| 79 | + ! On heterogeneous platforms, the API choice can vary in space (e.g., from one image/node to the |
| 80 | + ! next) or in time (e.g., based on dynamic detection of the hardware or network behavior). |
| 81 | + subroutine co_dot_accelerated(x,y,x_dot_y,API) |
| 82 | + real, accelerated, intent(in) :: x(:)[*],y(:)[*] ! These are only coarrays to facilitate marking them as accelerated |
| 83 | + real, accelerated, intent(out) :: x_dot_y[*] ! This is only a coarray to facilitate marking it as accelerated |
| 84 | + integer(c_int), intent(in) :: API |
| 85 | + select case(API) |
| 86 | + case(CUDA) |
| 87 | + call cudaDot(x,y,x_dot_y,size(x)) ! Accelerated reduction on local data |
| 88 | + case(OpenMP) |
| 89 | + error stop "OpenMP acceleration not yet implemented." |
| 90 | + case(OpenACC) |
| 91 | + error stop "OpenACC acceleration not yet implemented." |
| 92 | + case default |
| 93 | + error stop "Invalid acceleration API choice." |
| 94 | + end select |
| 95 | + call co_sum(x_dot_y) ! Fortran 2015 coarray collective |
| 96 | + end subroutine |
| 97 | + |
| 98 | +end module |
| 99 | + |
| 100 | +program cu_dot_test |
| 101 | + use iso_c_binding, only : c_double,c_float,c_int |
| 102 | + implicit none |
| 103 | + |
| 104 | + ! Unaccelerated variables |
| 105 | + real(c_float), allocatable :: a(:),b(:) |
| 106 | + real(c_float) :: dot |
| 107 | + real(c_double) :: t_start, t_end |
| 108 | + |
| 109 | + ! Compiler/library-accelerated variables |
| 110 | + real(c_float), allocatable, accelerated :: a_acc(:)[:], b_acc(:)[:] |
| 111 | + real(c_float), accelerated :: dot_acc[*] |
| 112 | + |
| 113 | + ! Manually accelerated variables |
| 114 | + real(c_float), allocatable :: a_man(:)[:], b_man(:)[:] |
| 115 | + real(c_float) :: dot_man[*] |
| 116 | + |
| 117 | + integer(c_int),parameter :: n = 99900000 |
| 118 | + integer(c_int) :: n_local,np,me |
| 119 | + |
| 120 | + np = num_images() |
| 121 | + me = this_image() |
| 122 | + |
| 123 | + if (mod(n,np)/=0) error stop "n is not evenly divisible by num_images()" |
| 124 | + n_local = n/np |
| 125 | + |
| 126 | + call initialize_all_variables |
| 127 | + sync all |
| 128 | + |
| 129 | + block |
| 130 | +! use accelerated_module, only : co_dot_accelerated,co_dot_unaccelerated,co_dot_manually_accelerated,CUDA,walltime,co_dot_mapped_manually_accelerated |
| 131 | + use accelerated_module |
| 132 | + |
| 133 | + !Parallel execution |
| 134 | + t_start = walltime() |
| 135 | + call co_dot_accelerated(a_acc,b_acc,dot_acc,CUDA) |
| 136 | + t_end = walltime() |
| 137 | + if(me==1) print *, 'Accelerated dot_prod',dot_acc,'time:',t_end-t_start |
| 138 | + |
| 139 | + sync all |
| 140 | + |
| 141 | + t_start = walltime() |
| 142 | + call co_dot_manually_accelerated(a_man,b_man,dot_man) |
| 143 | + t_end = walltime() |
| 144 | + if(me==1) print *, 'Manually accelerated dot_prod',dot_man,'time:',t_end-t_start |
| 145 | + |
| 146 | + sync all |
| 147 | + |
| 148 | + !Serial execution |
| 149 | + t_start = walltime() |
| 150 | + call co_dot_unaccelerated(a_man,b_man,dot) |
| 151 | + t_end = walltime() |
| 152 | + if(me==1) print *, 'Serial result',dot,'time:',t_end-t_start |
| 153 | + |
| 154 | + sync all |
| 155 | + |
| 156 | + t_start = walltime() |
| 157 | + call co_dot_mapped_manually_accelerated(a_man,b_man,dot) |
| 158 | + t_end = walltime() |
| 159 | + if(me==1) print *, 'Manually mapped',dot,'time:',t_end-t_start |
| 160 | + end block |
| 161 | + |
| 162 | +contains |
| 163 | + |
| 164 | + subroutine initialize_all_variables() |
| 165 | + integer(c_int) :: i |
| 166 | + allocate(a_acc(n_local)[*],b_acc(n_local)[*]) |
| 167 | + allocate(a_man(n_local)[*],b_man(n_local)[*]) |
| 168 | + |
| 169 | + if(me == 1) then |
| 170 | + ! Initialize the local unaccelerated data on every image |
| 171 | + b = [(1.,i=1,n)] |
| 172 | + ! For even n, a is orthogonal to b |
| 173 | + a = [((-1.)**i,i=1,n)] |
| 174 | + ! Scatter a and b to a_cc and b_cc |
| 175 | + do i=1,np |
| 176 | + a_acc(1:n_local)[i] = a(n_local*(i-1)+1:n_local*i) |
| 177 | + a_man(1:n_local)[i] = a(n_local*(i-1)+1:n_local*i) |
| 178 | + b_acc(1:n_local)[i] = b(n_local*(i-1)+1:n_local*i) |
| 179 | + b_man(1:n_local)[i] = b(n_local*(i-1)+1:n_local*i) |
| 180 | + enddo |
| 181 | + endif |
| 182 | + end subroutine |
| 183 | + |
| 184 | +end program |
0 commit comments