@@ -47,8 +47,16 @@ subroutine co_dot_unaccelerated(x,y,x_dot_y)
47
47
subroutine co_dot_accelerated (x ,y ,x_dot_y ,API )
48
48
real , intent (in ) :: x(:),y(:)
49
49
real , intent (out ) :: x_dot_y
50
- integer (c_int), intent (in ) :: API
51
- select case (API)
50
+ integer (c_int), intent (in ), optional :: API
51
+ integer (c_int) :: chosen_API
52
+
53
+ if (present (API)) then ! block-if needs THEN; use the caller's API when supplied
54
+ chosen_API = API
55
+ else ! API omitted: default to CUDA
56
+ chosen_API = CUDA
57
+ end if
58
+
59
+ select case (chosen_API)
52
60
case (CUDA)
53
61
call cudaDot(x,y,x_dot_y,size (x)) ! Accelerated reduction on local data
54
62
case (OpenMP)
@@ -68,18 +76,14 @@ program cu_dot_test
68
76
implicit none
69
77
70
78
! Unaccelerated variables
71
- real (c_float), allocatable :: a (:),b (:)
79
+ real (c_float), allocatable :: a_unacc (:),b_unacc (:)
72
80
real (c_float) :: dot
73
81
real (c_double) :: t_start, t_end
74
82
75
83
! Library-accelerated variables
76
84
real (c_float), allocatable :: a_acc(:)[:], b_acc(:)[:]
77
85
real (c_float) :: dot_acc[* ]
78
86
79
- ! Manually accelerated variables
80
- real (c_float), allocatable :: a_man(:)[:], b_man(:)[:]
81
- real (c_float) :: dot_man[* ]
82
-
83
87
integer (c_int),parameter :: n = 99900000
84
88
integer (c_int) :: n_local,np,me
85
89
@@ -98,15 +102,15 @@ program cu_dot_test
98
102
99
103
! Parallel execution
100
104
t_start = walltime()
101
- call co_dot_accelerated(a_acc,b_acc,dot_acc,CUDA)
105
+ call co_dot_accelerated(a_acc( 1 :n_local) ,b_acc( 1 :n_local) ,dot_acc,CUDA)
102
106
t_end = walltime()
103
107
if (me== 1 ) print * , ' Accelerated dot_prod' ,dot_acc,' time:' ,t_end- t_start
104
108
105
109
sync all
106
110
107
111
! Serial execution
108
112
t_start = walltime()
109
- call co_dot_unaccelerated(a_man,b_man ,dot)
113
+ call co_dot_unaccelerated(a_unacc( 1 :n_local),b_unacc( 1 :n_local) ,dot)
110
114
t_end = walltime()
111
115
if (me== 1 ) print * , ' Serial result' ,dot,' time:' ,t_end- t_start
112
116
@@ -118,8 +122,10 @@ program cu_dot_test
118
122
119
123
subroutine initialize_all_variables ()
120
124
integer (c_int) :: i
121
- call accelerated_allocate(a_acc(n_local)[* ],b_acc(n_local)[* ])
122
- call accelerated_allocate(a_man(n_local)[* ],b_man(n_local)[* ])
125
+ ! The allocation arguments must be coarrays to support the scatter operation below
126
+ call accelerated_allocate(a_acc,n_local)
127
+ call accelerated_allocate(b_acc,n_local)
128
+ allocate (a_unacc(n_local)[* ],b_unacc(n_local)[* ])
123
129
124
130
if (me == 1 ) then
125
131
! Initialize the local unaccelerated data on every image
@@ -129,10 +135,11 @@ subroutine initialize_all_variables()
129
135
! Scatter a and b to a_acc and b_acc
130
136
do i= 1 ,np
131
137
a_acc(1 :n_local)[i] = a(n_local* (i-1 )+ 1 :n_local* i)
132
- a_man(1 :n_local)[i] = a(n_local* (i-1 )+ 1 :n_local* i)
133
138
b_acc(1 :n_local)[i] = b(n_local* (i-1 )+ 1 :n_local* i)
134
- b_man(1 :n_local)[i] = b(n_local* (i-1 )+ 1 :n_local* i)
135
- enddo
139
+ end do
140
+ sync all
141
+ a_unacc= a_acc
142
+ b_unacc= b_acc
136
143
endif
137
144
end subroutine
138
145
0 commit comments