Skip to content

Commit 556ec20

Browse files
author
Damian Rouson
committed
Update co_dot exercise (works on Titan). Correct use of c_sizeof and co_reduce binding.
1 parent ecee70b commit 556ec20

File tree

4 files changed

+52
-219
lines changed

4 files changed

+52
-219
lines changed

src/extensions/opencoarrays.F90

Lines changed: 26 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ module opencoarrays
3636
#elif defined(COMPILER_LACKS_C_SIZEOF_ASSUMED_RANK)
3737
use iso_c_binding, only : c_int,c_char,c_ptr,c_loc,c_double,c_int32_t,c_bool,c_funloc,c_ptrdiff_t
3838
#else
39-
use iso_c_binding, only : c_int,c_char,c_ptr,c_loc,c_double,c_int32_t,c_bool,c_funloc,c_ptrdiff_t,c_sizeof
39+
use iso_c_binding, only : c_int,c_char,c_ptr,c_loc,c_float,c_double,c_int32_t,c_bool,c_funloc,c_ptrdiff_t,c_sizeof
4040
#endif
4141
implicit none
4242

@@ -240,8 +240,9 @@ subroutine caf_finalize(argc, argv) bind(C,name="_gfortran_caf_finalize")
240240
! PREFIX(registernc) (void* mem,size_t mem_size)
241241

242242
subroutine opencoarrays_registernc(mem,mem_size) bind(C,name="_gfortran_caf_registernc")
243+
import :: c_ptr,c_ptrdiff_t,c_int
243244
type(c_ptr), intent(in), value :: mem
244-
integer(c_ptrdiff_t), intent(in) :: mem_size
245+
integer(c_ptrdiff_t), value, intent(in) :: mem_size
245246
end subroutine
246247

247248
! C function signature from ../mpi/mpi_caf.c:
@@ -268,7 +269,7 @@ subroutine opencoarrays_co_reduce(a, opr, opr_flags, result_image, stat, errmsg,
268269
#ifdef COMPILER_SUPPORTS_CAF_INTRINSICS
269270
bind(C,name="_caf_extensions_co_reduce")
270271
#else
271-
bind(C,name="_gfortran_extensions_co_reduce")
272+
bind(C,name="_gfortran_caf_co_reduce")
272273
#endif
273274
use iso_c_binding, only : c_ptr,c_funptr,c_int,c_char
274275
type(c_ptr), intent(in), value :: a
@@ -364,8 +365,7 @@ function opencoarrays_this_image(coarray) bind(C,name="_gfortran_caf_this_image"
364365

365366
! C function signature from ../mpi/mpi_caf.c:
366367
! int PREFIX (num_images) (int, int);
367-
! function opencoarrays_num_images(coarray,dim_) bind(C,name="_gfortran_caf_num_images") result(num_images_)
368-
function opencoarrays_num_images(coarray,dim_) bind(C,name="_caf_extensions_num_images") result(num_images_)
368+
function opencoarrays_num_images(coarray,dim_) bind(C,name="_gfortran_caf_num_images") result(num_images_)
369369
import :: c_int
370370
integer(c_int), value, intent(in) :: coarray,dim_
371371
integer(c_int) :: num_images_
@@ -570,8 +570,8 @@ function c_sizeof(mold) result(c_size_of_mold)
570570
! _________________________________________________________________________________
571571

572572
subroutine accelerate(a)
573-
real(c_double), intent(in), contiguous :: a(..)
574-
call opencoarrays_registernc(c_loc(a),size(a)*c_sizeof(a))
573+
real(c_float), intent(in), contiguous, target :: a(..)
574+
call opencoarrays_registernc(c_loc(a),c_sizeof(a))
575575
end subroutine
576576

577577
subroutine co_reduce_c_int(a, opr, result_image, stat, errmsg)
@@ -877,23 +877,27 @@ subroutine co_sum_c_int(a,result_image,stat,errmsg)
877877
end subroutine
878878

879879
! Return the image number (MPI rank + 1)
880-
function this_image() result(image_num)
881-
use mpi, only : MPI_Comm_rank
882-
integer(c_int) :: image_num,ierr
883-
!image_num = opencoarrays_this_image(unused)
884-
call MPI_Comm_rank(CAF_COMM_WORLD,image_num,ierr)
885-
if (ierr/=0) call error_stop
886-
image_num = image_num + 1
887-
end function
880+
function this_image() result(image_num)
881+
#ifndef COMPILER_PROVIDES_MPI
882+
use mpi, only : MPI_Comm_rank
883+
#endif
884+
integer(c_int) :: image_num,ierr
885+
!image_num = opencoarrays_this_image(unused)
886+
call MPI_Comm_rank(CAF_COMM_WORLD,image_num,ierr)
887+
if (ierr/=0) call error_stop
888+
image_num = image_num + 1
889+
end function
888890

889891
! Return the total number of images
890-
function num_images() result(num_images_)
891-
use mpi, only : MPI_Comm_size
892-
integer(c_int) :: num_images_,ierr
893-
!num_images_ = opencoarrays_num_images(unused_coarray,unused_scalar)
894-
call MPI_Comm_size(CAF_COMM_WORLD,num_images_,ierr)
895-
if (ierr/=0) call error_stop
896-
end function
892+
function num_images() result(num_images_)
893+
#ifndef COMPILER_PROVIDES_MPI
894+
use mpi, only : MPI_Comm_size
895+
#endif
896+
integer(c_int) :: num_images_,ierr
897+
!num_images_ = opencoarrays_num_images(unused_coarray,unused_scalar)
898+
call MPI_Comm_size(CAF_COMM_WORLD,num_images_,ierr)
899+
if (ierr/=0) call error_stop
900+
end function
897901

898902
! Halt the execution of all images
899903
subroutine error_stop(stop_code)

src/tests/integration/gpu/accelerated_co_dot.f90

Lines changed: 0 additions & 184 deletions
This file was deleted.

src/tests/integration/gpu/co_dot.f90

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ subroutine co_dot_accelerated(x,y,x_dot_y,API)
5050
integer(c_int), intent(in), optional :: API
5151
integer(c_int) :: chosen_API
5252

53-
if (present(API))
53+
if (present(API)) then
5454
chosen_API = API
5555
else
5656
chosen_API = CUDA
@@ -76,6 +76,7 @@ program cu_dot_test
7676
implicit none
7777

7878
! Unaccelerated variables
79+
real(c_float), allocatable :: a(:),b(:)
7980
real(c_float), allocatable :: a_unacc(:),b_unacc(:)
8081
real(c_float) :: dot
8182
real(c_double) :: t_start, t_end
@@ -84,7 +85,7 @@ program cu_dot_test
8485
real(c_float), allocatable :: a_acc(:)[:], b_acc(:)[:]
8586
real(c_float) :: dot_acc[*]
8687

87-
integer(c_int),parameter :: n = 99900000
88+
integer(c_int) :: n = 99999904
8889
integer(c_int) :: n_local,np,me
8990

9091
np = num_images()
@@ -121,11 +122,12 @@ program cu_dot_test
121122
contains
122123

123124
subroutine initialize_all_variables()
125+
use opencoarrays, only : accelerate
124126
integer(c_int) :: i
125127
! These allocation arguments must be coarrays to support the scatter operation below
126-
allocate(a_unacc(n_local)[*],b_unacc(n_local)[*])
128+
allocate(a_unacc(n_local),b_unacc(n_local))
127129
! These allocation arguments will be defined locally and therefore need not be coarrays
128-
allocate(a_acc(n_local),b_acc(n_local))
130+
allocate(a_acc(n_local)[*],b_acc(n_local)[*])
129131

130132
! Register the desired variables for acceleration
131133
call accelerate(a_acc)
@@ -141,10 +143,10 @@ subroutine initialize_all_variables()
141143
a_acc(1:n_local)[i] = a(n_local*(i-1)+1:n_local*i)
142144
b_acc(1:n_local)[i] = b(n_local*(i-1)+1:n_local*i)
143145
end do
144-
sync all
145-
a_unacc=a_acc
146-
b_unacc=b_acc
147146
endif
147+
sync all
148+
a_unacc=a_acc
149+
b_unacc=b_acc
148150
end subroutine
149151

150152
end program

src/tests/integration/gpu/kernelCaller.cu

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
#include <stdio.h>
22
#include <stdlib.h>
33
#include <cuda.h>
4+
#include "cublas_v2.h"
45

56
extern "C" void cudaPrint(int *data, int n, int me, char *name);
67
extern "C" void cudaAdd(int* data_in1, int* data_in2, int *data_out, int n);
78
extern "C" void cudaDot(float* in1, float* in2, float* out, int n);
89

9-
#define MAX_BLOCK_SZ 512
10+
#define MAX_BLOCK_SZ 1024
1011

1112
__global__ void printOnCuda(int *data, int n)
1213
{
@@ -27,11 +28,11 @@ __global__ void assignOnCuda(int *data, int n)
2728
// printf("From CUDA data[%d] = %d\n",i,data[i]);
2829
}
2930

30-
__global__ void Dev_dot(float x[], float y[], float z[], int n) {
31+
__global__ void Dev_dot(float x[], float y[], float z[], long n) {
3132
/* Use tmp to store products of vector components in each block */
3233
/* Can't use variable dimension here */
3334
__shared__ float tmp[MAX_BLOCK_SZ];
34-
int t = blockDim.x * blockIdx.x + threadIdx.x;
35+
long t = blockDim.x * blockIdx.x + threadIdx.x;
3536
int loc_t = threadIdx.x;
3637

3738
if (t < n) tmp[loc_t] = x[t]*y[t];
@@ -70,18 +71,28 @@ extern "C"
7071
void cudaDot(float *in1, float *in2, float *out, int n)
7172
{
7273
float *partial_dot;
73-
int nThreads = 64, i=0;
74+
int nThreads = 1024, i=0;
7475
int nBlocks = ((n-1)/nThreads)+1;
76+
float *d_in1,*d_in2;
77+
// cublasHandle_t handle;
7578

7679
cudaMallocManaged(&partial_dot, nBlocks * sizeof(float));
80+
cudaDeviceSynchronize();
7781

7882
*out = 0.0;
83+
84+
cudaHostGetDevicePointer((void **) &d_in1, (void *) in1, 0);
85+
cudaHostGetDevicePointer((void **) &d_in2, (void *) in2, 0);
7986

80-
Dev_dot<<<nBlocks,nThreads>>>(in1,in2,partial_dot,n);
87+
Dev_dot<<<nBlocks,nThreads>>>(d_in1,d_in2,partial_dot,n);
88+
// cublasCreate (&handle);
89+
// cublasSdot(handle,n,d_in1,1,d_in2,1,out);
8190
cudaDeviceSynchronize();
8291

8392
for(i=0;i<nBlocks;i++)
84-
*out += partial_dot[i];
93+
{
94+
*out += partial_dot[i];
95+
}
8596

8697
cudaFree(partial_dot);
8798

0 commit comments

Comments
 (0)