Skip to content

Commit 64fd9e2

Browse files
authored
Merge pull request #150 from eth-cscs/frontend/prgenv_pgi_kesch
Add regression tests for GPU direct.
2 parents 138e5e0 + 698caee commit 64fd9e2

File tree

8 files changed

+278
-12
lines changed

8 files changed

+278
-12
lines changed

config/cscs.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -107,20 +107,23 @@ class ReframeSettings:
107107
'partitions': {
108108
'login': {
109109
'scheduler': 'local',
110-
'environs': ['PrgEnv-gnu', 'PrgEnv-cray', 'PrgEnv-gdr'],
110+
'environs': ['PrgEnv-gnu', 'PrgEnv-cray',
111+
'PrgEnv-pgi', 'PrgEnv-gnu-gdr'],
111112
'descr': 'Kesch login nodes',
112113
},
113114
'pn': {
114115
'scheduler': 'nativeslurm',
115116
'access': ['--partition=pn-regression'],
116-
'environs': ['PrgEnv-gnu', 'PrgEnv-cray', 'PrgEnv-gdr'],
117+
'environs': ['PrgEnv-gnu', 'PrgEnv-cray',
118+
'PrgEnv-pgi', 'PrgEnv-gnu-gdr'],
117119
'descr': 'Kesch post-processing nodes'
118120
},
119121

120122
'cn': {
121123
'scheduler': 'nativeslurm',
122124
'access': ['--partition=cn-regression'],
123-
'environs': ['PrgEnv-gnu', 'PrgEnv-cray', 'PrgEnv-gdr'],
125+
'environs': ['PrgEnv-gnu', 'PrgEnv-cray',
126+
'PrgEnv-pgi', 'PrgEnv-gnu-gdr'],
124127
'descr': 'Kesch compute nodes',
125128
'resources': {
126129
'_rfm_gpu': ['--gres=gpu:{num_gpus_per_node}']
@@ -199,7 +202,14 @@ class ReframeSettings:
199202
'cxx': 'mpicxx',
200203
'ftn': 'mpif90',
201204
},
202-
'PrgEnv-gdr': {
205+
'PrgEnv-pgi': {
206+
'type': 'ProgEnvironment',
207+
'modules': ['/apps/common/regression/prgenv_pgi_17.10_aj'],
208+
'cc': 'mpicc',
209+
'cxx': 'mpicxx',
210+
'ftn': 'mpif90',
211+
},
212+
'PrgEnv-gnu-gdr': {
203213
'type': 'ProgEnvironment',
204214
'modules': ['gmvapich2/17.02_cuda_8.0_gdr'],
205215
'cc': 'mpicc',

cscs-checks/microbenchmarks/mch/g2g_meteoswiss_check.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ def __init__(self, g2g, **kwargs):
1313
self.descr = 'G2G Meteoswiss check with G2G=%s' % g2g
1414
self.strict_check = False
1515
self.valid_systems = ['kesch:cn']
16-
self.valid_prog_environs = ['PrgEnv-gdr']
16+
self.valid_prog_environs = ['PrgEnv-gnu-gdr']
1717
self.executable = 'src/$EXECUTABLE'
1818
self.sourcesdir = ('https://github.com/MeteoSwiss-APN/'
1919
'comm_overlap_bench.git')
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import os
2+
3+
import reframe.utility.sanity as sn
4+
from reframe.core.pipeline import RegressionTest
5+
6+
class GpuDirectAccCheck(RegressionTest):
    """Regression test built from the Fortran source ``gpu_direct_acc.f90``.

    The check is valid on the ``daint:gpu``, ``dom:gpu`` and ``kesch:cn``
    partitions with ``PrgEnv-cray``.  It requests two tasks, one task per
    node and one GPU per node.  The sanity check extracts the number that
    follows ``Result :`` in ``self.stdout`` and compares it against the
    reference value 1 with thresholds of -1e-5 and 1e-5.
    """

    def __init__(self, **kwargs):
        super().__init__('gpu_direct_acc_check',
                         os.path.dirname(__file__), **kwargs)
        self.valid_systems = ['daint:gpu', 'dom:gpu', 'kesch:cn']
        self.valid_prog_environs = ['PrgEnv-cray']

        # Accelerator module, PGI flags and runtime environment variables
        # depend on the system the check is instantiated on.
        system_name = self.current_system.name
        if system_name in ('daint', 'dom'):
            self.modules = ['craype-accel-nvidia60']
            self._pgi_flags = '-acc -ta=tesla:cc60'
            self.variables = {'MPICH_RDMA_ENABLED_CUDA': '1'}
        elif system_name == 'kesch':
            self.modules = ['craype-accel-nvidia35']
            self._pgi_flags = '-acc -ta=tesla:cc35'
            self.variables = {
                'MPICH_RDMA_ENABLED_CUDA': '1',
                'MV2_USE_CUDA': '1',
                'MV2_USE_GPUDIRECT': '1',
                'G2G': '1',
                'MPICH_G2G_PIPELINE': '1',
            }

        # Source file and job geometry.
        self.sourcepath = 'gpu_direct_acc.f90'
        self.num_tasks = 2
        self.num_tasks_per_node = 1
        self.num_gpus_per_node = 1

        # Sanity: the printed result must match the reference value.
        printed_value = sn.extractsingle(
            r'Result :\s+(?P<result>\d+\.?\d*)',
            self.stdout, 'result', float)
        self.sanity_patterns = sn.assert_reference(
            printed_value, 1., -1e-5, 1e-5)

        self.maintainers = ['AJ', 'VK']
        self.tags = {'production'}

    def setup(self, partition, environ, **job_opts):
        """Set the Fortran flags for the selected environment, then defer
        to the base-class ``setup``.

        ``PrgEnv-cray`` gets ``-hacc -hnoomp``; ``PrgEnv-pgi`` gets the
        system-specific flags stored by ``__init__``.  Any other environment
        is passed through with its flags untouched.
        """
        env_name = environ.name
        if env_name == 'PrgEnv-pgi':
            environ.fflags = self._pgi_flags
        elif env_name == 'PrgEnv-cray':
            environ.fflags = '-hacc -hnoomp'

        super().setup(partition, environ, **job_opts)
44+
45+
46+
def _get_checks(**kwargs):
    """Return the checks defined in this file, built with ``kwargs``."""
    checks = [GpuDirectAccCheck(**kwargs)]
    return checks
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import os
2+
3+
import reframe.utility.sanity as sn
4+
from reframe.core.pipeline import RegressionTest
5+
6+
class GpuDirectCudaCheck(RegressionTest):
    """Regression test built from the CUDA source ``gpu_direct_cuda.cu``.

    The check is valid on the ``daint:gpu``, ``dom:gpu`` and ``kesch:cn``
    partitions.  It uses ``PrgEnv-gnu`` by default and ``PrgEnv-gnu-gdr``
    when instantiated on ``kesch``.  It requests two tasks, one task per
    node and one GPU per node.  The sanity check extracts the number that
    follows ``Result :`` in ``self.stdout`` and compares it against the
    reference value 1 with thresholds of -1e-5 and 1e-5.
    """

    def __init__(self, **kwargs):
        super().__init__('gpu_direct_cuda_check',
                         os.path.dirname(__file__), **kwargs)
        self.valid_systems = ['daint:gpu', 'dom:gpu', 'kesch:cn']
        self.valid_prog_environs = ['PrgEnv-gnu']

        # Runtime environment variables (and, on kesch, the programming
        # environment) depend on the system the check is instantiated on.
        system_name = self.current_system.name
        if system_name in ('daint', 'dom'):
            self.variables = {'MPICH_RDMA_ENABLED_CUDA': '1'}
        elif system_name == 'kesch':
            self.valid_prog_environs = ['PrgEnv-gnu-gdr']
            self.variables = {
                'MPICH_RDMA_ENABLED_CUDA': '1',
                'MV2_USE_CUDA': '1',
                'MV2_USE_GPUDIRECT': '1',
                'MPICH_G2G_PIPELINE': '1',
                'G2G': '1',
            }

        # Source file, required module and job geometry.
        self.sourcepath = 'gpu_direct_cuda.cu'
        self.modules = ['cudatoolkit']
        self.num_tasks = 2
        self.num_tasks_per_node = 1
        self.num_gpus_per_node = 1

        # Sanity: the printed result must match the reference value.
        printed_value = sn.extractsingle(
            r'Result :\s+(?P<result>\d+\.?\d*)',
            self.stdout, 'result', float)
        self.sanity_patterns = sn.assert_reference(
            printed_value, 1., -1e-5, 1e-5)

        self.maintainers = ['AJ', 'VK']
        self.tags = {'production'}

    def compile(self):
        """Set the C++ flags for the current system, then defer to the
        base-class ``compile``.

        On ``kesch`` the host compiler is ``mpicxx`` and the target
        architecture is ``sm_37``; on every other system they are ``CC``
        and ``sm_60``.
        """
        on_kesch = self.current_system.name == 'kesch'
        host_compiler = 'mpicxx' if on_kesch else 'CC'
        sm_version = '37' if on_kesch else '60'

        flags_template = '-ccbin %s -lcublas -lcudart -arch=sm_%s'
        self.current_environ.cxxflags = flags_template % (host_compiler,
                                                          sm_version)
        super().compile()
47+
48+
49+
def _get_checks(**kwargs):
    """Return the checks defined in this file, built with ``kwargs``."""
    checks = [GpuDirectCudaCheck(**kwargs)]
    return checks

cscs-checks/prgenv/openacc_checks.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,19 +3,31 @@
33
import reframe.utility.sanity as sn
44
from reframe.core.pipeline import RegressionTest
55

6-
76
class OpenACCFortranCheck(RegressionTest):
8-
def __init__(self, **kwargs):
9-
super().__init__('openacc_fortran_check',
7+
def __init__(self, num_tasks, **kwargs):
8+
if num_tasks == 1:
9+
check_name = 'openacc_fortran_check'
10+
else:
11+
check_name = 'openacc_mpi_fortran_check'
12+
super().__init__(check_name,
1013
os.path.dirname(__file__), **kwargs)
1114
self.valid_systems = ['daint:gpu', 'dom:gpu', 'kesch:cn']
1215
self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-pgi']
1316
if self.current_system.name in ['daint', 'dom']:
1417
self.modules = ['craype-accel-nvidia60']
15-
16-
self.sourcepath = 'vecAdd_openacc.f90'
18+
self._pgi_flags = '-acc -ta=tesla:cc60'
19+
elif self.current_system.name in ['kesch']:
20+
self.modules = ['craype-accel-nvidia35']
21+
self._pgi_flags = '-acc -ta=tesla:cc35'
22+
23+
self.num_tasks = num_tasks
24+
if self.num_tasks == 1:
25+
self.sourcepath = 'vecAdd_openacc.f90'
26+
else:
27+
self.sourcepath = 'vecAdd_openacc_mpi.f90'
1728
self.num_gpus_per_node = 1
1829
self.executable = self.name
30+
self.num_tasks_per_node = 1
1931

2032
result = sn.extractsingle(r'final result:\s+(?P<result>\d+\.?\d*)',
2133
self.stdout, 'result', float)
@@ -28,10 +40,11 @@ def setup(self, partition, environ, **job_opts):
2840
if environ.name == 'PrgEnv-cray':
2941
environ.fflags = '-hacc -hnoomp'
3042
elif environ.name == 'PrgEnv-pgi':
31-
environ.fflags = '-acc -ta=tesla:cc60'
43+
environ.fflags = self._pgi_flags
3244

3345
super().setup(partition, environ, **job_opts)
3446

3547

3648
def _get_checks(**kwargs):
37-
return [OpenACCFortranCheck(**kwargs)]
49+
return [OpenACCFortranCheck(1, **kwargs),
50+
OpenACCFortranCheck(2, **kwargs)]
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
! Two-rank MPI program: rank 1 sends one integer(8) (its own rank number)
! to rank 0.  Both MPI calls are issued inside an OpenACC host_data region,
! so MPI is handed the device address of the buffer.  Rank 0 prints the
! received value after "Result : ".
program GpuDirectAcc
  implicit none

  include 'mpif.h'

  integer :: ierr
  ! MPI_Recv writes MPI_STATUS_SIZE integers into the status argument, so it
  ! must be an array of that size; a scalar integer would be overrun.
  integer :: status(MPI_STATUS_SIZE)
  integer :: mpi_size, mpi_rank
  integer(8) :: mydata(1)

  call MPI_Init(ierr)

  call MPI_Comm_size(MPI_COMM_WORLD, mpi_size, ierr)
  call MPI_Comm_rank(MPI_COMM_WORLD, mpi_rank, ierr)

  ! The exchange below is hard-wired to ranks 0 and 1.
  if (mpi_size.ne.2) then
    if (mpi_rank.eq.0) write (*,*) "2 MPI ranks required"
    call MPI_Finalize(ierr)
    stop
  end if

  mydata(1) = mpi_rank

  ! copy: mydata is copied to the device on entry and back to the host on
  ! exit, so rank 0 sees the received value after the region ends.
  !$acc data copy(mydata)
  if (mpi_rank.eq.0) then
    ! use_device: pass the device address of mydata to MPI.
    !$acc host_data use_device(mydata)
    call MPI_Recv(mydata, 1, MPI_INTEGER8, 1, 0, MPI_COMM_WORLD, status, ierr)
    !$acc end host_data
  else
    !$acc host_data use_device(mydata)
    call MPI_Send(mydata, 1, MPI_INTEGER8, 0, 0, MPI_COMM_WORLD, ierr)
    !$acc end host_data
  end if
  !$acc end data

  if (mpi_rank.eq.0) then
    write (*,*) "Result : ", mydata
  end if

  call MPI_Finalize(ierr)

end program GpuDirectAcc
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#include <iostream>
2+
#include <mpi.h>
3+
4+
using std::cout;
5+
using std::endl;
6+
7+
int main(int argc, char** argv){
8+
MPI_Status status;
9+
int mpi_size, mpi_rank;
10+
int host_data, *device_data;
11+
12+
MPI_Init(&argc, &argv);
13+
14+
MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
15+
MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
16+
17+
if (mpi_size!=2){
18+
if (mpi_rank==0) cout << "2 MPI ranks required" << endl;
19+
MPI_Finalize();
20+
return(1);
21+
}
22+
23+
host_data = mpi_rank;
24+
cudaMalloc((void **)&device_data, sizeof(int));
25+
26+
cudaMemcpy(device_data, &host_data, sizeof(int), cudaMemcpyHostToDevice);
27+
28+
if (mpi_rank==0){
29+
MPI_Recv(device_data, 1, MPI_INT, 1, 0, MPI_COMM_WORLD, &status);
30+
}else{
31+
MPI_Send(device_data, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
32+
}
33+
34+
cudaMemcpy(&host_data, device_data, sizeof(int), cudaMemcpyDeviceToHost);
35+
36+
cudaFree(device_data);
37+
38+
if (mpi_rank==0){
39+
cout << "Result : " << host_data << endl;
40+
}
41+
42+
MPI_Finalize();
43+
44+
return(0);
45+
}
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
! MPI + OpenACC vector addition.  Every rank builds a(i) = sin(i)^2 and
! b(i) = cos(i)^2, adds them inside an OpenACC kernels region, averages the
! result over n and over the number of ranks, and the per-rank values are
! summed onto rank 0, which prints the total after "final result: ".
program main
  implicit none

  include 'mpif.h'

  ! Size of vectors
  integer, parameter :: n = 100000

  ! Input vectors
  real(8), dimension(:), allocatable :: a
  real(8), dimension(:), allocatable :: b
  ! Output vector
  real(8), dimension(:), allocatable :: c

  integer :: i
  ! Previously implicitly typed; declared explicitly under implicit none.
  integer :: ierr, isize, irank
  real(8) :: total
  ! Receive buffer for non-root ranks, so that send and receive buffers
  ! passed to MPI_Reduce are distinct variables.
  real(8) :: recv_unused

  call MPI_Init(ierr)
  call MPI_Comm_size(MPI_COMM_WORLD, isize, ierr)
  call MPI_Comm_rank(MPI_COMM_WORLD, irank, ierr)

  ! Allocate memory for each vector
  allocate(a(n))
  allocate(b(n))
  allocate(c(n))

  ! Initialize content of input vectors, vector a[i] = sin(i)^2 vector b[i] = cos(i)^2
  do i = 1, n
    a(i) = sin(i*1D0)*sin(i*1D0)
    b(i) = cos(i*1D0)*cos(i*1D0)
  enddo

  ! Sum component wise and save result into vector c

  !$acc kernels copyin(a(1:n),b(1:n)), copyout(c(1:n))
  do i = 1, n
    c(i) = a(i) + b(i)
  enddo
  !$acc end kernels

  total = 0d0
  ! Sum up vector c and print result divided by n, this should equal 1 within error
  do i = 1, n
    total = total + c(i)
  enddo
  ! Each rank contributes 1/isize of the average, so the reduced sum over
  ! all ranks is again the average.
  total = total/n/isize

  if (irank.eq.0) then
    ! MPI_IN_PLACE: the root's contribution is taken from the receive buffer.
    call MPI_Reduce(MPI_IN_PLACE, total, 1, MPI_REAL8, MPI_SUM, 0, MPI_COMM_WORLD, ierr)
    print *, 'final result: ', total
  else
    call MPI_Reduce(total, recv_unused, 1, MPI_REAL8, MPI_SUM, 0, MPI_COMM_WORLD, ierr)
  end if

  ! Release memory
  deallocate(a)
  deallocate(b)
  deallocate(c)

  call MPI_Finalize(ierr)

end program

0 commit comments

Comments
 (0)