Skip to content

Commit 3a39a39

Browse files
author
Vasileios Karakasis
authored
Merge pull request #1687 from jjotero/test/ault-dev
[test] GPU dgemm and pointer chasing tests
2 parents ddecc3f + 26e7dc5 commit 3a39a39

File tree

20 files changed

+1472
-1
lines changed

20 files changed

+1472
-1
lines changed
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
# Copyright 2016-2020 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
2+
# ReFrame Project Developers. See the top-level LICENSE file for details.
3+
#
4+
# SPDX-License-Identifier: BSD-3-Clause
5+
6+
import reframe as rfm
7+
import reframe.utility.sanity as sn
8+
9+
10+
@rfm.simple_test
11+
class GPUdgemmTest(rfm.RegressionTest):
12+
def __init__(self):
13+
self.valid_systems = ['daint:gpu', 'dom:gpu',
14+
'ault:amdv100', 'ault:intelv100',
15+
'ault:amda100', 'ault:amdvega']
16+
self.valid_prog_environs = ['PrgEnv-gnu']
17+
self.num_tasks = 0
18+
self.num_tasks_per_node = 1
19+
self.build_system = 'Make'
20+
self.executable = 'dgemm.x'
21+
self.sanity_patterns = self.assert_num_gpus()
22+
self.perf_patterns = {
23+
'perf': sn.min(sn.extractall(
24+
r'^\s*\[[^\]]*\]\s*GPU\s*\d+: (?P<fp>\S+) TF/s',
25+
self.stdout, 'fp', float))
26+
}
27+
self.reference = {
28+
'dom:gpu': {
29+
'perf': (3.35, -0.1, None, 'TF/s')
30+
},
31+
'daint:gpu': {
32+
'perf': (3.35, -0.1, None, 'TF/s')
33+
},
34+
'ault:amdv100': {
35+
'perf': (5.25, -0.1, None, 'TF/s')
36+
},
37+
'ault:intelv100': {
38+
'perf': (5.25, -0.1, None, 'TF/s')
39+
},
40+
'ault:amda100': {
41+
'perf': (10.5, -0.1, None, 'TF/s')
42+
},
43+
'ault:amdvega': {
44+
'perf': (3.45, -0.1, None, 'TF/s')
45+
}
46+
}
47+
48+
self.maintainers = ['JO', 'SK']
49+
self.tags = {'benchmark'}
50+
51+
@sn.sanity_function
52+
def assert_num_gpus(self):
53+
return sn.assert_eq(
54+
sn.count(sn.findall(r'^\s*\[[^\]]*\]\s*Test passed', self.stdout)),
55+
sn.getattr(self.job, 'num_tasks'))
56+
57+
@rfm.run_before('compile')
58+
def select_makefile(self):
59+
cp = self.current_partition.fullname
60+
if cp == 'ault:amdvega':
61+
self.build_system.makefile = 'makefile.hip'
62+
else:
63+
self.build_system.makefile = 'makefile.cuda'
64+
65+
@rfm.run_before('compile')
66+
def set_gpu_arch(self):
67+
cp = self.current_partition.fullname
68+
69+
# Deal with the NVIDIA options first
70+
nvidia_sm = None
71+
if cp in {'tsa:cn', 'ault:intelv100', 'ault:amdv100'}:
72+
nvidia_sm = '70'
73+
elif cp == 'ault:amda100':
74+
nvidia_sm = '80'
75+
elif cp in {'dom:gpu', 'daint:gpu'}:
76+
nvidia_sm = '60'
77+
78+
if nvidia_sm:
79+
self.build_system.cxxflags += [f'-arch=sm_{nvidia_sm}']
80+
if cp in {'dom:gpu', 'daint:gpu'}:
81+
self.modules += ['craype-accel-nvidia60']
82+
if cp == 'dom:gpu':
83+
self.modules += ['cdt-cuda']
84+
85+
else:
86+
self.modules += ['cuda']
87+
88+
# Deal with the AMD options
89+
amd_trgt = None
90+
if cp == 'ault:amdvega':
91+
amd_trgt = 'gfx906'
92+
93+
if amd_trgt:
94+
self.build_system.cxxflags += [f'--amdgpu-target={amd_trgt}']
95+
self.modules += ['rocm']
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../memory_bandwidth/src/Xdevice
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
/*
2+
* Basic DGEMM test
3+
*
4+
* Multiply two matrices of dimensions SIZE*SIZE filled with ones. Therefore,
5+
* all the elements of the resulting matrix will be just SIZE.
6+
*/
7+
8+
#define SIZE 1024
9+
#define REPEAT 30
10+
11+
#include <iostream>
12+
#include <unistd.h>
13+
#include <thread>
14+
#include <mutex>
15+
#include <vector>
16+
#include <algorithm>
17+
#include <functional>
18+
19+
#include "Xdevice/runtime.hpp"
20+
#include "Xdevice/blas.hpp"
21+
22+
23+
namespace kernels
24+
{
25+
template<class T>
26+
__global__ void init_as_ones(T * arr, size_t size)
27+
{
28+
unsigned int tid = threadIdx.x + blockIdx.x*blockDim.x;
29+
if (tid < size)
30+
{
31+
arr[tid] = (T)1.0;
32+
}
33+
}
34+
35+
template<class T>
36+
__global__ void verify(T * arr, size_t size, int * err)
37+
{
38+
unsigned int tid = threadIdx.x + blockIdx.x*blockDim.x;
39+
if (tid < size)
40+
{
41+
if (int(arr[tid]) != SIZE)
42+
atomicAdd(err, 1);
43+
}
44+
}
45+
}
46+
47+
/*
48+
* This code uses a thread per device in the node.
49+
* For simplicity, we define the variables below as global.
50+
*/
51+
52+
#define HOST_NAME_SIZE 128
53+
char hostname[HOST_NAME_SIZE];
54+
double tflops = SIZE*SIZE*SIZE*2.0 * 1E-12;
55+
int totalErrors = 0;
56+
std::mutex mtx;
57+
58+
#define BLOCK_SIZE 128
59+
void dgemm(int device)
60+
{
61+
XSetDevice(device);
62+
63+
double * A;
64+
double * B;
65+
double * C;
66+
const double alpha = 1.0;
67+
const double beta = 0.0;
68+
69+
XMalloc((void**)&A, sizeof(double)*SIZE*SIZE);
70+
XMalloc((void**)&B, sizeof(double)*SIZE*SIZE);
71+
XMalloc((void**)&C, sizeof(double)*SIZE*SIZE);
72+
73+
kernels::init_as_ones<double><<<(SIZE*SIZE+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(A, SIZE*SIZE);
74+
kernels::init_as_ones<double><<<(SIZE*SIZE+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(B, SIZE*SIZE);
75+
XDeviceSynchronize();
76+
77+
XStream_t stream;
78+
XStreamCreate(&stream);
79+
XblasHandle_t blas_handle;
80+
XblasCreate(&blas_handle);
81+
XblasSetStream(blas_handle, stream);
82+
83+
// Warmup call
84+
XblasDgemm(blas_handle,
85+
XBLAS_OP_N, XBLAS_OP_N,
86+
SIZE, SIZE, SIZE,
87+
&alpha,
88+
(const double*)A, SIZE,
89+
(const double*)B, SIZE,
90+
&beta,
91+
C, SIZE);
92+
XDeviceSynchronize();
93+
94+
// Time the execution
95+
XTimer t(stream);
96+
t.start();
97+
for (int i = 0; i < REPEAT; i++)
98+
{
99+
XblasDgemm(blas_handle,
100+
XBLAS_OP_N, XBLAS_OP_N,
101+
SIZE, SIZE, SIZE,
102+
&alpha,
103+
(const double*)A, SIZE,
104+
(const double*)B, SIZE,
105+
&beta,
106+
C, SIZE);
107+
}
108+
109+
// Calc the performance data in TFlops/sec
110+
double perf = tflops/(t.stop()/REPEAT/1000.0);
111+
112+
XblasDestroy(blas_handle);
113+
XStreamDestroy(stream);
114+
115+
// Verify that the final values of C are correct.
116+
int * err, h_err = 0;
117+
XMalloc((void**)&err, sizeof(int));
118+
XMemcpy( err, &h_err, sizeof(int), XMemcpyHostToDevice);
119+
kernels::verify<double><<<(SIZE+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(C, SIZE*SIZE, err);
120+
XMemcpy(&h_err, err, sizeof(int), XMemcpyDeviceToHost);
121+
{
122+
std::lock_guard<std::mutex> lg(mtx);
123+
totalErrors += h_err;
124+
125+
// Print the performance results
126+
printf("[%s] GPU %d: %4.2f TF/s\n", hostname, device, (float)perf);
127+
}
128+
XFree(A);
129+
XFree(B);
130+
XFree(C);
131+
132+
}
133+
134+
int main(int argc, char **argv)
135+
{
136+
137+
gethostname(hostname, sizeof(hostname));
138+
139+
int num_devices;
140+
XGetDeviceCount(&num_devices);
141+
142+
// Print device count
143+
printf("[%s] Found %d device(s).\n", hostname, num_devices);
144+
145+
// Create vector of threads.
146+
std::vector<std::thread> threads;
147+
148+
// Do the dgemm for all devices in the node.
149+
for (int device = 0; device < num_devices; device++)
150+
{
151+
threads.push_back(std::thread(dgemm,device));
152+
}
153+
154+
// Join all threads
155+
std::for_each(threads.begin(), threads.end(), std::mem_fn(&std::thread::join));
156+
157+
// Test if there were any errors and print the test result.
158+
printf("[%s] Test %s\n", hostname, totalErrors == 0 ? "passed" : "failed");
159+
160+
return 0;
161+
}
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
dgemm:
2+
nvcc [email protected] -o [email protected] ${CXXFLAGS} -lnvidia-ml -lcublas -std=c++14
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
CXXFLAGS?=--amdgpu-target=gfx906,gfx908
2+
ROCM_ROOT?=/opt/rocm
3+
RSMI_ROOT?=/opt/rocm/rocm_smi
4+
5+
dgemm:
6+
hipcc -O3 [email protected] -o [email protected] -DTARGET_HIP ${CXXFLAGS} -std=c++14 -I${ROCM_ROOT} -I${RSMI_ROOT}/include -lnuma -lrocm_smi64 -lrocblas

cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/blas.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@ void XblasDestroy(cublasHandle_t handle)
3939
checkError( cublasDestroy(handle) );
4040
}
4141

42+
void XblasSetStream(cublasHandle_t h, cudaStream_t s)
43+
{
44+
checkError ( cublasSetStream(h, s) );
45+
}
4246

4347
auto XblasDgemm = cublasDgemm;
4448
auto XblasSgemm = cublasSgemm;

cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/smi.hpp

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@
55
#include <unistd.h>
66
#include <nvml.h>
77

8+
/*
9+
* NVML - SMI tools
10+
*/
11+
812
static inline void nvmlCheck(nvmlReturn_t err)
913
{
1014
# ifdef DEBUG
@@ -80,4 +84,96 @@ Smi::~Smi()
8084
}
8185
}
8286

87+
88+
/*
89+
* ASM tools
90+
*/
91+
92+
__device__ __forceinline__ uint32_t XClock()
93+
{
94+
// Clock counter
95+
uint32_t x;
96+
asm volatile ("mov.u32 %0, %%clock;" : "=r"(x) :: "memory");
97+
return x;
98+
}
99+
100+
__device__ __forceinline__ uint64_t XClock64()
101+
{
102+
// Clock counter
103+
uint64_t x;
104+
asm volatile ("mov.u64 %0, %%clock64;" : "=l"(x) :: "memory");
105+
return x;
106+
}
107+
108+
__device__ __forceinline__ uint32_t XSyncClock()
109+
{
110+
// Clock counter with a preceeding barrier.
111+
uint32_t x;
112+
asm volatile ("bar.sync 0;\n\t"
113+
"mov.u32 %0, %%clock;" : "=r"(x) :: "memory");
114+
return x;
115+
}
116+
117+
__device__ __forceinline__ uint64_t XSyncClock64()
118+
{
119+
// Clock counter with a preceeding barrier.
120+
uint64_t x;
121+
asm volatile ("bar.sync 0;\n\t"
122+
"mov.u64 %0, %%clock64;" : "=l"(x) :: "memory");
123+
return x;
124+
}
125+
126+
127+
template<class T = uint32_t>
128+
class __XClocks
129+
{
130+
/*
131+
* XClocks timer tool
132+
* Tracks the number of clock cycles between a call to the start
133+
* and end member functions.
134+
*/
135+
public:
136+
T startClock;
137+
__device__ void start()
138+
{
139+
startClock = XSyncClock();
140+
}
141+
__device__ T end()
142+
{
143+
return XClock() - startClock;
144+
}
145+
};
146+
147+
template<>
148+
void __XClocks<uint64_t>::start()
149+
{
150+
this->startClock = XSyncClock64();
151+
}
152+
153+
template<>
154+
uint64_t __XClocks<uint64_t>::end()
155+
{
156+
return XClock64() - this->startClock;
157+
}
158+
159+
using XClocks64 = __XClocks<uint64_t>;
160+
using XClocks = __XClocks<>;
161+
162+
163+
template<class T>
164+
__device__ T XClockLatency()
165+
{
166+
uint64_t start = XClock64();
167+
uint64_t end = XClock64();
168+
return (T)(end-start);
169+
}
170+
171+
__device__ __forceinline__ int __smId()
172+
{
173+
// SM ID
174+
uint32_t x;
175+
asm volatile ("mov.u32 %0, %%smid;" : "=r"(x) :: "memory");
176+
return (int)x;
177+
}
178+
83179
#endif

cscs-checks/microbenchmarks/gpu/memory_bandwidth/src/Xdevice/cuda/types.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,6 @@ XMemcpyKind XMemcpyDeviceToDevice = cudaMemcpyDeviceToDevice;
1313
XMemcpyKind XMemcpyHostToHost = cudaMemcpyHostToHost;
1414
XMemcpyKind XMemcpyDefault = cudaMemcpyDefault;
1515

16+
#define XHostAllocMapped cudaHostAllocMapped
17+
1618
#endif

0 commit comments

Comments
 (0)