Skip to content

Commit 42c8e33

Browse files
author
Theofilos Manitaras
committed
Make test flexible and support multiple gpus
1 parent 0b7e28b commit 42c8e33

File tree

2 files changed

+51
-24
lines changed

2 files changed

+51
-24
lines changed

cscs-checks/microbenchmarks/kernel_latency/kernel_latency.py

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import reframe.utility.sanity as sn
33

44

5+
@rfm.required_version('>=2.16-dev0')
56
@rfm.parameterized_test(['sync'], ['async'])
67
class KernelLatencyTest(rfm.RegressionTest):
78
def __init__(self, kernel_version):
@@ -10,13 +11,16 @@ def __init__(self, kernel_version):
1011
self.build_system = 'SingleSource'
1112
self.valid_systems = ['daint:gpu', 'dom:gpu', 'kesch:cn']
1213
self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-pgi']
13-
self.num_gpus_per_node = 1
14+
self.num_tasks = 0
15+
self.num_tasks_per_node = 1
1416

1517
if self.current_system.name in {'dom', 'daint'}:
18+
self.num_gpus_per_node = 1
1619
gpu_arch = '60'
1720
self.modules = ['craype-accel-nvidia60']
1821
self.valid_prog_environs += ['PrgEnv-gnu']
1922
else:
23+
self.num_gpus_per_node = 16
2024
self.modules = ['craype-accel-nvidia35']
2125
gpu_arch = '37'
2226

@@ -28,12 +32,21 @@ def __init__(self, kernel_version):
2832
else:
2933
self.build_system.cppflags = ['-D SYNCKERNEL=0']
3034

31-
self.sanity_patterns = sn.assert_found(r'Found \d+ gpu\(s\)',
32-
self.stdout)
35+
self.sanity_patterns = sn.all([
36+
sn.assert_eq(
37+
sn.count(sn.findall(r'\[\S+\] Found \d+ gpu\(s\)',
38+
self.stdout)),
39+
self.num_tasks_assigned),
40+
sn.assert_eq(
41+
sn.count(sn.findall(r'\[\S+\] \[gpu \d+\] Kernel launch '
42+
r'latency: \S+ us', self.stdout)),
43+
self.num_tasks_assigned * self.num_gpus_per_node)
44+
])
45+
3346
self.perf_patterns = {
34-
'latency': sn.extractsingle(
35-
r'Kernel launch latency: (?P<latency>\S+) us',
36-
self.stdout, 'latency', float)
47+
'latency': sn.max(sn.extractall(
48+
r'\[\S+\] \[gpu \d+\] Kernel launch latency: '
49+
r'(?P<latency>\S+) us', self.stdout, 'latency', float))
3750
}
3851
self.sys_reference = {
3952
'sync': {
@@ -63,4 +76,9 @@ def __init__(self, kernel_version):
6376
self.reference = self.sys_reference[kernel_version]
6477

6578
self.maintainers = ['TM']
66-
self.tags = {'production'}
79+
self.tags = {'benchmark', 'diagnostic'}
80+
81+
@property
82+
@sn.sanity_function
83+
def num_tasks_assigned(self):
84+
return self.job.num_tasks

cscs-checks/microbenchmarks/kernel_latency/src/kernel_latency.cu

Lines changed: 26 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,49 +1,58 @@
11
#include <iostream>
22
#include <chrono>
33
#include <ratio>
4+
#include <unistd.h>
45
#include <cuda.h>
56

67
// Intentionally empty kernel: it performs no work, so timing its launches
// measures only the launch (and, optionally, synchronization) overhead.
__global__ void null_kernel()
{
}
89

910
/*
 * Measures the average cost of launching an empty kernel on every GPU that
 * is visible to this process and prints one result line per device.
 *
 * Output (stdout), every line prefixed with "[<hostname>] ":
 *   - "Found <N> gpu(s)" once, after the device count was obtained;
 *   - "[gpu <i>] Kernel launch latency: <t> us" once per device, where <t>
 *     is the wall time of 1000 launches divided by 1000.
 *
 * Compile-time switch:
 *   SYNCKERNEL == 1  -> the device is synchronized after every launch;
 *   otherwise        -> all launches are queued and the device is
 *                       synchronized once after the loop.
 *
 * Returns 0 on success and 1 if the device count cannot be obtained, no
 * device is found, or any CUDA call fails for one of the devices.
 */
int main(int argc, char* argv[]) {
    (void)argc;
    (void)argv;

    // Zero-initialise so the name is always NUL-terminated, even when
    // gethostname() truncates it; fall back to a fixed tag if the call
    // fails, so that the "[<hostname>]" prefix is never empty or garbage.
    char hostname[256] = {0};
    const char* host =
        (gethostname(hostname, sizeof(hostname) - 1) == 0 && hostname[0] != '\0')
            ? hostname
            : "unknown";

    int gpu_count = 0;
    cudaError_t error = cudaGetDeviceCount(&gpu_count);

    if (error != cudaSuccess) {
        std::cout << "[" << host << "] " << "Error getting gpu count, exiting...\n";
        return 1;
    }
    if (gpu_count <= 0) {
        std::cout << "[" << host << "] " << "Could not find any gpu\n";
        return 1;
    }
    std::cout << "[" << host << "] " << "Found " << gpu_count << " gpu(s)\n";

    const int kernel_count = 1000;

    for (int dev = 0; dev < gpu_count; ++dev) {
        error = cudaSetDevice(dev);
        if (error != cudaSuccess) {
            std::cout << "[" << host << "] " << "[gpu " << dev << "] "
                      << "CUDA error: " << cudaGetErrorString(error) << "\n";
            return 1;
        }

        // Single kernel launch to initialize the cuda runtime on this
        // device. Wait for it to finish so that neither the warm-up kernel
        // nor the lazy initialisation leaks into the timed region below.
        null_kernel<<<1, 1>>>();
        error = cudaDeviceSynchronize();
        if (error != cudaSuccess) {
            std::cout << "[" << host << "] " << "[gpu " << dev << "] "
                      << "CUDA error: " << cudaGetErrorString(error) << "\n";
            return 1;
        }

        // steady_clock is monotonic; system_clock may jump (e.g. NTP
        // adjustments) and is therefore unsuitable for measuring intervals.
        auto t_start = std::chrono::steady_clock::now();

        for (int launch = 0; launch < kernel_count; ++launch) {
            null_kernel<<<1, 1>>>();
#if SYNCKERNEL == 1
            cudaDeviceSynchronize();
#endif
        }

#if SYNCKERNEL != 1
        cudaDeviceSynchronize();
#endif

        auto t_end = std::chrono::steady_clock::now();

        // Error checking is done outside the timed region so the measured
        // path stays unchanged. A failed launch is reported by
        // cudaGetLastError(); an asynchronous execution error is returned
        // by the following synchronization.
        error = cudaGetLastError();
        if (error == cudaSuccess) {
            error = cudaDeviceSynchronize();
        }
        if (error != cudaSuccess) {
            std::cout << "[" << host << "] " << "[gpu " << dev << "] "
                      << "CUDA error: " << cudaGetErrorString(error) << "\n";
            return 1;
        }

        std::cout << "[" << host << "] " << "[gpu " << dev << "] "
                  << "Kernel launch latency: "
                  << std::chrono::duration_cast<std::chrono::duration<double, std::micro>>(t_end - t_start).count() / kernel_count
                  << " us\n";
    }

    return 0;
}

0 commit comments

Comments
 (0)