Skip to content

Commit 2c5815d

Browse files
authored
Merge pull request #564 from teojgo/regression_test/gpu_burn_multi_gpu
[test] Adapt the GPU burn test for multi-gpu nodes
2 parents b42b096 + 23e03b4 commit 2c5815d

File tree

2 files changed

+671
-0
lines changed

2 files changed

+671
-0
lines changed
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
import os
2+
3+
import reframe as rfm
4+
import reframe.utility.sanity as sn
5+
6+
7+
@rfm.simple_test
class GpuBurnTest(rfm.RegressionTest):
    """GPU burn regression test, adapted for multi-GPU nodes.

    Builds ``gpu_burn.cu`` as a single-source program. The run passes when
    the number of 'OK' matches in stdout equals the number of job tasks
    times the GPUs per node; the reported performance is the lowest GF/s
    value extracted from stdout.
    """

    def __init__(self):
        super().__init__()
        self.descr = 'GPU burn test'
        self.valid_systems = ['daint:gpu', 'dom:gpu', 'kesch:cn']
        self.valid_prog_environs = ['PrgEnv-gnu']

        # Per-system settings: accelerator module, run time (secs),
        # GPUs per node and the CUDA architecture to compile for.
        #
        # NOTE: The first executable option indicates the precision
        # (-d for double) while the second is the time (in secs) to run
        # the test. For multi-gpu nodes, we run the gpu burn test for
        # more time to get reliable measurements.
        on_kesch = self.current_system.name == 'kesch'
        accel_module, burn_secs, gpu_count, gpu_arch = (
            ('craype-accel-nvidia35', '40', 16, '37') if on_kesch
            else ('craype-accel-nvidia60', '20', 1, '60')
        )
        self.modules = [accel_module]
        self.executable_opts = ['-d', burn_secs]
        self.num_gpus_per_node = gpu_count

        # Build setup; the build system must be assigned before its
        # flags can be set.
        self.sourcepath = 'gpu_burn.cu'
        self.build_system = 'SingleSource'
        self.build_system.cxxflags = [
            '-arch=compute_%s' % gpu_arch,
            '-code=sm_%s' % gpu_arch,
        ]
        self.build_system.ldflags = ['-lcuda', '-lcublas', '-lnvidia-ml']

        # Sanity: the count of 'OK' matches must equal tasks * GPUs/node.
        ok_count = sn.count(sn.findall('OK', self.stdout))
        self.sanity_patterns = sn.assert_eq(ok_count,
                                            self.num_tasks_assigned)

        # Performance: take the minimum of all per-GPU GF/s figures.
        gflops_per_gpu = sn.extractall(
            r'GPU\s+\d+\(\S*\): (?P<perf>\S*) GF\/s', self.stdout,
            'perf', float)
        self.perf_patterns = {'perf': sn.min(gflops_per_gpu)}

        # Reference tuples as given: (value, -0.10, None).
        self.reference = {
            'dom:gpu': {'perf': (4115, -0.10, None)},
            'daint:gpu': {'perf': (4115, -0.10, None)},
            'kesch:cn': {'perf': (950, -0.10, None)},
        }

        # num_tasks is left at 0 with one task per node; the actual
        # task count is read back from the job in num_tasks_assigned.
        self.num_tasks = 0
        self.num_tasks_per_node = 1

        self.maintainers = ['AJ', 'VK', 'TM']

    @property
    @sn.sanity_function
    def num_tasks_assigned(self):
        """Return the job's task count multiplied by the GPUs per node."""
        return self.job.num_tasks * self.num_gpus_per_node

0 commit comments

Comments
 (0)