Skip to content

Commit 1d187c6

Browse files
authored
Merge branch 'master' into enhancement/new_syntax
2 parents ebe0ee8 + 676c076 commit 1d187c6

File tree

25 files changed

+928
-115
lines changed

25 files changed

+928
-115
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,4 +61,4 @@ For unsubscribing, you may send an empty message to [[email protected]
6161

6262
### Slack
6363

64-
You may also reach the community through Slack at [reframetalk.slack.com](https://reframetalk.slack.com/join/signup). Currently, you may join the Slack workspace by invitation only, which you will get as soon as you subscribe to the mailing list.
64+
You may also reach the community through Slack [here](https://reframe-slack.herokuapp.com).
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
import reframe as rfm
2+
import reframe.utility.sanity as sn
3+
4+
5+
@rfm.simple_test
class Stencil4HPXCheck(rfm.RunOnlyRegressionTest):
    """Run-only check for the HPX ``1d_stencil_4`` example.

    Launches the prebuilt executable (provided by the ``HPX`` module) and
    extracts the reported execution time from its CSV-like output line:
    ``<threads>, <time>, <points>, <parts>, <steps>``.
    """

    def __init__(self):
        super().__init__()

        self.descr = 'HPX 1d_stencil_4 check'
        # FIX: 'daint:gpu, daint:mc' was a single malformed string; every
        # system:partition name must be its own list element, otherwise the
        # daint partitions would never match.
        self.valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc']
        self.valid_prog_environs = ['PrgEnv-gnu']

        self.modules = ['HPX']
        self.executable = '1d_stencil_4'

        self.nt_opts = '100'        # number of time steps
        self.np_opts = '100'        # number of partitions
        self.nx_opts = '10000000'   # number of points per partition
        self.executable_opts = ['--nt', self.nt_opts,
                                '--np', self.np_opts,
                                '--nx', self.nx_opts]
        self.sourcesdir = None

        self.use_multithreading = None

        # FIX: escape the decimal point; a bare '.' matched any character.
        self.perf_patterns = {
            'time': sn.extractsingle(r'\d+,\s*(?P<time>(\d+)?\.?\d+),\s*\d+,'
                                     r'\s*\d+,\s*\d+',
                                     self.stdout, 'time', float)
        }
        self.reference = {
            'dom:gpu': {
                'time': (42, None, 0.1, 's')
            },
            'dom:mc': {
                'time': (30, None, 0.1, 's')
            },
            'daint:gpu': {
                'time': (42, None, 0.1, 's')
            },
            'daint:mc': {
                'time': (30, None, 0.1, 's')
            },
        }

        self.tags = {'production'}
        self.maintainers = ['VH', 'JG']

    def setup(self, partition, environ, **job_opts):
        # Deferred scan of every output line of the form
        # <tid>, <time>, <pts>, <parts>, <steps>; evaluated during sanity.
        result = sn.findall(r'(?P<tid>\d+),\s*(?P<time>(\d+)?\.?\d+),'
                            r'\s*(?P<pts>\d+),\s*(?P<parts>\d+),'
                            r'\s*(?P<steps>\d+)',
                            self.stdout)

        # Single-node run; thread count matches the partition's core count
        # (12 on the GPU partitions, 36 on the MC partitions).
        if partition.fullname == 'daint:gpu':
            self.num_tasks = 1
            self.num_tasks_per_node = 1
            self.num_cpus_per_task = 12
        elif partition.fullname == 'daint:mc':
            self.num_tasks = 1
            self.num_tasks_per_node = 1
            self.num_cpus_per_task = 36
        elif partition.fullname == 'dom:gpu':
            self.num_tasks = 1
            self.num_tasks_per_node = 1
            self.num_cpus_per_task = 12
        elif partition.fullname == 'dom:mc':
            self.num_tasks = 1
            self.num_tasks_per_node = 1
            self.num_cpus_per_task = 36

        self.executable_opts += ['--hpx:threads=%s' % self.num_cpus_per_task]

        # Every reported line must echo back the requested configuration.
        assert_num_threads = sn.map(lambda x: sn.assert_eq(
            int(x.group('tid')), self.num_cpus_per_task), result)
        assert_num_points = sn.map(lambda x: sn.assert_eq(
            x.group('pts'), self.nx_opts), result)
        assert_num_parts = sn.map(lambda x: sn.assert_eq(x.group('parts'),
                                                         self.np_opts), result)
        assert_num_steps = sn.map(lambda x: sn.assert_eq(x.group('steps'),
                                                         self.nt_opts), result)

        self.sanity_patterns = sn.all(sn.chain(assert_num_threads,
                                               assert_num_points,
                                               assert_num_parts,
                                               assert_num_steps))

        super().setup(partition, environ, **job_opts)
90+
91+
92+
@rfm.simple_test
class Stencil8HPXCheck(rfm.RunOnlyRegressionTest):
    """Run-only check for the distributed HPX ``1d_stencil_8`` example.

    Launches the prebuilt executable (provided by the ``HPX`` module) and
    extracts the reported execution time from its CSV-like output line:
    ``<localities>, <threads>, <time>, <points>, <parts>, <steps>``.
    """

    def __init__(self):
        super().__init__()

        self.descr = 'HPX 1d_stencil_8 check'
        # FIX: 'daint:gpu, daint:mc' was a single malformed string; every
        # system:partition name must be its own list element, otherwise the
        # daint partitions would never match.
        self.valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc']
        self.valid_prog_environs = ['PrgEnv-gnu']

        self.modules = ['HPX']
        self.executable = '1d_stencil_8'

        self.nt_opts = '100'        # number of time steps
        self.np_opts = '100'        # number of partitions
        self.nx_opts = '10000000'   # number of points per partition
        self.executable_opts = ['--nt', self.nt_opts,
                                '--np', self.np_opts,
                                '--nx', self.nx_opts]
        self.sourcesdir = None

        self.use_multithreading = None

        # FIX: escape the decimal point; a bare '.' matched any character.
        self.perf_patterns = {
            'time': sn.extractsingle(r'\d+,\s*\d+,\s*(?P<time>(\d+)?\.?\d+),'
                                     r'\s*\d+,\s*\d+,\s*\d+',
                                     self.stdout, 'time', float)
        }
        self.reference = {
            'dom:gpu': {
                'time': (26, None, 0.1, 's')
            },
            'dom:mc': {
                'time': (19, None, 0.1, 's')
            },
            'daint:gpu': {
                'time': (26, None, 0.1, 's')
            },
            'daint:mc': {
                'time': (19, None, 0.1, 's')
            },
        }

        self.tags = {'production'}
        self.maintainers = ['VH', 'JG']

    def setup(self, partition, environ, **job_opts):
        # Deferred scan of every output line of the form
        # <lid>, <tid>, <time>, <pts>, <parts>, <steps>.
        result = sn.findall(r'(?P<lid>\d+),\s*(?P<tid>\d+),'
                            r'\s*(?P<time>(\d+)?\.?\d+),'
                            r'\s*(?P<pts>\d+),'
                            r'\s*(?P<parts>\d+),'
                            r'\s*(?P<steps>\d+)', self.stdout)

        # Multi-locality run: 2 tasks on the GPU partitions, 4 tasks
        # (one per socket) on the MC partitions.
        if partition.fullname == 'daint:gpu':
            self.num_tasks = 2
            self.num_tasks_per_node = 1
            self.num_cpus_per_task = 12
        elif partition.fullname == 'daint:mc':
            self.num_tasks = 4
            self.num_tasks_per_node = 2
            self.num_cpus_per_task = 18
            self.num_tasks_per_socket = 1
        elif partition.fullname == 'dom:gpu':
            self.num_tasks = 2
            self.num_tasks_per_node = 1
            self.num_cpus_per_task = 12
        elif partition.fullname == 'dom:mc':
            self.num_tasks = 4
            self.num_tasks_per_node = 2
            self.num_cpus_per_task = 18
            self.num_tasks_per_socket = 1

        self.executable_opts += ['--hpx:threads=%s' % self.num_cpus_per_task]

        # Total worker threads across all localities; every reported line
        # must echo back the requested configuration.
        # NOTE(review): 'lid' is compared for equality with num_tasks —
        # presumably the output reports the locality count; confirm against
        # the 1d_stencil_8 output format.
        num_threads = self.num_tasks * self.num_cpus_per_task
        assert_num_tasks = sn.map(lambda x: sn.assert_eq(int(x.group('lid')),
                                                         self.num_tasks), result)
        assert_num_threads = sn.map(lambda x: sn.assert_eq(int(x.group('tid')),
                                                           num_threads), result)
        assert_num_points = sn.map(lambda x: sn.assert_eq(x.group('pts'),
                                                          self.nx_opts), result)
        assert_num_parts = sn.map(lambda x: sn.assert_eq(x.group('parts'),
                                                         self.np_opts), result)
        assert_num_steps = sn.map(lambda x: sn.assert_eq(x.group('steps'),
                                                         self.nt_opts), result)

        self.sanity_patterns = sn.all(sn.chain(assert_num_tasks,
                                               assert_num_threads,
                                               assert_num_points,
                                               assert_num_parts,
                                               assert_num_steps))

        super().setup(partition, environ, **job_opts)
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
import reframe as rfm
2+
import reframe.utility.sanity as sn
3+
4+
5+
@rfm.required_version('>=2.16-dev0')
@rfm.parameterized_test(['sync'], ['async'])
class KernelLatencyTest(rfm.RegressionTest):
    """Measure CUDA kernel launch latency (in microseconds).

    Compiles ``kernel_latency.cu`` with ``SYNCKERNEL=1`` ('sync' variant:
    device-synchronize after every launch) or ``SYNCKERNEL=0`` ('async'
    variant: single synchronize after all launches) and reports the maximum
    per-GPU latency across all assigned tasks.
    """

    def __init__(self, kernel_version):
        super().__init__()
        self.sourcepath = 'kernel_latency.cu'
        self.build_system = 'SingleSource'
        self.valid_systems = ['daint:gpu', 'dom:gpu', 'kesch:cn']
        self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-pgi']
        self.num_tasks = 0          # flexible: one task per available node
        self.num_tasks_per_node = 1

        # P100 (sm_60) on daint/dom, K80 (sm_37) on kesch.
        if self.current_system.name in {'dom', 'daint'}:
            self.num_gpus_per_node = 1
            gpu_arch = '60'
            self.modules = ['craype-accel-nvidia60']
            self.valid_prog_environs += ['PrgEnv-gnu']
        else:
            self.num_gpus_per_node = 16
            self.modules = ['craype-accel-nvidia35']
            gpu_arch = '37'

        self.build_system.cxxflags = ['-arch=compute_%s' % gpu_arch,
                                      '-code=sm_%s' % gpu_arch, '-std=c++11']

        if kernel_version == 'sync':
            self.build_system.cppflags = ['-D SYNCKERNEL=1']
        else:
            self.build_system.cppflags = ['-D SYNCKERNEL=0']

        # Every task must report its GPU count and one latency line per GPU.
        self.sanity_patterns = sn.all([
            sn.assert_eq(
                sn.count(sn.findall(r'\[\S+\] Found \d+ gpu\(s\)',
                                    self.stdout)),
                self.num_tasks_assigned),
            sn.assert_eq(
                sn.count(sn.findall(r'\[\S+\] \[gpu \d+\] Kernel launch '
                                    r'latency: \S+ us', self.stdout)),
                self.num_tasks_assigned * self.num_gpus_per_node)
        ])

        self.perf_patterns = {
            'latency': sn.max(sn.extractall(
                r'\[\S+\] \[gpu \d+\] Kernel launch latency: '
                r'(?P<latency>\S+) us', self.stdout, 'latency', float))
        }
        # FIX: the latency is extracted in microseconds ('us'); the
        # 'sync'/dom:gpu and 'async'/daint:gpu entries wrongly used 's'.
        self.sys_reference = {
            'sync': {
                'dom:gpu': {
                    'latency': (6.6, None, 0.10, 'us')
                },
                'daint:gpu': {
                    'latency': (6.6, None, 0.10, 'us')
                },
                'kesch:cn': {
                    'latency': (12.0, None, 0.10, 'us')
                },
            },
            'async': {
                'dom:gpu': {
                    'latency': (2.2, None, 0.10, 'us')
                },
                'daint:gpu': {
                    'latency': (2.2, None, 0.10, 'us')
                },
                'kesch:cn': {
                    'latency': (5.7, None, 0.10, 'us')
                },
            },
        }

        self.reference = self.sys_reference[kernel_version]

        self.maintainers = ['TM']
        self.tags = {'benchmark', 'diagnostic'}

    @property
    @sn.sanity_function
    def num_tasks_assigned(self):
        # Deferred: the actual task count is only known after the job runs
        # (num_tasks == 0 requests a flexible allocation).
        return self.job.num_tasks
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#include <iostream>
2+
#include <chrono>
3+
#include <ratio>
4+
#include <unistd.h>
5+
#include <cuda.h>
6+
7+
// Empty device kernel: launching it measures pure launch overhead.
__global__ void null_kernel() {}
9+
10+
// Times the average launch latency of an empty kernel on every visible GPU.
// Output format (parsed by the ReFrame check):
//   [<hostname>] Found <n> gpu(s)
//   [<hostname>] [gpu <i>] Kernel launch latency: <x> us
int main(int argc, char* argv[]) {

    char hostname[256];
    hostname[255]='\0';
    gethostname(hostname, 255);

    cudaError_t error;
    int gpu_count = 0;

    error = cudaGetDeviceCount(&gpu_count);

    if (error == cudaSuccess) {
        if (gpu_count <= 0) {
            std::cout << "[" << hostname << "] " << "Could not find any gpu\n";
            return 1;
        }
        std::cout << "[" << hostname << "] " << "Found " << gpu_count << " gpu(s)\n";
    }
    else{
        std::cout << "[" << hostname << "] " << "Error getting gpu count, exiting...\n";
        return 1;
    }

    for (int i = 0; i < gpu_count; i++) {

        cudaSetDevice(i);
        // Single kernel launch to initialize cuda runtime
        null_kernel<<<1, 1>>>();

        // FIX: use the monotonic steady_clock for interval timing;
        // system_clock may jump (e.g. NTP adjustments) mid-measurement.
        auto t_start = std::chrono::steady_clock::now();
        const int kernel_count = 1000;

        // FIX: inner loop counter renamed to 'k' — it previously shadowed
        // the outer gpu index 'i'.
        for (int k = 0; k < kernel_count; ++k) {
            null_kernel<<<1, 1>>>();
#if SYNCKERNEL == 1
            // sync variant: wait for each launch to complete
            cudaDeviceSynchronize();
#endif
        }

#if SYNCKERNEL != 1
        // async variant: wait once after all launches
        cudaDeviceSynchronize();
#endif

        auto t_end = std::chrono::steady_clock::now();
        // Average per-launch latency in microseconds.
        std::cout << "[" << hostname << "] " << "[gpu " << i << "] " << "Kernel launch latency: " << std::chrono::duration_cast<std::chrono::duration<double, std::micro>>(t_end - t_start).count() / kernel_count << " us\n";
    }

    return 0;
}
59+

0 commit comments

Comments
 (0)