Skip to content

Commit d5e7916

Browse files
authored
Merge branch 'master' into mpi_init_thread
2 parents 709a4fa + 0d8b785 commit d5e7916

File tree

31 files changed

+1659
-164
lines changed

31 files changed

+1659
-164
lines changed

.pep8speaks.yml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,12 @@ message:
66
header: "Hello @{name}, Thank you for updating!"
77
footer: "Do see the ReFrame [Coding Style Guide](https://github.com/eth-cscs/reframe/wiki/coding-style-guide)"
88
no_errors: "Cheers! There are no PEP8 issues in this Pull Request!"
9-
9+
1010
only_mention_files_with_errors: True
1111

1212
scanner:
1313
diff_only: True
14-
14+
1515
pycodestyle:
1616
max-line-length: 79
1717
ignore:
@@ -20,3 +20,7 @@ pycodestyle:
2020
- E226
2121
- E241
2222
- E272
23+
- E741
24+
- E742
25+
- E743
26+
- W504

Jenkinsfile

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ def loginBash = '#!/bin/bash -l'
55
def bashScript = 'ci-scripts/ci-runner.bash'
66
def machinesList = ['daint', 'dom', 'kesch']
77
def machinesToRun = machinesList
8+
def runTests = true
89
def uniqueID
910

1011
stage('Initialization') {
@@ -37,6 +38,11 @@ stage('Initialization') {
3738
currentBuild.result = 'SUCCESS'
3839
return
3940
}
41+
else if (splittedComment[2] == 'none') {
42+
runTests = false
43+
currentBuild.result = 'SUCCESS'
44+
return
45+
}
4046

4147
machinesRequested = []
4248
for (i = 2; i < splittedComment.size(); i++) {
@@ -66,6 +72,11 @@ stage('Initialization') {
6672
}
6773
}
6874

75+
if (!runTests) {
76+
println "Won't execute any test (${currentBuild.result}). Exiting..."
77+
return
78+
}
79+
6980
if (currentBuild.result != 'SUCCESS') {
7081
println "Initialization failed (${currentBuild.result}). Exiting..."
7182
return

README.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,3 +44,21 @@ The documentation is now up on [localhost:8000](http://localhost:8000), where yo
4444
In the `cscs-checks/` folder, you can find realistic regression tests used for the CSCS systems that you can reuse and adapt to your system.
4545
Notice that these tests are published as examples and may not run as-is in your system.
4646
However, they can serve as a very good starting point for implementing your system tests in ReFrame.
47+
48+
49+
## Contact
50+
51+
You can get in contact with the ReFrame community in the following ways:
52+
53+
### Mailing list
54+
55+
For keeping up with the latest news about ReFrame, posting questions and, generally, getting in touch with other users and the developers, you may follow the mailing list: [[email protected]](mailto:[email protected]).
56+
57+
Only subscribers may send messages to the list.
58+
To subscribe, please send an empty message to [[email protected]](mailto:[email protected]).
59+
60+
For unsubscribing, you may send an empty message to [[email protected]](mailto:[email protected]).
61+
62+
### Slack
63+
64+
You may also reach the community through Slack at [reframetalk.slack.com](https://reframetalk.slack.com/join/signup). Currently, you may join the Slack workspace by invitation only, which you will get as soon as you subscribe to the mailing list.
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import reframe as rfm
2+
import reframe.utility.sanity as sn
3+
4+
5+
@rfm.required_version('>=2.16-dev0')
@rfm.simple_test
class TensorFlowHorovodTest(rfm.RunOnlyRegressionTest):
    """Distributed TensorFlow training benchmark driven through Horovod.

    The test fetches the TensorFlow ``benchmarks`` repository, checks out
    the branch matching the TensorFlow release in use and runs
    ``tf_cnn_benchmarks.py`` with the ``inception3`` model.  The reported
    performance figure is the average of all ``total images/sec`` values
    found in the standard output.
    """

    def __init__(self):
        super().__init__()
        tf_release = '1.11'
        horovod_module = 'Horovod/0.15.0-CrayGNU-18.08-tf-%s.0' % tf_release
        benchmark_branch = 'cnn_tf_v%s_compatible' % tf_release

        self.descr = 'Distributed training with TensorFlow and Horovod'
        self.valid_systems = ['daint:gpu', 'dom:gpu']
        self.valid_prog_environs = ['PrgEnv-gnu']
        self.sourcesdir = 'https://github.com/tensorflow/benchmarks'
        self.modules = [horovod_module]

        # Reference throughput (images/s) for each supported partition.
        self.reference = {
            'dom:gpu': {
                'throughput': (1133.6, None, 0.05, 'images/s'),
            },
            'daint:gpu': {
                'throughput': (4403.0, None, 0.05, 'images/s'),
            },
        }

        # Every "total images/sec" occurrence is collected and averaged.
        throughput_samples = sn.extractall(
            r'total images/sec:\s+(?P<throughput>\S+)',
            self.stdout, 'throughput', float)
        self.perf_patterns = {'throughput': sn.avg(throughput_samples)}

        # The run is considered sane if the NCCL log reports that the
        # ipogif0 interface is used for sideband communication.
        self.sanity_patterns = sn.assert_found(
            r'[\S+\s+] INFO NET\/IB : Using interface ipogif0'
            r' for sideband communication', self.stdout)

        # One task per node; the total task count depends on the system.
        self.num_tasks_per_node = 1
        tasks_per_system = {'dom': 8, 'daint': 32}
        system_name = self.current_system.name
        if system_name in tasks_per_system:
            self.num_tasks = tasks_per_system[system_name]

        # Switch the fetched sources to the branch of the benchmarks
        # repository that corresponds to the TensorFlow release.
        self.pre_run = ['git checkout %s' % benchmark_branch]
        self.variables = {
            'NCCL_DEBUG': 'INFO',
            'NCCL_IB_HCA': 'ipogif0',
            'NCCL_IB_CUDA_SUPPORT': '1',
            'OMP_NUM_THREADS': '$SLURM_CPUS_PER_TASK',
        }
        self.executable = 'python'
        self.executable_opts = [
            'scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py',
            '--model inception3',
            '--batch_size 64',
            '--variable_update horovod',
            '--log_dir ./logs',
            '--train_dir ./checkpoints',
        ]
        self.tags = {'production'}
        self.maintainers = ['MS', 'RS']
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import reframe as rfm
2+
import reframe.utility.sanity as sn
3+
4+
5+
@rfm.simple_test
class HelloWorldHPXCheck(rfm.RunOnlyRegressionTest):
    """Run the HPX ``hello_world`` executable and validate its greetings.

    Every line of the form ``hello world from OS-thread <tid> on locality
    <lid>`` is collected from the standard output.  The test passes when
    the number of greetings equals ``num_tasks * num_cpus_per_task`` and
    every reported thread and locality id is below its respective limit.
    """

    def __init__(self):
        super().__init__()

        self.descr = 'HPX hello, world check'
        # Each partition must be a separate list entry; 'daint:gpu' and
        # 'daint:mc' were previously fused into one malformed string.
        self.valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc']
        self.valid_prog_environs = ['PrgEnv-gnu']

        self.modules = ['HPX']
        self.executable = 'hello_world'
        self.sourcesdir = None

        self.use_multithreading = None

        self.tags = {'production'}
        self.maintainers = ['VH', 'JG']

    def setup(self, partition, environ, **job_opts):
        """Configure the job layout and sanity checks for ``partition``.

        :arg partition: the partition the test is set up for; its
            ``fullname`` selects the number of CPUs per task.
        :arg environ: forwarded unchanged to the parent ``setup()``.
        :arg job_opts: forwarded unchanged to the parent ``setup()``.
        """
        hellos = sn.findall(r'hello world from OS-thread \s*(?P<tid>\d+) on '
                            r'locality (?P<lid>\d+)', self.stdout)

        # The GPU and multicore partitions of both systems share the same
        # task layout and differ only in the number of CPUs per task.
        if partition.fullname in ('daint:gpu', 'dom:gpu'):
            self.num_tasks = 2
            self.num_tasks_per_node = 1
            self.num_cpus_per_task = 12
        elif partition.fullname in ('daint:mc', 'dom:mc'):
            self.num_tasks = 2
            self.num_tasks_per_node = 1
            self.num_cpus_per_task = 36

        self.executable_opts = ['--hpx:threads=%s' % self.num_cpus_per_task]

        # https://stellar-group.github.io/hpx/docs/sphinx/branches/master/html/terminology.html#term-locality
        num_localities = self.num_tasks // self.num_tasks_per_node

        # One greeting is expected from every thread of every task.
        assert_num_tasks = sn.assert_eq(
            sn.count(hellos), self.num_tasks*self.num_cpus_per_task)
        # Thread ids must be smaller than the number of threads per task.
        assert_threads = sn.map(
            lambda x: sn.assert_lt(int(x.group('tid')),
                                   self.num_cpus_per_task),
            hellos)
        # Locality ids must be smaller than the number of localities.
        assert_localities = sn.map(
            lambda x: sn.assert_lt(int(x.group('lid')), num_localities),
            hellos)

        self.sanity_patterns = sn.all(sn.chain([assert_num_tasks],
                                               assert_threads,
                                               assert_localities))

        super().setup(partition, environ, **job_opts)
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
import reframe as rfm
2+
import reframe.utility.sanity as sn
3+
4+
5+
@rfm.simple_test
class Stencil4HPXCheck(rfm.RunOnlyRegressionTest):
    """Run the HPX ``1d_stencil_4`` executable as a single task.

    The executable is expected to print a comma-separated result line that
    is parsed into the fields ``tid``, ``time``, ``pts``, ``parts`` and
    ``steps``.  The ``time`` field is the performance figure; the other
    fields are compared against the requested thread count and the
    ``--nx``/``--np``/``--nt`` command line values.
    """

    def __init__(self):
        super().__init__()

        self.descr = 'HPX 1d_stencil_4 check'
        # Each partition must be a separate list entry; 'daint:gpu' and
        # 'daint:mc' were previously fused into one malformed string.
        self.valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc']
        self.valid_prog_environs = ['PrgEnv-gnu']

        self.modules = ['HPX']
        self.executable = '1d_stencil_4'

        # Kept as strings: they are passed on the command line and compared
        # verbatim against the text captured from the output.
        self.nt_opts = '100'  # number of time steps
        self.np_opts = '100'  # number of partitions
        self.nx_opts = '10000000'  # number of points per partition
        self.executable_opts = ['--nt', self.nt_opts,
                                '--np', self.np_opts,
                                '--nx', self.nx_opts]
        self.sourcesdir = None

        self.use_multithreading = None

        # The decimal point is escaped so that it only matches a literal
        # '.' instead of an arbitrary character.
        self.perf_patterns = {
            'time': sn.extractsingle(r'\d+,\s*(?P<time>(\d+)?\.?\d+),\s*\d+,'
                                     r'\s*\d+,\s*\d+',
                                     self.stdout, 'time', float)
        }
        self.reference = {
            'dom:gpu': {
                'time': (42, None, 0.1, 's')
            },
            'dom:mc': {
                'time': (30, None, 0.1, 's')
            },
            'daint:gpu': {
                'time': (42, None, 0.1, 's')
            },
            'daint:mc': {
                'time': (30, None, 0.1, 's')
            },
        }

        self.tags = {'production'}
        self.maintainers = ['VH', 'JG']

    def setup(self, partition, environ, **job_opts):
        """Configure the job layout and sanity checks for ``partition``.

        :arg partition: the partition the test is set up for; its
            ``fullname`` selects the number of CPUs per task.
        :arg environ: forwarded unchanged to the parent ``setup()``.
        :arg job_opts: forwarded unchanged to the parent ``setup()``.
        """
        result = sn.findall(r'(?P<tid>\d+),\s*(?P<time>(\d+)?\.?\d+),'
                            r'\s*(?P<pts>\d+),\s*(?P<parts>\d+),'
                            r'\s*(?P<steps>\d+)',
                            self.stdout)

        # The GPU and multicore partitions of both systems share the same
        # task layout and differ only in the number of CPUs per task.
        if partition.fullname in ('daint:gpu', 'dom:gpu'):
            self.num_tasks = 1
            self.num_tasks_per_node = 1
            self.num_cpus_per_task = 12
        elif partition.fullname in ('daint:mc', 'dom:mc'):
            self.num_tasks = 1
            self.num_tasks_per_node = 1
            self.num_cpus_per_task = 36

        self.executable_opts += ['--hpx:threads=%s' % self.num_cpus_per_task]

        # Every parsed result line must echo the requested configuration.
        assert_num_threads = sn.map(
            lambda x: sn.assert_eq(int(x.group('tid')),
                                   self.num_cpus_per_task),
            result)
        assert_num_points = sn.map(
            lambda x: sn.assert_eq(x.group('pts'), self.nx_opts), result)
        assert_num_parts = sn.map(
            lambda x: sn.assert_eq(x.group('parts'), self.np_opts), result)
        assert_num_steps = sn.map(
            lambda x: sn.assert_eq(x.group('steps'), self.nt_opts), result)

        self.sanity_patterns = sn.all(sn.chain(assert_num_threads,
                                               assert_num_points,
                                               assert_num_parts,
                                               assert_num_steps))

        super().setup(partition, environ, **job_opts)
90+
91+
92+
@rfm.simple_test
class Stencil8HPXCheck(rfm.RunOnlyRegressionTest):
    """Run the HPX ``1d_stencil_8`` executable on several tasks.

    The executable is expected to print a comma-separated result line that
    is parsed into the fields ``lid``, ``tid``, ``time``, ``pts``,
    ``parts`` and ``steps``.  The ``time`` field is the performance
    figure; the other fields are compared against the task count, the
    total thread count and the ``--nx``/``--np``/``--nt`` command line
    values.
    """

    def __init__(self):
        super().__init__()

        self.descr = 'HPX 1d_stencil_8 check'
        # Each partition must be a separate list entry; 'daint:gpu' and
        # 'daint:mc' were previously fused into one malformed string.
        self.valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc']
        self.valid_prog_environs = ['PrgEnv-gnu']

        self.modules = ['HPX']
        self.executable = '1d_stencil_8'

        # Kept as strings: they are passed on the command line and compared
        # verbatim against the text captured from the output.
        self.nt_opts = '100'  # number of time steps
        self.np_opts = '100'  # number of partitions
        self.nx_opts = '10000000'  # number of points per partition
        self.executable_opts = ['--nt', self.nt_opts,
                                '--np', self.np_opts,
                                '--nx', self.nx_opts]
        self.sourcesdir = None

        self.use_multithreading = None

        # The decimal point is escaped so that it only matches a literal
        # '.' instead of an arbitrary character.
        self.perf_patterns = {
            'time': sn.extractsingle(r'\d+,\s*\d+,\s*(?P<time>(\d+)?\.?\d+),'
                                     r'\s*\d+,\s*\d+,\s*\d+',
                                     self.stdout, 'time', float)
        }
        self.reference = {
            'dom:gpu': {
                'time': (26, None, 0.1, 's')
            },
            'dom:mc': {
                'time': (19, None, 0.1, 's')
            },
            'daint:gpu': {
                'time': (26, None, 0.1, 's')
            },
            'daint:mc': {
                'time': (19, None, 0.1, 's')
            },
        }

        self.tags = {'production'}
        self.maintainers = ['VH', 'JG']

    def setup(self, partition, environ, **job_opts):
        """Configure the job layout and sanity checks for ``partition``.

        :arg partition: the partition the test is set up for; its
            ``fullname`` selects the task and thread layout.
        :arg environ: forwarded unchanged to the parent ``setup()``.
        :arg job_opts: forwarded unchanged to the parent ``setup()``.
        """
        result = sn.findall(r'(?P<lid>\d+),\s*(?P<tid>\d+),'
                            r'\s*(?P<time>(\d+)?\.?\d+),'
                            r'\s*(?P<pts>\d+),'
                            r'\s*(?P<parts>\d+),'
                            r'\s*(?P<steps>\d+)', self.stdout)

        # GPU partitions: one 12-thread task per node.  Multicore
        # partitions: two 18-thread tasks per node, one per socket.
        if partition.fullname in ('daint:gpu', 'dom:gpu'):
            self.num_tasks = 2
            self.num_tasks_per_node = 1
            self.num_cpus_per_task = 12
        elif partition.fullname in ('daint:mc', 'dom:mc'):
            self.num_tasks = 4
            self.num_tasks_per_node = 2
            self.num_cpus_per_task = 18
            self.num_tasks_per_socket = 1

        self.executable_opts += ['--hpx:threads=%s' % self.num_cpus_per_task]

        # The thread count reported in the output is compared against the
        # total over all tasks.
        num_threads = self.num_tasks * self.num_cpus_per_task

        # Every parsed result line must echo the requested configuration.
        assert_num_tasks = sn.map(
            lambda x: sn.assert_eq(int(x.group('lid')), self.num_tasks),
            result)
        assert_num_threads = sn.map(
            lambda x: sn.assert_eq(int(x.group('tid')), num_threads),
            result)
        assert_num_points = sn.map(
            lambda x: sn.assert_eq(x.group('pts'), self.nx_opts), result)
        assert_num_parts = sn.map(
            lambda x: sn.assert_eq(x.group('parts'), self.np_opts), result)
        assert_num_steps = sn.map(
            lambda x: sn.assert_eq(x.group('steps'), self.nt_opts), result)

        self.sanity_patterns = sn.all(sn.chain(assert_num_tasks,
                                               assert_num_threads,
                                               assert_num_points,
                                               assert_num_parts,
                                               assert_num_steps))

        super().setup(partition, environ, **job_opts)

0 commit comments

Comments
 (0)