Skip to content

Commit f8c58ec

Browse files
author
Vasileios Karakasis
authored
Merge branch 'master' into feat/fixtures
2 parents 7aa9aea + d6c9a31 commit f8c58ec

File tree

5 files changed

+88
-70
lines changed

5 files changed

+88
-70
lines changed

cscs-checks/apps/jupyter/check_ipcmagic.py

Lines changed: 7 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -4,86 +4,26 @@
44
# SPDX-License-Identifier: BSD-3-Clause
55

66
import reframe as rfm
7-
import reframe.utility.osext as osext
8-
import reframe.utility.sanity as sn
9-
from reframe.core.backends import getlauncher
7+
8+
from hpctestlib.apps.jupyter.ipcmagic import ipcmagic_check
109

1110

1211
@rfm.simple_test
13-
class IPCMagicCheck(rfm.RunOnlyRegressionTest):
14-
descr = 'Distributed training with TensorFlow using ipyparallel'
12+
class cscs_ipcmagic_check(ipcmagic_check):
1513
valid_systems = ['daint:gpu', 'dom:gpu']
16-
valid_prog_environs = ['PrgEnv-gnu']
17-
modules = [
18-
f'ipcmagic', f'jupyterlab',
19-
f'Horovod/0.21.0-CrayGNU-{osext.cray_cdt_version()}-tf-2.4.0'
20-
]
21-
num_tasks = 2
22-
num_tasks_per_node = 1
23-
executable = 'ipython'
24-
executable_opts = ['tf-hvd-sgd-ipc-tf2.py']
14+
valid_prog_environs = ['builtin']
15+
modules = ['jupyterlab', 'Horovod']
16+
maintainers = ['RS', 'TR']
17+
tags = {'production'}
2518
reference = {
2619
'daint:gpu': {
2720
'slope': (2.0, -0.1, 0.1, 'N/A'),
2821
'offset': (0.0, -0.1, 0.1, 'N/A'),
2922
'retries': (0, None, None, 'N/A'),
30-
'time': (10, None, None, 's'),
3123
},
3224
'dom:gpu': {
3325
'slope': (2.0, -0.1, 0.1, 'N/A'),
3426
'offset': (0.0, -0.1, 0.1, 'N/A'),
3527
'retries': (0, None, None, 'N/A'),
36-
'time': (10, None, None, 's'),
3728
}
3829
}
39-
40-
maintainers = ['RS', 'TR']
41-
tags = {'production'}
42-
43-
@run_after('setup')
44-
def daint_module_workaround(self):
45-
if self.current_system.name == 'daint':
46-
# FIXME: Use the default modules once Dom/Daint are aligned
47-
self.modules = [
48-
f'ipcmagic/1.0.1-CrayGNU-{osext.cray_cdt_version()}',
49-
f'Horovod/0.21.0-CrayGNU-{osext.cray_cdt_version()}-tf-2.4.0'
50-
]
51-
# FIXME: Enforce loading of jupyterlab module since
52-
# `module show jupyterlab` throws a Tcl error on Daint
53-
self.prerun_cmds = ['module load jupyterlab']
54-
55-
@sanity_function
56-
def assert_successful_execution(self):
57-
nids = sn.extractall(r'nid(?P<nid>\d+)', self.stdout, 'nid', str)
58-
return sn.all([
59-
sn.assert_ne(nids, []), sn.assert_ne(nids[0], nids[1]),
60-
sn.assert_found(r'IPCluster is ready\!\s+', self.stdout),
61-
sn.assert_found(r'slope=\S+', self.stdout)
62-
])
63-
64-
@performance_function('N/A')
65-
def slope(self):
66-
return sn.extractsingle(r'slope=(?P<slope>\S+)', self.stdout,
67-
'slope', float)
68-
69-
@performance_function('N/A')
70-
def offset(self):
71-
return sn.extractsingle(r'offset=(?P<offset>\S+)', self.stdout,
72-
'offset', float)
73-
74-
@performance_function('N/A')
75-
def retries(self):
76-
return 4 - sn.count(sn.findall(r'IPCluster is already running',
77-
self.stdout))
78-
79-
@performance_function('s')
80-
def time(self):
81-
return sn.extractsingle(r'IPCluster is ready\!\s+'
82-
r'\((?P<time>\d+) seconds\)',
83-
self.stdout, 'time', float)
84-
85-
@run_before('run')
86-
def reset_launcher(self):
87-
# Change the job launcher since `ipython`
88-
# needs to be launched without `srun`.
89-
self.job.launcher = getlauncher('local')()

docs/hpctestlib.rst

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1-
ReFrame Test Library
2-
====================
1+
ReFrame Test Library (experimental)
2+
===================================
3+
4+
This is a collection of generic tests that you can either run out of the box, by specializing them for your system with the :option:`-S` option, or use as a basis for your own site-specific tests.
35

46

57
Scientific Applications
@@ -16,3 +18,11 @@ Python
1618
.. automodule:: hpctestlib.python.numpy.numpy_ops
1719
:members:
1820
:show-inheritance:
21+
22+
23+
Interactive Computing
24+
-----------------------
25+
26+
.. automodule:: hpctestlib.apps.jupyter.ipcmagic
27+
:members:
28+
:show-inheritance:
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
2+
# ReFrame Project Developers. See the top-level LICENSE file for details.
3+
#
4+
# SPDX-License-Identifier: BSD-3-Clause
5+
6+
import reframe as rfm
7+
import reframe.utility.sanity as sn
8+
9+
from reframe.core.backends import getlauncher
10+
11+
12+
@rfm.simple_test
13+
class ipcmagic_check(rfm.RunOnlyRegressionTest, pin_prefix=True):
14+
'''Test ipcmagic via a distributed TensorFlow training with ipyparallel.
15+
16+
`ipcmagic <https://github.com/eth-cscs/ipcluster_magic>`__ is a Python
17+
package and collection of CLI scripts for controlling clusters for
18+
Jupyter. For more information, please have a look
19+
`here <https://user.cscs.ch/tools/interactive/jupyterlab/#ipython>`__.
20+
21+
This test checks the ipcmagic performance.
22+
To do this, a single-layer neural network is trained against a noisy linear
23+
function. The parameters of the fitted linear function are returned at the
24+
end along with the resulting loss function. The default assumption is that
25+
ipcmagic is already installed on the system under test.
26+
27+
'''
28+
29+
executable = 'ipython'
30+
executable_opts = ['tf-hvd-sgd-ipc-tf2.py']
31+
num_tasks = 2
32+
num_tasks_per_node = 1
33+
descr = 'Distributed training with TensorFlow using ipyparallel'
34+
35+
@performance_function('N/A')
36+
def fitted_line_slope(self):
37+
return sn.extractsingle(r'slope=(?P<slope>\S+)',
38+
self.stdout, 'slope', float)
39+
40+
@performance_function('N/A')
41+
def fitted_line_offset(self):
42+
return sn.extractsingle(r'offset=(?P<offset>\S+)',
43+
self.stdout, 'offset', float)
44+
45+
@performance_function('N/A')
46+
def retries(self):
47+
return 4 - sn.count(sn.findall(r'IPCluster is already running',
48+
self.stdout))
49+
50+
@run_before('run')
51+
def reset_launcher(self):
52+
# Change the job launcher since `ipython`
53+
# needs to be launched without `srun`.
54+
self.job.launcher = getlauncher('local')()
55+
56+
@sanity_function
57+
def assert_successful_execution(self):
58+
'''Checks that the program is running on 2 different nodes (nids
59+
are different), that IPCMagic is configured and returns the correct
60+
end-of-program message (returns the slope parameter at the end).'''
61+
62+
nids = sn.extractall(r'nid(?P<nid>\d+)', self.stdout, 'nid', str)
63+
return sn.all([
64+
sn.assert_eq(sn.len(nids), 2),
65+
sn.assert_ne(nids[0], nids[1]),
66+
sn.assert_found(r'slope=\S+', self.stdout)
67+
])

reframe/core/pipeline.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1420,7 +1420,8 @@ def _clone_to_stagedir(self, url):
14201420
self.logger.debug(f'Cloning URL {url} into stage directory')
14211421
osext.git_clone(
14221422
self.sourcesdir, self._stagedir,
1423-
timeout=rt.runtime().get_option('general/0/git_timeout')
1423+
# FIXME: cast to float explicitly due to GH #2246
1424+
timeout=float(rt.runtime().get_option('general/0/git_timeout'))
14241425
)
14251426

14261427
@final

0 commit comments

Comments
 (0)