|
11 | 11 |
|
12 | 12 | @rfm.simple_test |
13 | 13 | class IPCMagicCheck(rfm.RunOnlyRegressionTest): |
14 | | - def __init__(self): |
15 | | - self.descr = 'Distributed training with TensorFlow using ipyparallel' |
16 | | - self.valid_systems = ['daint:gpu', 'dom:gpu'] |
17 | | - self.valid_prog_environs = ['PrgEnv-gnu'] |
18 | | - self.modules = [ |
19 | | - # FIXME: Use the default ipcmagic version when fixed |
20 | | - f'ipcmagic/0.1-CrayGNU-{osext.cray_cdt_version()}', |
21 | | - f'Horovod/0.21.0-CrayGNU-{osext.cray_cdt_version()}-tf-2.4.0' |
22 | | - ] |
23 | | - self.num_tasks = 2 |
24 | | - self.num_tasks_per_node = 1 |
25 | | - self.executable = 'ipython' |
26 | | - self.executable_opts = ['tf-hvd-sgd-ipc-tf2.py'] |
27 | | - nids = sn.extractall(r'nid(?P<nid>\d+)', |
28 | | - self.stdout, 'nid', str) |
29 | | - self.sanity_patterns = sn.all([ |
30 | | - sn.assert_ne(nids, []), |
31 | | - sn.assert_ne(nids[0], nids[1]) |
32 | | - ]) |
33 | | - self.reference = { |
34 | | - 'daint:gpu': { |
35 | | - 'slope': (2.0, -0.1, 0.1, None), |
36 | | - 'offset': (0.0, -0.1, 0.1, None), |
37 | | - 'retries': (0, None, None, None), |
38 | | - 'time': (10, None, None, 's'), |
39 | | - }, |
40 | | - 'dom:gpu': { |
41 | | - 'slope': (2.0, -0.1, 0.1, None), |
42 | | - 'offset': (0.0, -0.1, 0.1, None), |
43 | | - 'retries': (0, None, None, None), |
44 | | - 'time': (10, None, None, 's'), |
45 | | - } |
46 | | - } |
47 | | - self.perf_patterns = { |
48 | | - 'slope': sn.extractsingle(r'slope=(?P<slope>\S+)', |
49 | | - self.stdout, 'slope', float), |
50 | | - 'offset': sn.extractsingle(r'offset=(?P<offset>\S+)', |
51 | | - self.stdout, 'offset', float), |
52 | | - 'retries': 4 - sn.count(sn.findall(r'IPCluster is already running', |
53 | | - self.stdout)), |
54 | | - 'time': sn.extractsingle(r'IPCluster is ready\!\s+' |
55 | | - r'\((?P<time>\d+) seconds\)', |
56 | | - self.stdout, 'time', float) |
| 14 | + descr = 'Distributed training with TensorFlow using ipyparallel' |
| 15 | + valid_systems = ['daint:gpu', 'dom:gpu'] |
| 16 | + valid_prog_environs = ['PrgEnv-gnu'] |
| 17 | + modules = [ |
| 18 | + f'ipcmagic', f'jupyterlab', |
| 19 | + f'Horovod/0.21.0-CrayGNU-{osext.cray_cdt_version()}-tf-2.4.0' |
| 20 | + ] |
| 21 | + num_tasks = 2 |
| 22 | + num_tasks_per_node = 1 |
| 23 | + executable = 'ipython' |
| 24 | + executable_opts = ['tf-hvd-sgd-ipc-tf2.py'] |
| 25 | + reference = { |
| 26 | + 'daint:gpu': { |
| 27 | + 'slope': (2.0, -0.1, 0.1, 'N/A'), |
| 28 | + 'offset': (0.0, -0.1, 0.1, 'N/A'), |
| 29 | + 'retries': (0, None, None, 'N/A'), |
| 30 | + 'time': (10, None, None, 's'), |
| 31 | + }, |
| 32 | + 'dom:gpu': { |
| 33 | + 'slope': (2.0, -0.1, 0.1, 'N/A'), |
| 34 | + 'offset': (0.0, -0.1, 0.1, 'N/A'), |
| 35 | + 'retries': (0, None, None, 'N/A'), |
| 36 | + 'time': (10, None, None, 's'), |
57 | 37 | } |
58 | | - self.maintainers = ['RS', 'TR'] |
59 | | - self.tags = {'production'} |
| 38 | + } |
| 39 | + |
| 40 | + maintainers = ['RS', 'TR'] |
| 41 | + tags = {'production'} |
| 42 | + |
| 43 | + @run_after('setup') |
| 44 | + def daint_module_workaround(self): |
| 45 | + if self.current_system.name == 'daint': |
| 46 | + # FIXME: Use the default modules once Dom/Daint are aligned |
| 47 | + self.modules = [ |
| 48 | + f'ipcmagic/1.0.1-CrayGNU-{osext.cray_cdt_version()}', |
| 49 | + f'Horovod/0.21.0-CrayGNU-{osext.cray_cdt_version()}-tf-2.4.0' |
| 50 | + ] |
| 51 | + # FIXME: Enforce loading of jupyterlab module since |
| 52 | + # `module show jupyterlab` throws a Tcl error on Daint |
| 53 | + self.prerun_cmds = ['module load jupyterlab'] |
| 54 | + |
| 55 | + @sanity_function |
| 56 | + def assert_successful_execution(self): |
| 57 | + nids = sn.extractall(r'nid(?P<nid>\d+)', self.stdout, 'nid', str) |
| 58 | + return sn.all([ |
| 59 | + sn.assert_ne(nids, []), sn.assert_ne(nids[0], nids[1]), |
| 60 | + sn.assert_found(r'IPCluster is ready\!\s+', self.stdout), |
| 61 | + sn.assert_found(r'slope=\S+', self.stdout) |
| 62 | + ]) |
| 63 | + |
| 64 | + @performance_function('N/A') |
| 65 | + def slope(self): |
| 66 | + return sn.extractsingle(r'slope=(?P<slope>\S+)', self.stdout, |
| 67 | + 'slope', float) |
| 68 | + |
| 69 | + @performance_function('N/A') |
| 70 | + def offset(self): |
| 71 | + return sn.extractsingle(r'offset=(?P<offset>\S+)', self.stdout, |
| 72 | + 'offset', float) |
| 73 | + |
| 74 | + @performance_function('N/A') |
| 75 | + def retries(self): |
| 76 | + return 4 - sn.count(sn.findall(r'IPCluster is already running', |
| 77 | + self.stdout)) |
| 78 | + |
| 79 | + @performance_function('s') |
| 80 | + def time(self): |
| 81 | + return sn.extractsingle(r'IPCluster is ready\!\s+' |
| 82 | + r'\((?P<time>\d+) seconds\)', |
| 83 | + self.stdout, 'time', float) |
60 | 84 |
|
61 | 85 | @run_before('run') |
62 | | - def prepare_run(self): |
| 86 | + def reset_launcher(self): |
63 | 87 | # Change the job launcher since `ipython` |
64 | 88 | # needs to be launched without `srun`. |
65 | 89 | self.job.launcher = getlauncher('local')() |
0 commit comments