|
| 1 | +# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich) |
| 2 | +# ReFrame Project Developers. See the top-level LICENSE file for details. |
| 3 | +# |
| 4 | +# SPDX-License-Identifier: BSD-3-Clause |
| 5 | + |
| 6 | +import reframe as rfm |
| 7 | +import reframe.utility.sanity as sn |
| 8 | + |
| 9 | + |
class MemBandwidthTest(rfm.RunOnlyRegressionTest):
    """Base class for likwid-bench memory-bandwidth checks.

    Launches a single ``likwid-bench`` task per node and passes sanity
    when a non-negative minimum bandwidth figure is found on stdout.
    Subclasses pick the kernel and the workgroup options.
    """

    modules = ['likwid']
    valid_prog_environs = ['PrgEnv-gnu']
    sourcesdir = None
    executable = 'likwid-bench'
    num_tasks = 1
    num_tasks_per_node = 1
    num_tasks_per_core = 2

    # Test each level at half capacity times nthreads per domain
    # FIXME: This should be adapted to use the topology autodetection features
    system_cache_sizes = {
        'daint:mc': {
            'L1': '288kB', 'L2': '2304kB', 'L3': '23MB', 'memory': '1800MB'
        },
        'daint:gpu': {
            'L1': '192kB', 'L2': '1536kB', 'L3': '15MB', 'memory': '1200MB'
        },
        'dom:mc': {
            'L1': '288kB', 'L2': '2304kB', 'L3': '23MB', 'memory': '1800MB'
        },
        'dom:gpu': {
            'L1': '192kB', 'L2': '1536kB', 'L3': '15MB', 'memory': '1200MB'
        },
    }
    maintainers = ['SK', 'CB']
    tags = {'benchmark', 'diagnostic', 'health'}

    @sanity_function
    def validate_test(self):
        # Keep the minimum over all reported workgroup bandwidths; the
        # performance stage reads it back through bandwidth() below.
        measured = sn.extractall(r'MByte/s:\s*(?P<bw>\S+)',
                                 self.stdout, 'bw', float)
        self.bw_pattern = sn.min(measured)
        return sn.assert_ge(self.bw_pattern, 0.0)

    @performance_function('MB/s')
    def bandwidth(self):
        # NOTE(review): relies on validate_test() having run already —
        # sanity precedes the performance stage in the ReFrame pipeline.
        return self.bw_pattern

    def set_processor_properties(self):
        # Derive the per-NUMA-domain thread count from the auto-detected
        # processor topology; skip the test when topology info is missing.
        self.skip_if_no_procinfo()
        proc = self.current_partition.processor
        self.num_cpus_per_task = proc.num_cpus
        numa_nodes = proc.topology['numa_nodes']
        self.numa_domains = [f'S{idx}' for idx in range(len(numa_nodes))]
        threads_per_domain = len(self.numa_domains) * self.num_tasks_per_core
        self.num_cpu_domain = self.num_cpus_per_task // threads_per_domain
| 57 | + |
| 58 | + |
@rfm.simple_test
class CPUBandwidth(MemBandwidthTest):
    """Per-socket bandwidth benchmark for each cache level and main memory."""

    # Every (memory level, likwid kernel) combination under test.
    # FIXME: This should be expressed in a better way
    config = parameter([
        ['L1', 'load_avx'], ['L1', 'store_avx'],
        ['L2', 'load_avx'], ['L2', 'store_avx'],
        ['L3', 'load_avx'], ['L3', 'store_avx'],
        ['memory', 'load_avx'],
        ['memory', 'store_mem_avx'],
    ])
    valid_systems = ['daint:mc', 'daint:gpu', 'dom:gpu', 'dom:mc']

    # the kernel to run in likwid
    kernel_name = variable(str)
    mem_level = variable(str)

    # Expected bandwidth (MB/s) per partition type, kernel and level.
    refs = {
        'mc': {
            'load_avx': {'L1': 5100000, 'L2': 2000000, 'L3': 900000,
                         'memory': 130000},
            'store_avx': {'L1': 2800000, 'L2': 900000, 'L3': 480000},
            'store_mem_avx': {'memory': 85000},
        },
        'gpu': {
            'load_avx': {'L1': 2100000, 'L2': 850000, 'L3': 360000,
                         'memory': 65000},
            'store_avx': {'L1': 1200000, 'L2': 340000, 'L3': 210000},
            'store_mem_avx': {'memory': 42500},
        }
    }

    @run_after('init')
    def setup_descr(self):
        # Unpack the test parameter into its two variables.
        self.mem_level, self.kernel_name = self.config
        self.descr = f'CPU <- {self.mem_level} {self.kernel_name} benchmark'

    @run_before('performance')
    def set_reference(self):
        # Resolve the expected figure for this kernel/level per node type,
        # then expand it to every supported system partition.
        ref_proxy = {part: self.refs[part][self.kernel_name][self.mem_level]
                     for part in self.refs}
        self.reference = {
            f'{system}:{part}': {
                'bandwidth': (ref_proxy[part], -0.1, None, 'MB/s')
            }
            for system in ('daint', 'dom')
            for part in ('gpu', 'mc')
        }

    @run_before('run')
    def set_exec_opts(self):
        self.set_processor_properties()
        partname = self.current_partition.fullname
        data_size = self.system_cache_sizes[partname][self.mem_level]
        # result for daint:mc: '-w S0:100MB:18:1:2 -w S1:100MB:18:1:2'
        # format: -w domain:data_size:nthreads:chunk_size:stride
        # chunk_size and stride affect which cpus from <domain> are selected
        workgroups = []
        for dom in self.numa_domains:
            workgroups.append(
                f'-w {dom}:{data_size}:{self.num_cpu_domain:d}:1:2'
            )
        self.executable_opts = [f'-t {self.kernel_name}'] + workgroups
| 120 | + |
| 121 | + |
@rfm.simple_test
class CPUBandwidthCrossSocket(MemBandwidthTest):
    """Cross-socket memory bandwidth benchmark.

    Each socket's CPUs read from the *other* socket's memory domain to
    measure remote (NUMA) memory bandwidth; only the first two NUMA
    domains take part.
    """

    descr = ('CPU S0 <- main memory S1 read '
             'CPU S1 <- main memory S0 read')
    valid_systems = ['daint:mc', 'dom:mc']
    # the kernel to run in likwid
    kernel_name = 'load_avx'
    reference = {
        'daint:mc': {
            'bandwidth': (56000, -0.1, None, 'MB/s')
        },
        'dom:mc': {
            'bandwidth': (56000, -0.1, None, 'MB/s')
        },
    }

    @run_before('run')
    def set_exec_opts(self):
        self.set_processor_properties()
        # daint:mc: '-w S0:100MB:18:1:2-0:S1 -w S1:100MB:18:1:2-0:S0'
        # format:
        # -w domain:data_size:nthreads:chunk_size:stride-stream_nr:mem_domain
        # chunk_size and stride affect which cpus from <domain> are selected
        workgroups = [
            f'-w {dom_cpu}:100MB:{self.num_cpu_domain:d}:1:2-0:{dom_mem}'
            for dom_cpu, dom_mem in
            zip(self.numa_domains[:2], reversed(self.numa_domains[:2]))
        ]
        # f-string for consistency with CPUBandwidth.set_exec_opts
        # (was dated %-formatting; produces the identical option string)
        self.executable_opts = [f'-t {self.kernel_name}'] + workgroups