Skip to content

Commit 8f5f123

Browse files
author
Vasileios Karakasis
authored
Merge branch 'master' into feature/emit-image-name-gen-ci
2 parents 4f8a351 + 5dffa31 commit 8f5f123

File tree

11 files changed

+302
-152
lines changed

11 files changed

+302
-152
lines changed

cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py

Lines changed: 59 additions & 131 deletions
Original file line numberDiff line numberDiff line change
@@ -7,138 +7,66 @@
77
import reframe.utility.sanity as sn
88

99

10-
class StridedBase(rfm.RegressionTest):
11-
def __init__(self):
12-
self.sourcepath = 'strides.cpp'
13-
self.build_system = 'SingleSource'
14-
self.valid_systems = ['daint:gpu', 'dom:gpu', 'daint:mc', 'dom:mc',
15-
'eiger:mc', 'pilatus:mc']
16-
self.valid_prog_environs = ['PrgEnv-gnu']
17-
self.num_tasks = 1
18-
self.num_tasks_per_node = 1
19-
20-
self.sanity_patterns = sn.assert_eq(
21-
sn.count(sn.findall(r'bandwidth', self.stdout)),
22-
self.num_tasks_assigned)
23-
24-
self.perf_patterns = {
25-
'bandwidth': sn.extractsingle(
26-
r'bandwidth: (?P<bw>\S+) GB/s',
27-
self.stdout, 'bw', float)
28-
}
29-
30-
self.system_num_cpus = {
31-
'daint:mc': 72,
32-
'daint:gpu': 24,
33-
'dom:mc': 72,
34-
'dom:gpu': 24,
35-
'eiger:mc': 128,
36-
'pilatus:mc': 128
37-
}
38-
39-
self.maintainers = ['SK']
40-
self.tags = {'benchmark', 'diagnostic'}
41-
42-
@property
43-
@sn.sanity_function
44-
def num_tasks_assigned(self):
45-
return self.job.num_tasks
46-
47-
4810
@rfm.simple_test
class StridedBandwidthTest(rfm.RegressionTest):
    '''Strided memory-bandwidth microbenchmark.

    Builds and runs ``strides.cpp`` with a stride of 8, 64 or 128 bytes
    and compares the reported bandwidth against per-architecture
    reference values.
    '''

    sourcepath = 'strides.cpp'
    build_system = 'SingleSource'
    valid_systems = ['daint:gpu', 'dom:gpu', 'daint:mc', 'dom:mc',
                     'eiger:mc', 'pilatus:mc']
    valid_prog_environs = ['PrgEnv-gnu']
    num_tasks = 1
    num_tasks_per_node = 1
    maintainers = ['SK']
    tags = {'benchmark', 'diagnostic'}

    # Stride between consecutive accesses, in bytes; 8 bytes uses the
    # full cacheline, 64/128 touch only a fraction of each line.
    stride_bytes = parameter([8, 64, 128])

    # Reference bandwidth per stride and per processor architecture,
    # as (value, lower_threshold, upper_threshold, unit) tuples.
    reference_bw = {
        8: {
            'haswell': (50, -0.1, 0.1, 'GB/s'),
            'broadwell': (100, -0.1, 0.1, 'GB/s'),
            'zen2': (270, -0.1, 0.1, 'GB/s')
        },
        64: {
            'haswell': (6, -0.1, 0.2, 'GB/s'),
            'broadwell': (12.5, -0.1, 0.2, 'GB/s'),
            'zen2': (33, -0.1, 0.2, 'GB/s')
        },
        128: {
            'haswell': (4.5, -0.1, 0.2, 'GB/s'),
            'broadwell': (9.1, -0.1, 0.2, 'GB/s'),
            'zen2': (33, -0.1, 0.2, 'GB/s')
        },
    }

    @run_after('setup')
    def skip_if_no_topo(self):
        '''Skip when the partition carries no processor topology info.'''
        part = self.current_partition
        if not part.processor.info:
            self.skip(f'no topology information found for partition '
                      f'{part.fullname!r}')

    @sanity_function
    def assert_num_tasks(self):
        '''Every task must have reported a ``bandwidth`` line.'''
        num_reports = sn.count(sn.findall(r'bandwidth', self.stdout))
        return sn.assert_eq(num_reports, self.num_tasks)

    @performance_function('GB/s')
    def bandwidth(self):
        '''Extract the measured bandwidth from the benchmark output.'''
        return sn.extractsingle(r'bandwidth: (?P<bw>\S+) GB/s',
                                self.stdout, 'bw', float)

    @run_before('run')
    def set_exec_opts(self):
        '''Pass array size, stride (in 8-byte words) and CPU count.'''
        proc = self.current_partition.processor
        stride_words = self.stride_bytes // 8
        self.executable_opts = ['100000000', str(stride_words),
                                str(proc.num_cpus)]

    @run_before('performance')
    def set_reference(self):
        '''Select the reference matching the current processor arch.

        Architectures without an entry in ``reference_bw`` simply run
        without a performance reference.
        '''
        arch = self.current_partition.processor.arch
        ref = self.reference_bw[self.stride_bytes].get(arch)
        if ref is not None:
            self.reference = {'*': {'bandwidth': ref}}

docs/config_reference.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,7 @@ System Partition Configuration
218218
If not, you should consider using the ``squeue`` backend below.
219219
- ``squeue``: Jobs will be launched using the `Slurm <https://www.schedmd.com/>`__ scheduler.
220220
This backend does not rely on job accounting to retrieve job statuses, but ReFrame does its best to query the job state as reliably as possible.
221+
- ``lsf``: Jobs will be launched using the `LSF <https://www.ibm.com/docs/en/spectrum-lsf/>`__ scheduler.
221222

222223
.. versionadded:: 3.7.2
223224
Support for the SGE scheduler is added.
@@ -276,6 +277,8 @@ System Partition Configuration
276277
This is a custom parallel program launcher used at `TACC <https://portal.tacc.utexas.edu/user-guides/stampede2>`__.
277278
- ``local``: No parallel program launcher will be used.
278279
The program will be launched locally.
280+
- ``lrun``: Parallel programs will be launched using `LC Launcher <https://hpc.llnl.gov/training/tutorials/using-lcs-sierra-system#lrun>`__'s ``lrun`` command.
281+
- ``lrun-gpu``: Parallel programs will be launched using `LC Launcher <https://hpc.llnl.gov/training/tutorials/using-lcs-sierra-system#lrun>`__'s ``lrun -M "-gpu"`` command that enables the CUDA-aware Spectrum MPI.
279282
- ``mpirun``: Parallel programs will be launched using the ``mpirun`` command.
280283
- ``mpiexec``: Parallel programs will be launched using the ``mpiexec`` command.
281284
- ``srun``: Parallel programs will be launched using `Slurm <https://slurm.schedmd.com/srun.html>`__'s ``srun`` command.

reframe/core/backends.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@
2020
'reframe.core.schedulers.local',
2121
'reframe.core.schedulers.slurm',
2222
'reframe.core.schedulers.pbs',
23-
'reframe.core.schedulers.sge'
23+
'reframe.core.schedulers.sge',
24+
'reframe.core.schedulers.lsf'
2425
]
2526
_schedulers = {}
2627

reframe/core/launchers/mpi.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,3 +124,22 @@ def command(self, job):
124124
ret.append(opt)
125125

126126
return ret
127+
128+
129+
@register_launcher('lrun')
class LrunLauncher(JobLauncher):
    '''LLNL's custom parallel job launcher (``lrun``).'''

    def command(self, job):
        '''Build the ``lrun`` command line for *job*.

        Derives the node count from the total number of tasks and the
        tasks per node (defaulting to one task per node).
        '''
        tasks_per_node = job.num_tasks_per_node or 1
        num_nodes = job.num_tasks // tasks_per_node
        return ['lrun', '-N', str(num_nodes), '-T', str(tasks_per_node)]


@register_launcher('lrun-gpu')
class LrungpuLauncher(LrunLauncher):
    '''``lrun`` variant enabling the CUDA-aware Spectrum MPI.

    Appends ``-M "-gpu"`` to the plain ``lrun`` command line.
    '''

    def command(self, job):
        return super().command(job) + ['-M "-gpu"']

reframe/core/schedulers/lsf.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
# ReFrame Project Developers. See the top-level LICENSE file for details.
#
# SPDX-License-Identifier: BSD-3-Clause

#
# LSF backend
#
# - Initial version submitted by Ryan Goodner, UNM (based on PBS backend)
#

import functools
import re
import time

import reframe.core.runtime as rt
import reframe.utility.osext as osext
from reframe.core.backends import register_scheduler
from reframe.core.exceptions import JobSchedulerError
from reframe.core.schedulers.pbs import PbsJobScheduler

_run_strict = functools.partial(osext.run_command, check=True)


@register_scheduler('lsf')
class LsfJobScheduler(PbsJobScheduler):
    '''Job scheduler backend for IBM Spectrum LSF.

    Jobs are submitted with ``bsub`` and polled with ``bjobs``; the
    generated submission script uses ``#BSUB`` directives.
    '''

    # Matches one `bjobs -noheader` output line: jobid, user, status,
    # queue.  Compiled once here instead of per polled line.
    _bjobs_line = re.compile(r'(?P<jobid>\d+)\s+'
                             r'(?P<user>\S+)\s+'
                             r'(?P<status>\S+)\s+'
                             r'(?P<queue>\S+)')

    def __init__(self):
        # NOTE(review): does not call super().__init__(); assumes the
        # PBS base initializer sets nothing else we rely on -- confirm.
        self._prefix = '#BSUB'
        self._submit_timeout = rt.runtime().get_option(
            f'schedulers/@{self.registered_name}/job_submit_timeout'
        )

    def emit_preamble(self, job):
        '''Return the ``#BSUB`` preamble lines for *job*'s script.'''
        num_tasks_per_node = job.num_tasks_per_node or 1
        num_nodes = job.num_tasks // num_tasks_per_node

        preamble = [
            self._format_option(f'-J {job.name}'),
            self._format_option(f'-o {job.stdout}'),
            self._format_option(f'-e {job.stderr}'),
            self._format_option(f'-nnodes {num_nodes}')
        ]

        # LSF expects the time limit in minutes
        if job.time_limit is not None:
            preamble.append(
                self._format_option(f'-W {int(job.time_limit // 60)}')
            )

        # Emit the rest of the options; anything already starting with
        # '#' is passed through verbatim.
        options = job.options + job.cli_options
        for opt in options:
            if opt.startswith('#'):
                preamble.append(opt)
            else:
                preamble.append(self._format_option(opt))

        # Change to the working directory before running the job
        preamble.append(f'cd {job.workdir}')

        return preamble

    def submit(self, job):
        '''Submit *job* with ``bsub`` and record its job id.

        :raises JobSchedulerError: if the job id cannot be parsed from
            the ``bsub`` output.
        '''
        cmd = f'bsub {job.script_filename}'
        completed = _run_strict(cmd, timeout=self._submit_timeout)
        jobid_match = re.search(r'^Job <(?P<jobid>\S+)> is submitted',
                                completed.stdout)
        if not jobid_match:
            raise JobSchedulerError('could not retrieve the job id '
                                    'of the submitted job')

        job._jobid = jobid_match.group('jobid')
        job._submit_time = time.time()

    def poll(self, *jobs):
        '''Update the state of *jobs* from a single ``bjobs`` query.'''
        if jobs:
            # filter out non-jobs
            jobs = [job for job in jobs if job is not None]

        if not jobs:
            return

        completed = _run_strict(
            f'bjobs -noheader {" ".join(job.jobid for job in jobs)}'
        )

        # Map each reported job id to its LSF status string
        job_status = {}
        for line in completed.stdout.split('\n'):
            job_match = self._bjobs_line.search(line)
            if job_match:
                job_status[job_match['jobid']] = job_match['status']

        for job in jobs:
            status = job_status.get(job.jobid)
            if status is None:
                # Job id no longer reported by the scheduler
                self.log(f'Job {job.jobid} not known to scheduler, '
                         f'assuming job completed')
                job._state = 'COMPLETED'
                job._completed = True
            elif status in ('DONE', 'EXIT'):
                job._state = 'COMPLETED'
                job._completed = True
            elif status == 'RUN':
                job._state = 'RUNNING'
            elif status == 'PEND':
                job._state = 'PENDING'
            elif status in ('PSUSP', 'SSUSP', 'USUSP'):
                job._state = 'SUSPENDED'
            else:
                # Unrecognized LSF status; treat conservatively as done.
                # (Message fixed: the original printed the raw status as
                # if it were a job name.)
                self.log(f'Job state {status!r} not known, '
                         f'assuming job completed')
                job._state = 'COMPLETED'
                job._completed = True

    def finished(self, job):
        '''Return ``True`` if *job* has reached a terminal state.'''
        if job.exception:
            raise job.exception

        return job.state == 'COMPLETED'

reframe/core/schedulers/registry.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,3 +40,4 @@ def getscheduler(name):
4040
import reframe.core.schedulers.pbs # noqa: F401, F403
4141
import reframe.core.schedulers.sge # noqa: F401, F403
4242
import reframe.core.schedulers.torque # noqa: F401, F403
43+
import reframe.core.schedulers.lsf # noqa: F401, F403

0 commit comments

Comments
 (0)