import reframe as rfm
import reframe.utility.sanity as sn


@rfm.required_version('>=2.16-dev0')
@rfm.simple_test
class HPCGCheckRef(rfm.RegressionTest):
    def __init__(self):
        super().__init__()

        self.descr = 'HPCG reference benchmark'
        self.valid_systems = ['daint:mc', 'daint:gpu', 'dom:gpu', 'dom:mc']
        self.valid_prog_environs = ['PrgEnv-gnu']
        self.modules = ['craype-hugepages8M']
        self.build_system = 'Make'
        self.build_system.options = ['arch=MPI_GCC_OMP']
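        # sourcesdir may point to a Git repository; ReFrame then fetches
        # the sources by cloning it into the test's stage directory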
        self.sourcesdir = 'https://github.com/hpcg-benchmark/hpcg.git'

        self.executable = 'bin/xhpcg'
        self.executable_opts = ['--nx=104', '--ny=104', '--nz=104', '-t2']
        # the output file name carries a suffix that depends on the
        # execution time, so pick it up with a glob
        self.output_file = sn.getitem(sn.glob('HPCG*.txt'), 0)

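        # num_tasks=0 makes this a flexible test: ReFrame sizes the job
        # to the nodes that are actually available at submission time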
        self.num_tasks = 0
        self.num_cpus_per_task = 1
        self.system_num_tasks = {
            'daint:mc': 36,
            'daint:gpu': 12,
            'dom:mc': 36,
            'dom:gpu': 12,
        }

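        # reference tuples are (value, lower_threshold, upper_threshold,
        # unit); -0.1 tolerates results up to 10% below the reference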
        self.reference = {
            'daint:gpu': {
                'gflops': (7.6, -0.1, None, 'Gflop/s')
            },
            'daint:mc': {
                'gflops': (13.4, -0.1, None, 'Gflop/s')
            },
            'dom:gpu': {
                'gflops': (7.6, -0.1, None, 'Gflop/s')
            },
            'dom:mc': {
                'gflops': (13.4, -0.1, None, 'Gflop/s')
            },
        }

        self.maintainers = ['SK']
        self.tags = {'diagnostic'}

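    # @sn.sanity_function defers evaluation: the number of tasks is only
    # known after the job has been submitted, so it must be read lazily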
    @property
    @sn.sanity_function
    def num_tasks_assigned(self):
        return self.job.num_tasks

    def setup(self, partition, environ, **job_opts):
        self.num_tasks_per_node = self.system_num_tasks[partition.fullname]

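        # since this is a flexible test, we divide the extracted
        # performance by the number of nodes and compare
        # against a single reference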
        num_nodes = self.num_tasks_assigned / self.num_tasks_per_node
        self.perf_patterns = {
            'gflops': sn.extractsingle(
                r'HPCG result is VALID with a GFLOP\/s rating of=\s*'
                r'(?P<perf>\S+)',
                self.output_file, 'perf', float) / num_nodes
        }

        self.sanity_patterns = sn.all([
            sn.assert_eq(4, sn.count(
                sn.findall(r'PASSED', self.output_file))),
            sn.assert_eq(0, self.num_tasks_assigned % self.num_tasks_per_node)
        ])

        super().setup(partition, environ, **job_opts)


@rfm.required_version('>=2.16-dev0')
@rfm.simple_test
class HPCGCheckMKL(rfm.RegressionTest):
    def __init__(self):
        super().__init__()

        self.descr = 'HPCG benchmark Intel MKL implementation'
        self.valid_systems = ['daint:mc', 'dom:mc', 'daint:gpu', 'dom:gpu']
        self.valid_prog_environs = ['PrgEnv-intel']
        self.modules = ['craype-hugepages8M']
        self.build_system = 'Make'
        self.prebuild_cmd = ['cp -r ${MKLROOT}/benchmarks/hpcg/* .',
                             'mv Make.CrayXC setup',
                             './configure CrayXC']

        self.num_tasks = 0
        self.num_tasks_per_core = 2
        self.problem_size = 104

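        # KMP_HW_SUBSET=9c,2t binds each rank to 9 cores with 2 hardware
        # threads per core; HUGETLB_VERBOSE=0 silences hugepage warnings
        # and MPICH_MAX_THREAD_SAFETY enables MPI_THREAD_MULTIPLE support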
        self.variables = {
            'HUGETLB_VERBOSE': '0',
            'MPICH_MAX_THREAD_SAFETY': 'multiple',
            'MPICH_USE_DMAPP_COLL': '1',
            'PMI_NO_FORK': '1',
            'KMP_HW_SUBSET': '9c,2t',
            'KMP_AFFINITY': 'granularity=fine,compact'
        }

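        # MKL ships prebuilt HPCG binaries; xhpcg_avx2 targets the
        # AVX2-capable Haswell/Broadwell CPUs of these partitions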
        self.executable = 'bin/xhpcg_avx2'
        self.executable_opts = ['--nx=%d' % self.problem_size,
                                '--ny=%d' % self.problem_size,
                                '--nz=%d' % self.problem_size, '-t2']

        self.reference = {
            'dom:mc': {
                'gflops': (22, -0.1, None, 'Gflop/s')
            },
            'daint:mc': {
                'gflops': (22, -0.1, None, 'Gflop/s')
            },
            'dom:gpu': {
                'gflops': (10.7, -0.1, None, 'Gflop/s')
            },
            'daint:gpu': {
                'gflops': (10.7, -0.1, None, 'Gflop/s')
            },
        }

        self.maintainers = ['SK']
        self.tags = {'diagnostic'}

    @property
    @sn.sanity_function
    def num_tasks_assigned(self):
        return self.job.num_tasks

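    # the MKL binary names its output file after the problem size and the
    # rank/thread counts, which are only known at run time, so the file
    # name must be computed lazily as well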
    @property
    @sn.sanity_function
    def outfile_lazy(self):
        pattern = 'n%d-%dp-%dt-*.yaml' % (self.problem_size,
                                          self.job.num_tasks,
                                          self.num_cpus_per_task)
        return sn.getitem(sn.glob(pattern), 0)

    def setup(self, partition, environ, **job_opts):
        if partition.fullname in ['daint:gpu', 'dom:gpu']:
            self.num_tasks_per_node = 2
            self.num_cpus_per_task = 12
        else:
            self.num_tasks_per_node = 4
            self.num_cpus_per_task = 18

        # since this is a flexible test, we divide the extracted
        # performance by the number of nodes and compare
        # against a single reference
        num_nodes = self.num_tasks_assigned / self.num_tasks_per_node
        self.perf_patterns = {
            'gflops': sn.extractsingle(
                r'HPCG result is VALID with a GFLOP\/s rating of:\s*'
                r'(?P<perf>\S+)',
                self.outfile_lazy, 'perf', float) / num_nodes
        }

        self.sanity_patterns = sn.all([
            sn.assert_eq(4, sn.count(
                sn.findall(r'PASSED', self.outfile_lazy))),
            sn.assert_eq(0, self.num_tasks_assigned % self.num_tasks_per_node)
        ])

        super().setup(partition, environ, **job_opts)
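
# A typical invocation (assuming this file is saved as hpcg_benchmark.py
# and a ReFrame settings file for the target system is in place) might be:
#
#   reframe -c hpcg_benchmark.py -r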