Merge pull request #2172 from hurricane642/amber_test

Vasileios Karakasis · web-flow · commit 3abb6dbe2fe2 · 2021-10-06T22:13:07.000+02:00
[test] Add new Amber NVE library test and refactor the CSCS Amber check
diff --git a/cscs-checks/apps/amber/amber_check.py b/cscs-checks/apps/amber/amber_check.py
@@ -3,197 +3,122 @@
 #
 # SPDX-License-Identifier: BSD-3-Clause
 
+import contextlib
 import reframe as rfm
-import reframe.utility.sanity as sn
+from hpctestlib.apps.amber.nve import amber_nve_check
 
 
-# FIXME: Use tuples as dictionary keys as soon as
-# https://github.com/eth-cscs/reframe/issues/2022 is in
-daint_gpu_performance = {
-    'Cellulose_production_NVE': (30.0, -0.05, None, 'ns/day'),
-    'FactorIX_production_NVE': (134.0, -0.05, None, 'ns/day'),
-    'JAC_production_NVE': (388.0, -0.05, None, 'ns/day'),
-    'JAC_production_NVE_4fs': (742, -0.05, None, 'ns/day'),
-}
-
-REFERENCE_GPU_PERFORMANCE = {
-    'daint:gpu': daint_gpu_performance,
-    'dom:gpu': daint_gpu_performance
-}
-
-daint_mc_performance_small = {
-    'Cellulose_production_NVE': (8.0, -0.30, None, 'ns/day'),
-    'FactorIX_production_NVE': (34.0, -0.30, None, 'ns/day'),
-    'JAC_production_NVE': (90.0, -0.30, None, 'ns/day'),
-    'JAC_production_NVE_4fs': (150.0, -0.30, None, 'ns/day'),
-}
-
-eiger_mc_performance_small = {
-    'Cellulose_production_NVE': (3.2, -0.30, None, 'ns/day'),
-    'FactorIX_production_NVE': (7.0, -0.30, None, 'ns/day'),
-    'JAC_production_NVE': (30.0, -0.30, None, 'ns/day'),
-    'JAC_production_NVE_4fs': (45.0, -0.30, None, 'ns/day'),
-}
-
-REFERENCE_CPU_PERFORMANCE_SMALL = {
-    'daint:mc': daint_mc_performance_small,
-    'dom:mc': daint_mc_performance_small,
-    'eiger:mc': eiger_mc_performance_small,
-    'pilatus:mc': eiger_mc_performance_small,
-}
-
-REFERENCE_CPU_PERFORMANCE_LARGE = {
-    'daint:mc': {
-        'Cellulose_production_NVE': (10.0, -0.30, None, 'ns/day'),
-        'FactorIX_production_NVE': (36.0, -0.30, None, 'ns/day'),
-        'JAC_production_NVE': (78.0, -0.30, None, 'ns/day'),
-        'JAC_production_NVE_4fs': (135.0, -0.30, None, 'ns/day'),
-    },
-    'eiger:mc': {
-        'Cellulose_production_NVE': (1.3, -0.30, None, 'ns/day'),
-        'FactorIX_production_NVE': (3.5, -0.30, None, 'ns/day'),
-        'JAC_production_NVE': (17.0, -0.30, None, 'ns/day'),
-        'JAC_production_NVE_4fs': (30.5, -0.30, None, 'ns/day'),
-    },
-}
-
-
-class AmberBaseCheck(rfm.RunOnlyRegressionTest):
-    valid_prog_environs = ['builtin']
-    strict_check = False
+@rfm.simple_test
+class cscs_amber_check(amber_nve_check):
     modules = ['Amber']
+    valid_prog_environs = ['builtin']
     extra_resources = {
         'switches': {
             'num_switches': 1
         }
     }
+    tags |= {'maintenance', 'production'}
     maintainers = ['VH', 'SO']
-    tags = {'scs', 'external-resources'}
-
-    benchmark = parameter([
-        # NVE simulations
-        'Cellulose_production_NVE',
-        'FactorIX_production_NVE',
-        'JAC_production_NVE_4fs',
-        'JAC_production_NVE',
-    ])
-
-    @run_after('init')
-    def download_files(self):
-        self.prerun_cmds = [
-            # cannot use wget because it is not installed on eiger
-            f'curl -LJO https://github.com/victorusu/amber_benchmark_suite'
-            f'/raw/main/amber_16_benchmark_suite/PME/{self.benchmark}.tar.bz2',
-            f'tar xf {self.benchmark}.tar.bz2'
-        ]
-
-    @run_after('init')
-    def set_energy_and_tolerance_reference(self):
-        self.ener_ref = {
-            # every system has a different reference energy and drift
-            'Cellulose_production_NVE': (-443246, 5.0E-05),
-            'FactorIX_production_NVE': (-234188, 1.0E-04),
-            'JAC_production_NVE_4fs': (-44810, 1.0E-03),
-            'JAC_production_NVE': (-58138, 5.0E-04),
-        }
-
-    @run_after('setup')
-    def set_executable_opts(self):
-        self.executable_opts = ['-O',
-                                '-i', self.input_file,
-                                '-o', self.output_file]
-        self.keep_files = [self.output_file]
-
-    @run_after('setup')
-    def set_sanity_patterns(self):
-        energy = sn.extractsingle(r' Etot\s+=\s+(?P<energy>\S+)',
-                                  self.output_file, 'energy', float, item=-2)
-        energy_reference = self.ener_ref[self.benchmark][0]
-        energy_diff = sn.abs(energy - energy_reference)
-        ref_ener_diff = sn.abs(self.ener_ref[self.benchmark][0] *
-                               self.ener_ref[self.benchmark][1])
-        self.sanity_patterns = sn.all([
-            sn.assert_found(r'Final Performance Info:', self.output_file),
-            sn.assert_lt(energy_diff, ref_ener_diff)
-        ])
-
-    @run_after('setup')
-    def set_generic_perf_references(self):
-        self.reference.update({'*': {
-            self.benchmark: (0, None, None, 'ns/day')
-        }})
-
-    @run_after('setup')
-    def set_perf_patterns(self):
-        self.perf_patterns = {
-            self.benchmark: sn.extractsingle(r'ns/day =\s+(?P<perf>\S+)',
-                                             self.output_file, 'perf',
-                                             float, item=1)
+    num_nodes = parameter([1, 4, 6, 8, 16])
+    allref = {
+        1: {
+            'p100': {
+                'Cellulose_production_NVE': (30.0, -0.05, None, 'ns/day'),
+                'FactorIX_production_NVE': (134.0, -0.05, None, 'ns/day'),
+                'JAC_production_NVE': (388.0, -0.05, None, 'ns/day'),
+                'JAC_production_NVE_4fs': (742, -0.05, None, 'ns/day')
+            }
+        },
+        4: {
+            'zen2': {
+                'Cellulose_production_NVE': (3.2, -0.30, None, 'ns/day'),
+                'FactorIX_production_NVE': (7.0, -0.30, None, 'ns/day'),
+                'JAC_production_NVE': (30.0, -0.30, None, 'ns/day'),
+                'JAC_production_NVE_4fs': (45.0, -0.30, None, 'ns/day')
+            }
+        },
+        6: {
+            'broadwell': {
+                'Cellulose_production_NVE': (8.0, -0.30, None, 'ns/day'),
+                'FactorIX_production_NVE': (34.0, -0.30, None, 'ns/day'),
+                'JAC_production_NVE': (90.0, -0.30, None, 'ns/day'),
+                'JAC_production_NVE_4fs': (150.0, -0.30, None, 'ns/day')
+            }
+        },
+        8: {
+            'zen2': {
+                'Cellulose_production_NVE': (1.3, -0.30, None, 'ns/day'),
+                'FactorIX_production_NVE': (3.5, -0.30, None, 'ns/day'),
+                'JAC_production_NVE': (17.0, -0.30, None, 'ns/day'),
+                'JAC_production_NVE_4fs': (30.5, -0.30, None, 'ns/day')
+            }
+        },
+        16: {
+            'broadwell': {
+                'Cellulose_production_NVE': (10.0, -0.30, None, 'ns/day'),
+                'FactorIX_production_NVE': (36.0, -0.30, None, 'ns/day'),
+                'JAC_production_NVE': (78.0, -0.30, None, 'ns/day'),
+                'JAC_production_NVE_4fs': (135.0, -0.30, None, 'ns/day')
+            }
         }
-
-
-@rfm.simple_test
-class AmberGPUCheck(AmberBaseCheck):
-    num_tasks = 1
-    num_tasks_per_node = 1
-    num_gpus_per_node = 1
-    valid_systems = ['daint:gpu', 'dom:gpu']
-    executable = 'pmemd.cuda.MPI'
-    input_file = 'mdin.GPU'
-    output_file = 'amber.out'
-    descr = f'Amber GPU check'
-    tags = {'maintenance', 'production', 'health'}
-    reference = REFERENCE_GPU_PERFORMANCE
-
-
-@rfm.simple_test
-class AmberCPUCheck(AmberBaseCheck):
-    scale = parameter(['small', 'large'])
-    valid_systems = ['daint:mc', 'eiger:mc']
-    executable = 'pmemd.MPI'
-    input_file = 'mdin.CPU'
-    output_file = 'amber.out'
-    tags = {'maintenance', 'production'}
-
-    @run_after('init')
-    def set_description(self):
-        self.mydescr = f'Amber parallel {self.scale} CPU check'
+    }
 
     @run_after('init')
-    def set_additional_systems(self):
-        if self.scale == 'small':
-            self.valid_systems += ['dom:mc', 'pilatus:mc']
+    def scope_systems(self):
+        valid_systems = {
+            'cuda': {1: ['daint:gpu', 'dom:gpu']},
+            'mpi': {
+                4: ['eiger:mc', 'pilatus:mc'],
+                6: ['daint:mc', 'dom:mc'],
+                8: ['pilatus:mc'],
+                16: ['daint:mc']
+            }
+        }
+        try:
+            self.valid_systems = valid_systems[self.variant][self.num_nodes]
+        except KeyError:
+            self.valid_systems = []
 
     @run_after('init')
     def set_hierarchical_prgenvs(self):
         if self.current_system.name in ['eiger', 'pilatus']:
             self.valid_prog_environs = ['cpeIntel']
 
+    @run_after('init')
+    def set_num_gpus_per_node(self):
+        if self.variant == 'cuda':
+            self.num_gpus_per_node = 1
+
     @run_after('setup')
-    def set_perf_reference(self):
-        if self.scale == 'small':
-            self.reference = REFERENCE_CPU_PERFORMANCE_SMALL
+    def skip_if_no_topo(self):
+        proc = self.current_partition.processor
+        pname = self.current_partition.fullname
+        if not proc.info:
+            self.skip(f'no topology information found for partition {pname!r}')
+
+    @run_after('setup')
+    def set_num_tasks(self):
+        if self.variant == 'cuda':
+            self.num_tasks_per_node = 1
         else:
-            self.reference = REFERENCE_CPU_PERFORMANCE_LARGE
+            proc = self.current_partition.processor
+            pname = self.current_partition.fullname
+            self.num_tasks_per_node = proc.num_cores
 
-    @run_after('init')
-    def set_num_tasks_cray_xc(self):
-        if self.current_system.name in ['daint', 'dom']:
-            self.num_tasks_per_node = 36
-            if self.scale == 'small':
-                self.num_nodes = 6
-            else:
-                self.num_nodes = 16
-            self.num_tasks = self.num_nodes * self.num_tasks_per_node
+        self.num_tasks = self.num_nodes * self.num_tasks_per_node
 
-    @run_after('init')
-    def set_num_tasks_cray_shasta(self):
-        if self.current_system.name in ['eiger', 'pilatus']:
-            self.num_tasks_per_node = 128
-            if self.scale == 'small':
-                self.num_nodes = 4
-            else:
-                # there are too many processors, the large jobs cannot start
-                # need to decrease to just 8 nodes
-                self.num_nodes = 8
-            self.num_tasks = self.num_nodes * self.num_tasks_per_node
+    @run_before('performance')
+    def set_perf_reference(self):
+        proc = self.current_partition.processor
+        pname = self.current_partition.fullname
+        if pname in ('daint:gpu', 'dom:gpu'):
+            arch = 'p100'
+        else:
+            arch = proc.arch
+
+        with contextlib.suppress(KeyError):
+            self.reference = {
+                pname: {
+                    'perf': self.allref[self.num_nodes][arch][self.benchmark]
+                }
+            }
diff --git a/docs/index.rst b/docs/index.rst
@@ -56,3 +56,4 @@ Publications
    usecases
    migration_2_to_3
    manuals
+   hpctestlib
diff --git a/hpctestlib/apps/amber/nve.py b/hpctestlib/apps/amber/nve.py