Make the kernel latency test flexible in task count and support multiple GPUs per node

Theofilos Manitaras · Theofilos Manitaras · commit 42c8e33b9a7e · 2019-01-18T14:51:28.000+01:00
diff --git a/cscs-checks/microbenchmarks/kernel_latency/kernel_latency.py b/cscs-checks/microbenchmarks/kernel_latency/kernel_latency.py
@@ -2,6 +2,7 @@
 import reframe.utility.sanity as sn
 
 
+@rfm.required_version('>=2.16-dev0')
 @rfm.parameterized_test(['sync'], ['async'])
 class KernelLatencyTest(rfm.RegressionTest):
     def __init__(self, kernel_version):
@@ -10,13 +11,16 @@ def __init__(self, kernel_version):
         self.build_system = 'SingleSource'
         self.valid_systems = ['daint:gpu', 'dom:gpu', 'kesch:cn']
         self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-pgi']
-        self.num_gpus_per_node = 1
+        self.num_tasks = 0
+        self.num_tasks_per_node = 1
 
         if self.current_system.name in {'dom', 'daint'}:
+            self.num_gpus_per_node = 1
             gpu_arch = '60'
             self.modules = ['craype-accel-nvidia60']
             self.valid_prog_environs += ['PrgEnv-gnu']
         else:
+            self.num_gpus_per_node = 16
             self.modules = ['craype-accel-nvidia35']
             gpu_arch = '37'
 
@@ -28,12 +32,21 @@ def __init__(self, kernel_version):
         else:
             self.build_system.cppflags = ['-D SYNCKERNEL=0']
 
-        self.sanity_patterns = sn.assert_found(r'Found \d+ gpu\(s\)',
-                                               self.stdout)
+        self.sanity_patterns = sn.all([
+            sn.assert_eq(
+                sn.count(sn.findall(r'\[\S+\] Found \d+ gpu\(s\)',
+                                    self.stdout)),
+                self.num_tasks_assigned),
+            sn.assert_eq(
+                sn.count(sn.findall(r'\[\S+\] \[gpu \d+\] Kernel launch '
+                                    r'latency: \S+ us', self.stdout)),
+                self.num_tasks_assigned * self.num_gpus_per_node)
+        ])
+
         self.perf_patterns = {
-            'latency': sn.extractsingle(
-                r'Kernel launch latency: (?P<latency>\S+) us',
-                self.stdout, 'latency', float)
+            'latency': sn.max(sn.extractall(
+                r'\[\S+\] \[gpu \d+\] Kernel launch latency: '
+                r'(?P<latency>\S+) us', self.stdout, 'latency', float))
         }
         self.sys_reference = {
             'sync': {
@@ -63,4 +76,9 @@ def __init__(self, kernel_version):
         self.reference = self.sys_reference[kernel_version]
 
         self.maintainers = ['TM']
-        self.tags = {'production'}
+        self.tags = {'benchmark', 'diagnostic'}
+
+    @property
+    @sn.sanity_function
+    def num_tasks_assigned(self):
+        return self.job.num_tasks
diff --git a/cscs-checks/microbenchmarks/kernel_latency/src/kernel_latency.cu b/cscs-checks/microbenchmarks/kernel_latency/src/kernel_latency.cu
@@ -1,49 +1,58 @@
 #include <iostream>
 #include <chrono>
 #include <ratio>
+#include <unistd.h>
 #include <cuda.h>
 
 __global__ void null_kernel() {
 };
 
 int main(int argc, char* argv[]) {
 
+    char hostname[256];
+    hostname[255]='\0';
+    gethostname(hostname, 255);
+
     cudaError_t error;
     int gpu_count = 0;
 
     error = cudaGetDeviceCount(&gpu_count);
 
     if (error == cudaSuccess) {
         if (gpu_count <= 0) {
-            std::cout << "Could not found any gpu\n";
+            std::cout << "[" << hostname << "] " << "Could not find any gpu\n";
             return 1;
         }
-        std::cout << "Found " << gpu_count << " gpu(s)\n";
+        std::cout << "[" << hostname << "] " << "Found " << gpu_count << " gpu(s)\n";
     }
     else{
-        std::cout << "Error getting gpu count, exiting...\n";
+        std::cout << "[" << hostname << "] " << "Error getting gpu count, exiting...\n";
         return 1;
     }
 
-    // Single kernel launch to initialize cuda runtime
-    null_kernel<<<1, 1>>>();
-
-    auto t_start = std::chrono::system_clock::now();
-    const int kernel_count = 1000;
+    for (int i = 0; i < gpu_count; i++) {
 
-    for (int i = 0; i < kernel_count; ++i) {
+        cudaSetDevice(i);
+        // Single kernel launch to initialize cuda runtime
         null_kernel<<<1, 1>>>();
-        #if SYNCKERNEL == 1
+
+        auto t_start = std::chrono::system_clock::now();
+        const int kernel_count = 1000;
+
+        for (int i = 0; i < kernel_count; ++i) {
+            null_kernel<<<1, 1>>>();
+            #if SYNCKERNEL == 1
+            cudaDeviceSynchronize();
+            #endif
+        }
+
+        #if SYNCKERNEL != 1
         cudaDeviceSynchronize();
         #endif
-    }
 
-    #if SYNCKERNEL != 1
-    cudaDeviceSynchronize();
-    #endif
-
-    auto t_end = std::chrono::system_clock::now();
-    std::cout << "Kernel launch latency: " << std::chrono::duration_cast<std::chrono::duration<double, std::micro>>(t_end - t_start).count() / kernel_count << " us\n";
+        auto t_end = std::chrono::system_clock::now();
+        std::cout << "[" << hostname << "] " << "[gpu " << i << "] " <<  "Kernel launch latency: " << std::chrono::duration_cast<std::chrono::duration<double, std::micro>>(t_end - t_start).count() / kernel_count << " us\n";
+    }
 
     return 0;
 }