add support for launch_bounds in templated kernels

benvanwerkhoven · benvanwerkhoven · commit 672e0c0ba9ad · 2022-10-28T17:30:22.000+02:00
diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py
@@ -685,40 +685,48 @@ def get_templated_typenames(template_parameters, template_arguments):
 
 def wrap_templated_kernel(kernel_string, kernel_name):
     """rewrite kernel_string to insert wrapper function for templated kernel"""
-    #parse kernel_name to find template_arguments and real kernel name
+    # parse kernel_name to find template_arguments and real kernel name
     name = kernel_name.split("<")[0]
     template_arguments = re.search(r".*?<(.*)>", kernel_name, re.S).group(1).split(',')
 
-    #parse templated kernel definition
-    #relatively strict regex that does not allow nested template parameters like vector<TF>
-    #within the template parameter list
-    regex = r"template\s*<([^>]*?)>\s*__global__\s+void\s+" + name + r"\s*\((.*?)\)\s*\{"
+    # parse templated kernel definition
+    # relatively strict regex that does not allow nested template parameters like vector<TF>
+    # within the template parameter list
+    regex = r"template\s*<([^>]*?)>\s*__global__\s+void\s+(__launch_bounds__\([^\)]+?\)\s+)?" + name + r"\s*\((.*?)\)\s*\{"
     match = re.search(regex, kernel_string, re.S)
     if not match:
         raise ValueError("could not find templated kernel definition")
 
     template_parameters = match.group(1).split(',')
-    argument_list = match.group(2).split(',')
+    argument_list = match.group(3).split(',')
     argument_list = [s.strip() for s in argument_list]    #remove extra whitespace around 'type name' strings
 
     type_list, name_list = split_argument_list(argument_list)
 
     templated_typenames = get_templated_typenames(template_parameters, template_arguments)
     apply_template_typenames(type_list, templated_typenames)
 
-    #replace __global__ with __device__ in the templated kernel definition
-    #could do a more precise replace, but __global__ cannot be used elsewhere in the definition
+    # replace __global__ with __device__ in the templated kernel definition
+    # could do a more precise replace, but __global__ cannot be used elsewhere in the definition
     definition = match.group(0).replace("__global__", "__device__")
 
-    #generate code for the compile-time template instantiation
+    # if a __launch_bounds__() qualifier was matched, strip it from the __device__ template and reuse it on the wrapper kernel
+    launch_bounds = ""
+    if match.group(2):
+        print(f"found launch bounds: {match.group(2)=}")
+
+        definition = definition.replace(match.group(2), " ")
+        launch_bounds = match.group(2)
+
+    # generate code for the compile-time template instantiation
     template_instantiation = f"template __device__ void {kernel_name}(" + ", ".join(type_list) + ");\n"
 
-    #generate code for the wrapper kernel
+    # generate code for the wrapper kernel
     new_arg_list = ", ".join([" ".join((a, b)) for a, b in zip(type_list, name_list)])
-    wrapper_function = "\nextern \"C\" __global__ void " + name + "_wrapper(" + new_arg_list + ") {\n  " + \
+    wrapper_function = "\nextern \"C\" __global__ void " + launch_bounds + name + "_wrapper(" + new_arg_list + ") {\n  " + \
        kernel_name + "(" + ", ".join(name_list) + ");\n}\n"
 
-    #copy kernel_string, replace definition and append template instantiation and wrapper function
+    # copy kernel_string, replace definition and append template instantiation and wrapper function
     new_kernel_string = kernel_string[:]
     new_kernel_string = new_kernel_string.replace(match.group(0), definition)
     new_kernel_string += "\n" + template_instantiation
diff --git a/test/test_core.py b/test/test_core.py
@@ -219,3 +219,62 @@ def test_wrap_templated_kernel():
     #check if original kernel is called
     assert "vector_add<float>(c, a, b, n);" in ans
 
+def test_wrap_templated_kernel2():
+    kernel_string = """
+template<typename TF> __global__ void __launch_bounds__(THREADS_PER_BLOCK, BLOCKS_PER_SM) vector_add(TF *c, const TF *__restrict__ a, TF * b , int n) {
+    auto i = blockIdx.x * block_size_x + threadIdx.x;
+    if (i<n) {
+        c[i] = a[i] + b[i];
+    }
+}
+"""
+    kernel_name = "vector_add<float>"
+    # wrapping must not raise, and the launch bounds must be carried over to the generated wrapper
+    ans, _ = core.wrap_templated_kernel(kernel_string, kernel_name)
+    assert "__launch_bounds__(THREADS_PER_BLOCK, BLOCKS_PER_SM) vector_add_wrapper(" in ans
+
+def test_wrap_templated_kernel3():
+    kernel_string = """
+template<typename TF> __global__ void __launch_bounds__(THREADS_PER_BLOCK, BLOCKS_PER_SM) vector_add1(TF *c, const TF *__restrict__ a, TF * b , int n) {
+    auto i = blockIdx.x * block_size_x + threadIdx.x;
+    if (i<n) {
+        c[i] = a[i] + b[i];
+    }
+}
+
+template<typename TF> __global__ void __launch_bounds__(THREADS_PER_BLOCK, BLOCKS_PER_WRONG) test_vector_add1(TF *c, const TF *__restrict__ a, TF * b , int n) {
+    auto i = blockIdx.x * block_size_x + threadIdx.x;
+    if (i<n) {
+        c[i] = a[i] + b[i];
+    }
+}
+"""
+    kernel_name = "vector_add1<float>"
+    ans, _ = core.wrap_templated_kernel(kernel_string, kernel_name)
+
+    # the wrapper must take its launch bounds from vector_add1 (the first kernel), not from the decoy test_vector_add1 that follows it
+    assert 'extern "C" __global__ void __launch_bounds__(THREADS_PER_BLOCK, BLOCKS_PER_SM) vector_add1_wrapper(float * c, const float *__restrict__ a, float * b, int n)' in ans
+
+
+def test_wrap_templated_kernel4():
+    kernel_string = """
+template<typename TF> __global__ void __launch_bounds__(THREADS_PER_BLOCK, BLOCKS_PER_WRONG) test_vector_add1(TF *c, const TF *__restrict__ a, TF * b , int n) {
+    auto i = blockIdx.x * block_size_x + threadIdx.x;
+    if (i<n) {
+        c[i] = a[i] + b[i];
+    }
+}
+
+template<typename TF> __global__ void __launch_bounds__(THREADS_PER_BLOCK, BLOCKS_PER_SM) vector_add1(TF *c, const TF *__restrict__ a, TF * b , int n) {
+    auto i = blockIdx.x * block_size_x + threadIdx.x;
+    if (i<n) {
+        c[i] = a[i] + b[i];
+    }
+}
+
+"""
+    kernel_name = "vector_add1<float>"
+    ans, _ = core.wrap_templated_kernel(kernel_string, kernel_name)
+
+    # the wrapper must take its launch bounds from vector_add1 (the second kernel), not from the decoy test_vector_add1 that precedes it
+    assert 'extern "C" __global__ void __launch_bounds__(THREADS_PER_BLOCK, BLOCKS_PER_SM) vector_add1_wrapper(float * c, const float *__restrict__ a, float * b, int n)' in ans