Explicitly export CUDA kernels in shared libraries

fwyzard · fwyzard · commit d6b11f4c685b · 2025-10-28T10:34:38.000+01:00
Mark CUDA kernels in shared libraries as having default visibility, so
they can be used by other executables, libraries or plugins.

The default symbol visibility of CUDA kernels changed from public (default)
to hidden in CUDA 13.0, so kernels are no longer exported unless marked.
diff --git a/HeterogeneousTest/CUDAKernel/README.md b/HeterogeneousTest/CUDAKernel/README.md
@@ -14,8 +14,8 @@ kernels that call the device functions defined in the `HeterogeneousTest/CUDADev
 ```c++
 namespace cms::cudatest {
 
-  __global__ void kernel_add_vectors_f(...);
-  __global__ void kernel_add_vectors_d(...);
+  __global__ __attribute__((visibility("default"))) void kernel_add_vectors_f(...);
+  __global__ __attribute__((visibility("default"))) void kernel_add_vectors_d(...);
 
 }  // namespace cms::cudatest
 ```
diff --git a/HeterogeneousTest/CUDAKernel/interface/DeviceAdditionKernel.h b/HeterogeneousTest/CUDAKernel/interface/DeviceAdditionKernel.h
@@ -7,15 +7,17 @@
 
 namespace cms::cudatest {
 
-  __global__ void kernel_add_vectors_f(const float* __restrict__ in1,
-                                       const float* __restrict__ in2,
-                                       float* __restrict__ out,
-                                       size_t size);
+  // Mark the kernel with default visibility to export it as a public symbol (kernels are not exported by default since CUDA 13.0)
+  __global__ __attribute__((visibility("default"))) void kernel_add_vectors_f(const float* __restrict__ in1,
+                                                                              const float* __restrict__ in2,
+                                                                              float* __restrict__ out,
+                                                                              size_t size);
 
-  __global__ void kernel_add_vectors_d(const double* __restrict__ in1,
-                                       const double* __restrict__ in2,
-                                       double* __restrict__ out,
-                                       size_t size);
+  // Mark the kernel with default visibility to export it as a public symbol (kernels are not exported by default since CUDA 13.0)
+  __global__ __attribute__((visibility("default"))) void kernel_add_vectors_d(const double* __restrict__ in1,
+                                                                              const double* __restrict__ in2,
+                                                                              double* __restrict__ out,
+                                                                              size_t size);
 
 }  // namespace cms::cudatest
 
diff --git a/HeterogeneousTest/CUDAKernel/src/DeviceAdditionKernel.cu b/HeterogeneousTest/CUDAKernel/src/DeviceAdditionKernel.cu
@@ -7,17 +7,17 @@
 
 namespace cms::cudatest {
 
-  __global__ void kernel_add_vectors_f(const float* __restrict__ in1,
-                                       const float* __restrict__ in2,
-                                       float* __restrict__ out,
-                                       size_t size) {
+  __global__ __attribute__((visibility("default"))) void kernel_add_vectors_f(const float* __restrict__ in1,
+                                                                              const float* __restrict__ in2,
+                                                                              float* __restrict__ out,
+                                                                              size_t size) {
     add_vectors_f(in1, in2, out, size);
   }
 
-  __global__ void kernel_add_vectors_d(const double* __restrict__ in1,
-                                       const double* __restrict__ in2,
-                                       double* __restrict__ out,
-                                       size_t size) {
+  __global__ __attribute__((visibility("default"))) void kernel_add_vectors_d(const double* __restrict__ in1,
+                                                                              const double* __restrict__ in2,
+                                                                              double* __restrict__ out,
+                                                                              size_t size) {
     add_vectors_d(in1, in2, out, size);
   }