Update CUDA propagation and fit kernel launch

stephenswat · stephenswat · commit 7f97695416a0 · 2025-06-25T16:51:34.000+02:00
This commit updates the CUDA propagation and fitting kernel launch
parameters in two ways. Firstly it increases the block size from 64
threads to 128 threads. The reason for this is that some Compute
Capabilities (specifically 8.6 and 8.7) cannot achieve optimal
occupancy with 64 threads, as the resident block limit per SM is too
small. Secondly, this commit adds launch bounds to the kernels,
declaring a maximum block size of 128 threads and requesting a minimum
of 8 resident blocks per SM. This will increase the theoretical
occupancy to 66% on CC 8.6 and to at least 50% on all other Compute
Capabilities.
diff --git a/device/cuda/src/finding/combinatorial_kalman_filter.cuh b/device/cuda/src/finding/combinatorial_kalman_filter.cuh
@@ -369,7 +369,7 @@ combinatorial_kalman_filter(
                     .tips_view = tips_buffer,
                     .tip_lengths_view = tip_length_buffer};
 
-                const unsigned int nThreads = warp_size * 2;
+                const unsigned int nThreads = warp_size * 4;
                 const unsigned int nBlocks =
                     (n_candidates + nThreads - 1) / nThreads;
                 propagate_to_next_surface<
diff --git a/device/cuda/src/finding/kernels/specializations/propagate_to_next_surface_src.cuh b/device/cuda/src/finding/kernels/specializations/propagate_to_next_surface_src.cuh
@@ -18,7 +18,7 @@ namespace traccc::cuda {
 namespace kernels {
 
 template <typename propagator_t, typename bfield_t>
-__global__ void propagate_to_next_surface(
+__global__ __launch_bounds__(128, 8) void propagate_to_next_surface(
     const finding_config cfg,
     device::propagate_to_next_surface_payload<propagator_t, bfield_t> payload) {
 
diff --git a/device/cuda/src/fitting/kalman_fitting.cuh b/device/cuda/src/fitting/kalman_fitting.cuh
@@ -109,7 +109,7 @@ track_state_container_types::buffer kalman_fitting(
     param_liveness_setup_event->ignore();
 
     // Launch parameters for all the kernels.
-    const unsigned int nThreads = warp_size * 2;
+    const unsigned int nThreads = warp_size * 4;
     const unsigned int nBlocks = (n_tracks + nThreads - 1) / nThreads;
 
     // Fill the keys and param_ids buffers.
diff --git a/device/cuda/src/fitting/kernels/specializations/fit_backward_src.cuh b/device/cuda/src/fitting/kernels/specializations/fit_backward_src.cuh
@@ -14,8 +14,8 @@
 namespace traccc::cuda {
 namespace kernels {
 template <typename fitter_t>
-__global__ void fit_backward(const fitting_config cfg,
-                             const device::fit_payload<fitter_t> payload) {
+__global__ __launch_bounds__(128, 8) void fit_backward(
+    const fitting_config cfg, const device::fit_payload<fitter_t> payload) {
     device::fit_backward<fitter_t>(details::global_index1(), cfg, payload);
 }
 }  // namespace kernels
diff --git a/device/cuda/src/fitting/kernels/specializations/fit_forward_src.cuh b/device/cuda/src/fitting/kernels/specializations/fit_forward_src.cuh
@@ -14,8 +14,8 @@
 namespace traccc::cuda {
 namespace kernels {
 template <typename fitter_t>
-__global__ void fit_forward(const fitting_config cfg,
-                            const device::fit_payload<fitter_t> payload) {
+__global__ __launch_bounds__(128, 8) void fit_forward(
+    const fitting_config cfg, const device::fit_payload<fitter_t> payload) {
     device::fit_forward<fitter_t>(details::global_index1(), cfg, payload);
 }
 }  // namespace kernels