Skip to content

Commit 7f97695

Browse files
committed
Update CUDA propagation and fit kernel launch
This commit updates the CUDA propagation and fitting kernel launch parameters in two ways. Firstly it increases the block size from 64 threads to 128 threads. The reason for this is that some Compute Capabilities (specificially 8.6 and 8.7) cannot achieve optimal occupancy with 64 threads, as the resident block limit per SM is too small. Secondly, this commit adds launch bounds to the kernels, requesting a minimum of 8 blocks of 128 threads per SM. This will increase the theoretical occupancy to 66% on CC 8.6 and to at least 50% on all other Compute Capabilities.
1 parent e57a5d6 commit 7f97695

File tree

5 files changed

+7
-7
lines changed

5 files changed

+7
-7
lines changed

device/cuda/src/finding/combinatorial_kalman_filter.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -369,7 +369,7 @@ combinatorial_kalman_filter(
369369
.tips_view = tips_buffer,
370370
.tip_lengths_view = tip_length_buffer};
371371

372-
const unsigned int nThreads = warp_size * 2;
372+
const unsigned int nThreads = warp_size * 4;
373373
const unsigned int nBlocks =
374374
(n_candidates + nThreads - 1) / nThreads;
375375
propagate_to_next_surface<

device/cuda/src/finding/kernels/specializations/propagate_to_next_surface_src.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ namespace traccc::cuda {
1818
namespace kernels {
1919

2020
template <typename propagator_t, typename bfield_t>
21-
__global__ void propagate_to_next_surface(
21+
__global__ __launch_bounds__(128, 8) void propagate_to_next_surface(
2222
const finding_config cfg,
2323
device::propagate_to_next_surface_payload<propagator_t, bfield_t> payload) {
2424

device/cuda/src/fitting/kalman_fitting.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ track_state_container_types::buffer kalman_fitting(
109109
param_liveness_setup_event->ignore();
110110

111111
// Launch parameters for all the kernels.
112-
const unsigned int nThreads = warp_size * 2;
112+
const unsigned int nThreads = warp_size * 4;
113113
const unsigned int nBlocks = (n_tracks + nThreads - 1) / nThreads;
114114

115115
// Fill the keys and param_ids buffers.

device/cuda/src/fitting/kernels/specializations/fit_backward_src.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@
1414
namespace traccc::cuda {
1515
namespace kernels {
1616
template <typename fitter_t>
17-
__global__ void fit_backward(const fitting_config cfg,
18-
const device::fit_payload<fitter_t> payload) {
17+
__global__ __launch_bounds__(128, 8) void fit_backward(
18+
const fitting_config cfg, const device::fit_payload<fitter_t> payload) {
1919
device::fit_backward<fitter_t>(details::global_index1(), cfg, payload);
2020
}
2121
} // namespace kernels

device/cuda/src/fitting/kernels/specializations/fit_forward_src.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@
1414
namespace traccc::cuda {
1515
namespace kernels {
1616
template <typename fitter_t>
17-
__global__ void fit_forward(const fitting_config cfg,
18-
const device::fit_payload<fitter_t> payload) {
17+
__global__ __launch_bounds__(128, 8) void fit_forward(
18+
const fitting_config cfg, const device::fit_payload<fitter_t> payload) {
1919
device::fit_forward<fitter_t>(details::global_index1(), cfg, payload);
2020
}
2121
} // namespace kernels

0 commit comments

Comments
 (0)