From 13238d67ae6f5acf52e6c88cf0cc8499207f44c5 Mon Sep 17 00:00:00 2001 From: Stephen Nicholas Swatman Date: Wed, 20 Aug 2025 11:00:23 +0200 Subject: [PATCH] Enable register spilling to shared memory CUDA 13.0 enables the PTX assembler to spill registers to shared memory instead of local memory, which should both be much faster, and also reduce the local memory usage of our fitting and finding kernels which are currently bottlenecking our throughput. --- .../specializations/propagate_to_next_surface_src.cuh | 4 ++++ .../src/fitting/kernels/specializations/fit_backward_src.cuh | 5 +++++ .../src/fitting/kernels/specializations/fit_forward_src.cuh | 5 +++++ 3 files changed, 14 insertions(+) diff --git a/device/cuda/src/finding/kernels/specializations/propagate_to_next_surface_src.cuh b/device/cuda/src/finding/kernels/specializations/propagate_to_next_surface_src.cuh index b139c6503e..6152a66c16 100644 --- a/device/cuda/src/finding/kernels/specializations/propagate_to_next_surface_src.cuh +++ b/device/cuda/src/finding/kernels/specializations/propagate_to_next_surface_src.cuh @@ -22,6 +22,10 @@ __global__ __launch_bounds__(128) void propagate_to_next_surface( const finding_config cfg, device::propagate_to_next_surface_payload payload) { +#if defined(__CUDA_ARCH__) && CUDART_VERSION >= 13000 + asm(".pragma \"enable_smem_spilling\";"); +#endif + device::propagate_to_next_surface( details::global_index1(), cfg, payload); } diff --git a/device/cuda/src/fitting/kernels/specializations/fit_backward_src.cuh b/device/cuda/src/fitting/kernels/specializations/fit_backward_src.cuh index 28286cfa30..5c077e9ba4 100644 --- a/device/cuda/src/fitting/kernels/specializations/fit_backward_src.cuh +++ b/device/cuda/src/fitting/kernels/specializations/fit_backward_src.cuh @@ -16,6 +16,11 @@ namespace kernels { template __global__ __launch_bounds__(128) void fit_backward( const fitting_config cfg, const device::fit_payload payload) { + +#if defined(__CUDA_ARCH__) && CUDART_VERSION >= 13000 + asm(".pragma \"enable_smem_spilling\";"); +#endif + device::fit_backward(details::global_index1(), cfg, payload); } } // namespace kernels diff --git a/device/cuda/src/fitting/kernels/specializations/fit_forward_src.cuh b/device/cuda/src/fitting/kernels/specializations/fit_forward_src.cuh index 2727c331b6..b114cba0b3 100644 --- a/device/cuda/src/fitting/kernels/specializations/fit_forward_src.cuh +++ b/device/cuda/src/fitting/kernels/specializations/fit_forward_src.cuh @@ -16,6 +16,11 @@ namespace kernels { template __global__ __launch_bounds__(128) void fit_forward( const fitting_config cfg, const device::fit_payload payload) { + +#if defined(__CUDA_ARCH__) && CUDART_VERSION >= 13000 + asm(".pragma \"enable_smem_spilling\";"); +#endif + device::fit_forward(details::global_index1(), cfg, payload); } } // namespace kernels