Skip to content

Commit ad541e0

Browse files
committed
Improve load distribution of solver for lowres gpu simulations
1 parent 46ea344 commit ad541e0

File tree

3 files changed

+40
-11
lines changed

3 files changed

+40
-11
lines changed

ext/cuda/cuda_utils.jl

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,40 @@ function threads_via_occupancy(f!::F!, args) where {F!}
208208
return config.threads
209209
end
210210

211+
"""
    config_via_occupancy(f!::F!, nitems, args) where {F!}

Return a named tuple `(; threads, blocks)` giving an approximately optimal
launch configuration for kernel `f!` called with `args`, when there are
`nitems` total items to process.

If `nitems` is large enough that the configuration suggested by
`CUDA.launch_configuration` saturates the device, that suggestion is used
directly. Otherwise the items are spread across more SMs (smaller blocks,
more of them) to improve occupancy on small problems.
"""
function config_via_occupancy(f!::F!, nitems, args) where {F!}
    # Compile (but do not launch) the kernel so the occupancy API can
    # inspect its resource usage.
    kernel = CUDA.@cuda always_inline = true launch = false f!(args...)
    config = CUDA.launch_configuration(kernel.fun)
    dev = CUDA.device()
    n_sms = CUDA.attribute(dev, CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)
    block_dim_limit = CUDA.attribute(dev, CUDA.DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X)
    if cld(nitems, config.threads) < config.blocks
        # Too few items to fill the suggested number of blocks: the GPU
        # would not saturate, so distribute the work across more SMs.
        per_sm = cld(nitems, n_sms)
        # Respect the maximum block size (usually limited by register
        # pressure); if exceeded, attempt to halve the thread count once.
        if per_sm > block_dim_limit
            per_sm = div(per_sm, 2)
        end
        # per_sm should already be below config.threads here; min() is a
        # safety cap so we never exceed the occupancy-derived limit.
        threads = min(per_sm, config.threads)
    else
        # Saturated case: use the suggested block size, trimmed to nitems.
        threads = min(nitems, config.threads)
    end
    return (; threads, blocks = cld(nitems, threads))
end
244+
211245
"""
212246
thread_index()
213247

ext/cuda/matrix_fields_multiple_field_solve.jl

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,15 +38,12 @@ NVTX.@annotate function multiple_field_solve!(
3838
args = (device, caches, xs, As, bs, x1, us, mask, cart_inds, Val(Nnames))
3939

4040
nitems = Ni * Nj * Nh * Nnames
41-
threads = threads_via_occupancy(multiple_field_solve_kernel!, args)
42-
n_max_threads = min(threads, nitems)
43-
p = linear_partition(nitems, n_max_threads)
44-
41+
(; threads, blocks) = config_via_occupancy(multiple_field_solve_kernel!, nitems, args)
4542
auto_launch!(
4643
multiple_field_solve_kernel!,
4744
args;
48-
threads_s = p.threads,
49-
blocks_s = p.blocks,
45+
threads_s = threads,
46+
blocks_s = blocks,
5047
always_inline = true,
5148
)
5249
call_post_op_callback() && post_op_callback(x, dev, cache, x, A, b, x1)

ext/cuda/matrix_fields_single_field_solve.jl

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,13 @@ function single_field_solve!(device::ClimaComms.CUDADevice, cache, x, A, b)
1919
mask = Spaces.get_mask(axes(x))
2020
cart_inds = cartesian_indices_columnwise(us)
2121
args = (device, cache, x, A, b, us, mask, cart_inds)
22-
threads = threads_via_occupancy(single_field_solve_kernel!, args)
2322
nitems = Ni * Nj * Nh
24-
n_max_threads = min(threads, nitems)
25-
p = linear_partition(nitems, n_max_threads)
23+
(; threads, blocks) = config_via_occupancy(single_field_solve_kernel!, nitems, args)
2624
auto_launch!(
2725
single_field_solve_kernel!,
2826
args;
29-
threads_s = p.threads,
30-
blocks_s = p.blocks,
27+
threads_s = threads,
28+
blocks_s = blocks,
3129
)
3230
call_post_op_callback() && post_op_callback(x, device, cache, x, A, b)
3331
end

0 commit comments

Comments
 (0)