@@ -208,6 +208,40 @@ function threads_via_occupancy(f!::F!, args) where {F!}
208208 return config. threads
209209end
210210
"""
    config_via_occupancy(f!::F!, nitems, args) where {F!}

Returns a named tuple of `(:threads, :blocks)` that contains an approximate
optimal launch configuration for the kernel `f!` with arguments `args`, given
`nitems` total items to process.

If the number of items is greater than the minimal number of threads required for the config
suggested by `CUDA.launch_configuration` to be valid, that config is returned. Otherwise,
the threads are spread out across more SMs to improve occupancy.

Throws an `ArgumentError` if `nitems` is not positive (a zero item count would
otherwise produce a division by zero when computing the block count).
"""
function config_via_occupancy(f!::F!, nitems, args) where {F!}
    nitems > 0 || throw(ArgumentError("nitems must be positive, got $nitems"))
    # Compile the kernel without launching it so we can query its
    # occupancy-based launch configuration.
    kernel = CUDA.@cuda always_inline = true launch = false f!(args...)
    config = CUDA.launch_configuration(kernel.fun)
    SM_count = CUDA.attribute(CUDA.device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)
    max_block_size = CUDA.attribute(CUDA.device(), CUDA.DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X)
    if cld(nitems, config.threads) < config.blocks
        # The GPU will not saturate at the suggested block size, so spread out
        # threads across more SMs by shrinking the block size.
        even_distribution_threads = cld(nitems, SM_count)
        # Ensure we don't exceed max block size (usually limited by register
        # pressure); if so, attempt to halve the number of threads. Note that
        # the `min` below is what ultimately guarantees a valid thread count,
        # since `config.threads` is already known to be launchable.
        even_distribution_threads =
            even_distribution_threads > max_block_size ?
            div(even_distribution_threads, 2) : even_distribution_threads
        # it should be safe to assume even_distribution_threads < config.threads here
        threads = min(even_distribution_threads, config.threads)
        blocks = cld(nitems, threads)
    else
        # Enough work to saturate: use the suggested config, capped at nitems.
        threads = min(nitems, config.threads)
        blocks = cld(nitems, threads)
    end
    return (; threads, blocks)
end
244+
211245"""
212246 thread_index()
213247
0 commit comments