@@ -233,6 +233,46 @@ void operatorKernel(
233233 op ( args... );
234234}
235235
/// @brief Cuda kernel that default-constructs a device operator functor and invokes it with
///        arbitrary arguments, followed by the block's dynamic shared memory buffer
/// @tparam Operator functor type; must expose the static members MaxThreadsPerBlock and
///         MinBlocksPerMultiprocessor (both are handed to __launch_bounds__) and be callable
///         on the device with the arguments followed by the shared memory buffer
/// @tparam Args types of the arguments passed through to the functor
/// @param args arguments handed, by value, to the functor ahead of the shared memory buffer
/// @note The byte size of the shared memory buffer is whatever the caller supplies as the
///       dynamic shared memory parameter of the launch configuration.
template<class Operator, typename... Args>
__global__ void
__launch_bounds__(Operator::MaxThreadsPerBlock, Operator::MinBlocksPerMultiprocessor)
operatorKernelDynamic(Args... args)
{
    // Unsized extern array: refers to the dynamically allocated shared memory of this block
    extern __shared__ char dynamicSmem[];

    Operator functor;
    functor(args..., dynamicSmem);
}
246+
/// @brief Wrapper for launching a device operator that leverages dynamic shared memory, with a specified size
/// @tparam Operator functor type; must expose the static members MaxThreadsPerBlock and
///         MinBlocksPerMultiprocessor and a device call operator taking the arguments followed
///         by a char buffer
/// @tparam Args types of the arguments passed through to the functor
/// @param numItems number of thread blocks in the launch grid; a value of zero is a no-op
/// @param smem_size size, in bytes, of the dynamic shared memory requested per block
/// @param stream Cuda stream on which the kernel is launched
/// @param args arguments passed, by value, to the functor ahead of the shared memory buffer
/// @note The launch is asynchronous: only launch-configuration errors are reported here,
///       errors raised while the kernel executes surface at a later synchronizing call.
/// @code
///     struct MyFunctor
///     {
///         // These are passed to __launch_bounds__
///         static constexpr int MaxThreadsPerBlock = <nThreads>;
///         static constexpr int MinBlocksPerMultiprocessor = 1;
///
///         struct SharedStorage {
///             // Include whatever is needed in smem
///         };
///
///         __device__
///         void operator()(Args... myArgs, char smem_buf[])
///         { ... }
///     };
///
///     dynamicSharedMemoryLauncher<MyFunctor>(nBlocks, sizeof(typename MyFunctor::SharedStorage), stream, myArgs...);
///     // smem_buf of size sizeof(MyFunctor::SharedStorage) will be automatically passed along
/// @endcode
template<class Operator, typename... Args>
void dynamicSharedMemoryLauncher(const size_t numItems, const size_t smem_size, cudaStream_t stream, Args... args)
{
    // A grid with zero blocks is an invalid launch configuration, so treat it as "nothing to do"
    if (numItems == 0) return;

    // Raise the kernel's dynamic shared memory limit to the requested size (an explicit opt-in
    // is required for sizes above the default 48 KB); the attribute value is an int
    cudaCheck(cudaFuncSetAttribute(operatorKernelDynamic<Operator, Args...>,
                                   cudaFuncAttributeMaxDynamicSharedMemorySize,
                                   static_cast<int>(smem_size)));

    // Launch exactly the instantiation whose attribute was set above
    operatorKernelDynamic<Operator, Args...>
        <<<numItems, Operator::MaxThreadsPerBlock, smem_size, stream>>>(args...);

    // Kernel launches return no status, so query the runtime for launch-configuration errors
    cudaCheck(cudaGetLastError());
}
275+
236276#endif // __CUDACC__
237277
238278}// namespace util::cuda ============================================================
0 commit comments