33
33
34
34
namespace moe ::dev {
35
35
36
- #define CHECK_CUDA (cmd ) \
36
+ #define CHECK_CUDA_ERROR (cmd ) \
37
37
do { \
38
38
cudaError_t e = cmd; \
39
39
if (e != cudaSuccess) { \
@@ -46,34 +46,34 @@ namespace moe::dev {
46
46
47
47
#define LAUNCH_ESC (...) __VA_ARGS__
48
48
49
- #define LAUNCH_PDL (data, coopLaunch, types, kernel, numBlocks, numThreads, smemSize, stream ) \
50
- cudaLaunchConfig_t config{}; \
51
- config.gridDim = numBlocks; \
52
- config.blockDim = numThreads; \
53
- config.dynamicSmemBytes = smemSize; \
54
- config.stream = (cudaStream_t)stream; \
55
- \
56
- cudaLaunchAttribute attributes[2 ] = {}; \
57
- attributes[0 ].id = cudaLaunchAttributeProgrammaticStreamSerialization; \
58
- attributes[0 ].val.programmaticStreamSerializationAllowed = int (data.mUsePdl ); \
59
- attributes[1 ].id = cudaLaunchAttributeCooperative; \
60
- attributes[1 ].val.cooperative = int (coopLaunch); \
61
- config.attrs = attributes; \
62
- config.numAttrs = 2 ; \
63
- if (data.mUsePdl ) { \
64
- auto params = KernelParams<types, true >::setKernelParams (data); \
65
- auto kernelTyped = kernel<KernelParams<types, true >>; \
66
- if (smemSize > 48 * 1024 ) \
67
- CHECK_CUDA (cudaFuncSetAttribute (kernelTyped, cudaFuncAttributeMaxDynamicSharedMemorySize, \
68
- smemSize)); \
69
- CHECK_CUDA (cudaLaunchKernelEx (&config, kernelTyped, params)); \
70
- } else { \
71
- auto params = KernelParams<types, false >::setKernelParams (data); \
72
- auto kernelTyped = kernel<KernelParams<types, false >>; \
73
- if (smemSize > 48 * 1024 ) \
74
- CHECK_CUDA (cudaFuncSetAttribute (kernelTyped, cudaFuncAttributeMaxDynamicSharedMemorySize, \
75
- smemSize)); \
76
- CHECK_CUDA (cudaLaunchKernelEx (&config, kernelTyped, params)); \
49
// LAUNCH_PDL: configure and launch `kernel` through cudaLaunchKernelEx.
//
// Arguments, as used by the expansion below:
//   data        - object exposing `mUsePdl`; also passed to
//                 KernelParams<types, ...>::setKernelParams to build the kernel
//                 argument.
//   coopLaunch  - converted to int and stored as the cooperative launch
//                 attribute value.
//   types       - first template argument of KernelParams (presumably wrapped
//                 with LAUNCH_ESC by callers when it contains commas - TODO
//                 confirm against call sites).
//   kernel      - kernel template, instantiated as kernel<KernelParams<...>>.
//   numBlocks   - assigned to config.gridDim.
//   numThreads  - assigned to config.blockDim.
//   smemSize    - assigned to config.dynamicSmemBytes; when it exceeds
//                 48 * 1024 the kernel's cudaFuncAttributeMaxDynamicSharedMemorySize
//                 is set to smemSize before the launch.
//   stream      - cast to cudaStream_t and assigned to config.stream.
//
// Two launch attributes are always attached: programmatic stream serialization
// (its value taken from data.mUsePdl) and cooperative launch (its value taken
// from coopLaunch).
//
// data.mUsePdl is a runtime value, but it selects a compile-time template
// argument (KernelParams<types, true> vs KernelParams<types, false>), which is
// why both branches repeat the same steps with a different instantiation.
//
// Every CUDA runtime call is wrapped in CHECK_CUDA_ERROR.
//
// NOTE(review): the expansion is a sequence of bare statements, not wrapped in
// braces or do { ... } while (0). `config` and `attributes` are therefore
// declared in the scope where the macro is used; using the macro twice in the
// same scope would redeclare them, and it cannot be the unbraced body of an
// if/else.
+ #define LAUNCH_PDL (data, coopLaunch, types, kernel, numBlocks, numThreads, smemSize, stream ) \
50
+ cudaLaunchConfig_t config{}; \
51
+ config.gridDim = numBlocks; \
52
+ config.blockDim = numThreads; \
53
+ config.dynamicSmemBytes = smemSize; \
54
+ config.stream = (cudaStream_t)stream; \
55
+ \
56
+ cudaLaunchAttribute attributes[2 ] = {}; \
57
+ attributes[0 ].id = cudaLaunchAttributeProgrammaticStreamSerialization; \
58
+ attributes[0 ].val.programmaticStreamSerializationAllowed = int (data.mUsePdl ); \
59
+ attributes[1 ].id = cudaLaunchAttributeCooperative; \
60
+ attributes[1 ].val.cooperative = int (coopLaunch); \
61
+ config.attrs = attributes; \
62
+ config.numAttrs = 2 ; \
63
+ if (data.mUsePdl ) { \
64
+ auto params = KernelParams<types, true >::setKernelParams (data); \
65
+ auto kernelTyped = kernel<KernelParams<types, true >>; \
66
+ if (smemSize > 48 * 1024 ) \
67
+ CHECK_CUDA_ERROR (cudaFuncSetAttribute ( \
68
+ kernelTyped, cudaFuncAttributeMaxDynamicSharedMemorySize, smemSize)); \
69
+ CHECK_CUDA_ERROR (cudaLaunchKernelEx (&config, kernelTyped, params)); \
70
+ } else { \
71
+ auto params = KernelParams<types, false >::setKernelParams (data); \
72
+ auto kernelTyped = kernel<KernelParams<types, false >>; \
73
+ if (smemSize > 48 * 1024 ) \
74
+ CHECK_CUDA_ERROR (cudaFuncSetAttribute ( \
75
+ kernelTyped, cudaFuncAttributeMaxDynamicSharedMemorySize, smemSize)); \
76
+ CHECK_CUDA_ERROR (cudaLaunchKernelEx (&config, kernelTyped, params)); \
77
77
}
78
78
79
79
#define LAUNCH (data, kernel, numBlocks, numThreads, smemSize, stream ) \
0 commit comments