@@ -25,6 +25,29 @@ void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY,
2525 blockDim.x = blockX;
2626 blockDim.y = blockY;
2727 blockDim.z = blockZ;
28+ bool gridIsStar = (gridX < 0 ); // <<<*, block>>> syntax was used.
29+ if (gridIsStar) {
30+ int maxBlocks, nbBlocks, dev, multiProcCount;
31+ cudaError_t err1, err2;
32+ nbBlocks = blockDim.x * blockDim.y * blockDim.z ;
33+ cudaGetDevice (&dev);
34+ err1 = cudaDeviceGetAttribute (
35+ &multiProcCount, cudaDevAttrMultiProcessorCount, dev);
36+ err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor (
37+ &maxBlocks, kernel, nbBlocks, smem);
38+ if (err1 == cudaSuccess && err2 == cudaSuccess)
39+ maxBlocks = multiProcCount * maxBlocks;
40+ if (maxBlocks > 0 ) {
41+ if (gridDim.y > 0 )
42+ maxBlocks = maxBlocks / gridDim.y ;
43+ if (gridDim.z > 0 )
44+ maxBlocks = maxBlocks / gridDim.z ;
45+ if (maxBlocks < 1 )
46+ maxBlocks = 1 ;
47+ if (gridIsStar)
48+ gridDim.x = maxBlocks;
49+ }
50+ }
2851 cudaStream_t stream = 0 ; // TODO stream managment
2952 CUDA_REPORT_IF_ERROR (
3053 cudaLaunchKernel (kernel, gridDim, blockDim, params, smem, stream));
@@ -41,6 +64,29 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
4164 config.blockDim .x = blockX;
4265 config.blockDim .y = blockY;
4366 config.blockDim .z = blockZ;
67+ bool gridIsStar = (gridX < 0 ); // <<<*, block>>> syntax was used.
68+ if (gridIsStar) {
69+ int maxBlocks, nbBlocks, dev, multiProcCount;
70+ cudaError_t err1, err2;
71+ nbBlocks = config.blockDim .x * config.blockDim .y * config.blockDim .z ;
72+ cudaGetDevice (&dev);
73+ err1 = cudaDeviceGetAttribute (
74+ &multiProcCount, cudaDevAttrMultiProcessorCount, dev);
75+ err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor (
76+ &maxBlocks, kernel, nbBlocks, smem);
77+ if (err1 == cudaSuccess && err2 == cudaSuccess)
78+ maxBlocks = multiProcCount * maxBlocks;
79+ if (maxBlocks > 0 ) {
80+ if (config.gridDim .y > 0 )
81+ maxBlocks = maxBlocks / config.gridDim .y ;
82+ if (config.gridDim .z > 0 )
83+ maxBlocks = maxBlocks / config.gridDim .z ;
84+ if (maxBlocks < 1 )
85+ maxBlocks = 1 ;
86+ if (gridIsStar)
87+ config.gridDim .x = maxBlocks;
88+ }
89+ }
4490 config.dynamicSmemBytes = smem;
4591 config.stream = 0 ; // TODO stream managment
4692 cudaLaunchAttribute launchAttr[1 ];
0 commit comments