Skip to content

Commit 3fbd025

Browse files
committed
[flang][cuda] Compute grid x when star is used
1 parent 5005f8d commit 3fbd025

File tree

1 file changed

+46
-0
lines changed

1 file changed

+46
-0
lines changed

flang/runtime/CUDA/kernel.cpp

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,29 @@ void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY,
2525
blockDim.x = blockX;
2626
blockDim.y = blockY;
2727
blockDim.z = blockZ;
28+
bool gridIsStar = (gridX < 0); // <<<*, block>>> syntax was used.
29+
if (gridIsStar) {
30+
int maxBlocks, nbBlocks, dev, multiProcCount;
31+
cudaError_t err1, err2;
32+
nbBlocks = blockDim.x * blockDim.y * blockDim.z;
33+
cudaGetDevice(&dev);
34+
err1 = cudaDeviceGetAttribute(
35+
&multiProcCount, cudaDevAttrMultiProcessorCount, dev);
36+
err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
37+
&maxBlocks, kernel, nbBlocks, smem);
38+
if (err1 == cudaSuccess && err2 == cudaSuccess)
39+
maxBlocks = multiProcCount * maxBlocks;
40+
if (maxBlocks > 0) {
41+
if (gridDim.y > 0)
42+
maxBlocks = maxBlocks / gridDim.y;
43+
if (gridDim.z > 0)
44+
maxBlocks = maxBlocks / gridDim.z;
45+
if (maxBlocks < 1)
46+
maxBlocks = 1;
47+
if (gridIsStar)
48+
gridDim.x = maxBlocks;
49+
}
50+
}
2851
cudaStream_t stream = 0; // TODO stream managment
2952
CUDA_REPORT_IF_ERROR(
3053
cudaLaunchKernel(kernel, gridDim, blockDim, params, smem, stream));
@@ -41,6 +64,29 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
4164
config.blockDim.x = blockX;
4265
config.blockDim.y = blockY;
4366
config.blockDim.z = blockZ;
67+
bool gridIsStar = (gridX < 0); // <<<*, block>>> syntax was used.
68+
if (gridIsStar) {
69+
int maxBlocks, nbBlocks, dev, multiProcCount;
70+
cudaError_t err1, err2;
71+
nbBlocks = config.blockDim.x * config.blockDim.y * config.blockDim.z;
72+
cudaGetDevice(&dev);
73+
err1 = cudaDeviceGetAttribute(
74+
&multiProcCount, cudaDevAttrMultiProcessorCount, dev);
75+
err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
76+
&maxBlocks, kernel, nbBlocks, smem);
77+
if (err1 == cudaSuccess && err2 == cudaSuccess)
78+
maxBlocks = multiProcCount * maxBlocks;
79+
if (maxBlocks > 0) {
80+
if (config.gridDim.y > 0)
81+
maxBlocks = maxBlocks / config.gridDim.y;
82+
if (config.gridDim.z > 0)
83+
maxBlocks = maxBlocks / config.gridDim.z;
84+
if (maxBlocks < 1)
85+
maxBlocks = 1;
86+
if (gridIsStar)
87+
config.gridDim.x = maxBlocks;
88+
}
89+
}
4490
config.dynamicSmemBytes = smem;
4591
config.stream = 0; // TODO stream managment
4692
cudaLaunchAttribute launchAttr[1];

0 commit comments

Comments
 (0)