@@ -25,6 +25,55 @@ void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY,
2525 blockDim.x = blockX;
2626 blockDim.y = blockY;
2727 blockDim.z = blockZ;
28+ unsigned nbNegGridDim{0 };
29+ if (gridX < 0 ) {
30+ ++nbNegGridDim;
31+ }
32+ if (gridY < 0 ) {
33+ ++nbNegGridDim;
34+ }
35+ if (gridZ < 0 ) {
36+ ++nbNegGridDim;
37+ }
38+ if (nbNegGridDim == 1 ) {
39+ int maxBlocks, nbBlocks, dev, multiProcCount;
40+ cudaError_t err1, err2;
41+ nbBlocks = blockDim.x * blockDim.y * blockDim.z ;
42+ cudaGetDevice (&dev);
43+ err1 = cudaDeviceGetAttribute (
44+ &multiProcCount, cudaDevAttrMultiProcessorCount, dev);
45+ err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor (
46+ &maxBlocks, kernel, nbBlocks, smem);
47+ if (err1 == cudaSuccess && err2 == cudaSuccess) {
48+ maxBlocks = multiProcCount * maxBlocks;
49+ }
50+ if (maxBlocks > 0 ) {
51+ if (gridDim.x > 0 ) {
52+ maxBlocks = maxBlocks / gridDim.x ;
53+ }
54+ if (gridDim.y > 0 ) {
55+ maxBlocks = maxBlocks / gridDim.y ;
56+ }
57+ if (gridDim.z > 0 ) {
58+ maxBlocks = maxBlocks / gridDim.z ;
59+ }
60+ if (maxBlocks < 1 ) {
61+ maxBlocks = 1 ;
62+ }
63+ if (gridX < 0 ) {
64+ gridDim.x = maxBlocks;
65+ }
66+ if (gridY < 0 ) {
67+ gridDim.y = maxBlocks;
68+ }
69+ if (gridZ < 0 ) {
70+ gridDim.z = maxBlocks;
71+ }
72+ }
73+ } else if (nbNegGridDim > 1 ) {
74+ Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
75+ terminator.Crash (" Too many invalid grid dimensions" );
76+ }
2877 cudaStream_t stream = 0 ; // TODO stream managment
2978 CUDA_REPORT_IF_ERROR (
3079 cudaLaunchKernel (kernel, gridDim, blockDim, params, smem, stream));
@@ -41,6 +90,55 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
4190 config.blockDim .x = blockX;
4291 config.blockDim .y = blockY;
4392 config.blockDim .z = blockZ;
93+ unsigned nbNegGridDim{0 };
94+ if (gridX < 0 ) {
95+ ++nbNegGridDim;
96+ }
97+ if (gridY < 0 ) {
98+ ++nbNegGridDim;
99+ }
100+ if (gridZ < 0 ) {
101+ ++nbNegGridDim;
102+ }
103+ if (nbNegGridDim == 1 ) {
104+ int maxBlocks, nbBlocks, dev, multiProcCount;
105+ cudaError_t err1, err2;
106+ nbBlocks = config.blockDim .x * config.blockDim .y * config.blockDim .z ;
107+ cudaGetDevice (&dev);
108+ err1 = cudaDeviceGetAttribute (
109+ &multiProcCount, cudaDevAttrMultiProcessorCount, dev);
110+ err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor (
111+ &maxBlocks, kernel, nbBlocks, smem);
112+ if (err1 == cudaSuccess && err2 == cudaSuccess) {
113+ maxBlocks = multiProcCount * maxBlocks;
114+ }
115+ if (maxBlocks > 0 ) {
116+ if (config.gridDim .x > 0 ) {
117+ maxBlocks = maxBlocks / config.gridDim .x ;
118+ }
119+ if (config.gridDim .y > 0 ) {
120+ maxBlocks = maxBlocks / config.gridDim .y ;
121+ }
122+ if (config.gridDim .z > 0 ) {
123+ maxBlocks = maxBlocks / config.gridDim .z ;
124+ }
125+ if (maxBlocks < 1 ) {
126+ maxBlocks = 1 ;
127+ }
128+ if (gridX < 0 ) {
129+ config.gridDim .x = maxBlocks;
130+ }
131+ if (gridY < 0 ) {
132+ config.gridDim .y = maxBlocks;
133+ }
134+ if (gridZ < 0 ) {
135+ config.gridDim .z = maxBlocks;
136+ }
137+ }
138+ } else if (nbNegGridDim > 1 ) {
139+ Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
140+ terminator.Crash (" Too many invalid grid dimensions" );
141+ }
44142 config.dynamicSmemBytes = smem;
45143 config.stream = 0 ; // TODO stream managment
46144 cudaLaunchAttribute launchAttr[1 ];
0 commit comments