@@ -194,60 +194,6 @@ mgpuLaunchKernel(CUfunction function, intptr_t gridX, intptr_t gridY,
                                       extra));
 }
 
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuLaunchClusterKernel(
-    CUfunction function, intptr_t clusterX, intptr_t clusterY,
-    intptr_t clusterZ, intptr_t gridX, intptr_t gridY, intptr_t gridZ,
-    intptr_t blockX, intptr_t blockY, intptr_t blockZ, int32_t smem,
-    CUstream stream, void **params, void **extra, size_t /*paramsCount*/) {
-  ScopedContext scopedContext;
-  if (smem > 0) {
-    // Avoid checking driver as it's more expensive than if statement
-    int32_t maxShmem = 0;
-    CUdevice device = getDefaultCuDevice();
-    CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
-    CUDA_REPORT_IF_ERROR(cuDeviceGetAttribute(
-        &maxShmem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
-        device));
-    if (maxShmem < smem) {
-      fprintf(stderr,
-              "Requested shared memory (%dkb) is larger than maximum allowed "
-              "shared memory (%dkb) for this device\n",
-              smem, maxShmem);
-    }
-    CUDA_REPORT_IF_ERROR(cuFuncSetAttribute(
-        function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem));
-  }
-  CUlaunchConfig config;
-  config.gridDimX = gridX;
-  config.gridDimY = gridY;
-  config.gridDimZ = gridZ;
-  config.blockDimX = blockX;
-  config.blockDimY = blockY;
-  config.blockDimZ = blockZ;
-  config.sharedMemBytes = smem;
-  config.hStream = stream;
-  CUlaunchAttribute launchAttr[2];
-  launchAttr[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
-  launchAttr[0].value.clusterDim.x = clusterX;
-  launchAttr[0].value.clusterDim.y = clusterY;
-  launchAttr[0].value.clusterDim.z = clusterZ;
-  launchAttr[1].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
-  launchAttr[1].value.clusterSchedulingPolicyPreference =
-      CU_CLUSTER_SCHEDULING_POLICY_SPREAD;
-  config.numAttrs = 2;
-  config.attrs = launchAttr;
-
-  debug_print("Launching kernel,"
-              "cluster: %ld, %ld, %ld, "
-              "grid=%ld,%ld,%ld, "
-              "threads: %ld, %ld, %ld, "
-              "smem: %dkb\n",
-              clusterX, clusterY, clusterZ, gridX, gridY, gridZ, blockX, blockY,
-              blockZ, smem);
-
-  CUDA_REPORT_IF_ERROR(cuLaunchKernelEx(&config, function, params, extra));
-}
-
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUstream mgpuStreamCreate() {
   ScopedContext scopedContext;
   CUstream stream = nullptr;
@@ -383,6 +329,60 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) {
 
 #if (CUDA_VERSION >= 12000)
 
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuLaunchClusterKernel(
+    CUfunction function, intptr_t clusterX, intptr_t clusterY,
+    intptr_t clusterZ, intptr_t gridX, intptr_t gridY, intptr_t gridZ,
+    intptr_t blockX, intptr_t blockY, intptr_t blockZ, int32_t smem,
+    CUstream stream, void **params, void **extra, size_t /*paramsCount*/) {
+  ScopedContext scopedContext;
+  if (smem > 0) {
+    // Avoid checking driver as it's more expensive than if statement
+    int32_t maxShmem = 0;
+    CUdevice device = getDefaultCuDevice();
+    CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
+    CUDA_REPORT_IF_ERROR(cuDeviceGetAttribute(
+        &maxShmem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
+        device));
+    if (maxShmem < smem) {
+      fprintf(stderr,
+              "Requested shared memory (%dkb) is larger than maximum allowed "
+              "shared memory (%dkb) for this device\n",
+              smem, maxShmem);
+    }
+    CUDA_REPORT_IF_ERROR(cuFuncSetAttribute(
+        function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem));
+  }
+  CUlaunchConfig config;
+  config.gridDimX = gridX;
+  config.gridDimY = gridY;
+  config.gridDimZ = gridZ;
+  config.blockDimX = blockX;
+  config.blockDimY = blockY;
+  config.blockDimZ = blockZ;
+  config.sharedMemBytes = smem;
+  config.hStream = stream;
+  CUlaunchAttribute launchAttr[2];
+  launchAttr[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
+  launchAttr[0].value.clusterDim.x = clusterX;
+  launchAttr[0].value.clusterDim.y = clusterY;
+  launchAttr[0].value.clusterDim.z = clusterZ;
+  launchAttr[1].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
+  launchAttr[1].value.clusterSchedulingPolicyPreference =
+      CU_CLUSTER_SCHEDULING_POLICY_SPREAD;
+  config.numAttrs = 2;
+  config.attrs = launchAttr;
+
+  debug_print("Launching kernel,"
+              "cluster: %ld, %ld, %ld, "
+              "grid=%ld,%ld,%ld, "
+              "threads: %ld, %ld, %ld, "
+              "smem: %dkb\n",
+              clusterX, clusterY, clusterZ, gridX, gridY, gridZ, blockX, blockY,
+              blockZ, smem);
+
+  CUDA_REPORT_IF_ERROR(cuLaunchKernelEx(&config, function, params, extra));
+}
+
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuTensorMapEncodeTiled(
     CUtensorMap *tensorMap,             // Tensor map object
     CUtensorMapDataType tensorDataType, // Tensor data type
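The change is a pure move: `mgpuLaunchClusterKernel` is relocated below the `#if (CUDA_VERSION >= 12000)` guard, so its cluster-launch driver calls (`cuLaunchKernelEx` with `CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION`) are only compiled when the file is built against a CUDA 12 toolkit. As a minimal host-side sketch of how a caller might drive this wrapper: the declarations below come from the diff itself, but loading the module and resolving `function` are assumed to happen elsewhere (those wrappers are not part of this diff), and the cluster/grid/block dimensions are purely illustrative.

```cpp
#include <cuda.h>
#include <cstdint>

// Declarations matching the wrappers visible in this diff.
extern "C" void mgpuLaunchClusterKernel(
    CUfunction function, intptr_t clusterX, intptr_t clusterY,
    intptr_t clusterZ, intptr_t gridX, intptr_t gridY, intptr_t gridZ,
    intptr_t blockX, intptr_t blockY, intptr_t blockZ, int32_t smem,
    CUstream stream, void **params, void **extra, size_t paramsCount);
extern "C" CUstream mgpuStreamCreate();

// Hypothetical caller: `function` must already be resolved from a loaded
// module, and `data` must point to device memory.
void launchWithClusters(CUfunction function, CUdeviceptr data) {
  CUstream stream = mgpuStreamCreate();
  void *params[] = {&data};
  // Grid dimensions are in blocks and must be divisible by the cluster
  // dimensions: a 2x1x1 cluster over an 8x1x1 grid gives 4 clusters of
  // 2 blocks each, 128 threads per block, no dynamic shared memory.
  mgpuLaunchClusterKernel(function, /*clusterX=*/2, /*clusterY=*/1,
                          /*clusterZ=*/1, /*gridX=*/8, /*gridY=*/1,
                          /*gridZ=*/1, /*blockX=*/128, /*blockY=*/1,
                          /*blockZ=*/1, /*smem=*/0, stream, params,
                          /*extra=*/nullptr, /*paramsCount=*/1);
}
```

Note that the wrapper hard-codes the scheduling preference to `CU_CLUSTER_SCHEDULING_POLICY_SPREAD`, so callers cannot select a different cluster scheduling policy without modifying the wrapper itself.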