@@ -530,6 +530,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
530530 }
531531
532532 std::vector<CUlaunchAttribute> launch_attribute (numPropsInLaunchPropList);
533+
534+ // Early exit for zero size kernel
535+ if (*pGlobalWorkSize == 0 ) {
536+ return urEnqueueEventsWaitWithBarrier (hQueue, numEventsInWaitList,
537+ phEventWaitList, phEvent);
538+ }
539+
540+ // Set the number of threads per block to the number of threads per warp
541+ // by default unless user has provided a better number
542+ size_t ThreadsPerBlock[3 ] = {32u , 1u , 1u };
543+ size_t BlocksPerGrid[3 ] = {1u , 1u , 1u };
544+
545+ uint32_t LocalSize = hKernel->getLocalSize ();
546+ CUfunction CuFunc = hKernel->get ();
547+
533548 for (uint32_t i = 0 ; i < numPropsInLaunchPropList; i++) {
534549 switch (launchPropList[i].id ) {
535550 case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: {
@@ -540,12 +555,32 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
540555
541556 launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
542557 // Note that cuda orders from right to left wrt SYCL dimensional order.
543- launch_attribute[i].value .clusterDim .x =
544- launchPropList[i].value .clusterDim [2 ];
545- launch_attribute[i].value .clusterDim .y =
546- launchPropList[i].value .clusterDim [1 ];
547- launch_attribute[i].value .clusterDim .z =
548- launchPropList[i].value .clusterDim [0 ];
558+ if (workDim == 3 ) {
559+ launch_attribute[i].value .clusterDim .x =
560+ launchPropList[i].value .clusterDim [2 ];
561+ launch_attribute[i].value .clusterDim .y =
562+ launchPropList[i].value .clusterDim [1 ];
563+ launch_attribute[i].value .clusterDim .z =
564+ launchPropList[i].value .clusterDim [0 ];
565+ } else if (workDim == 2 ) {
566+ launch_attribute[i].value .clusterDim .x =
567+ launchPropList[i].value .clusterDim [1 ];
568+ launch_attribute[i].value .clusterDim .y =
569+ launchPropList[i].value .clusterDim [0 ];
570+ launch_attribute[i].value .clusterDim .z =
571+ launchPropList[i].value .clusterDim [2 ];
572+ } else {
573+ launch_attribute[i].value .clusterDim .x =
574+ launchPropList[i].value .clusterDim [0 ];
575+ launch_attribute[i].value .clusterDim .y =
576+ launchPropList[i].value .clusterDim [1 ];
577+ launch_attribute[i].value .clusterDim .z =
578+ launchPropList[i].value .clusterDim [2 ];
579+ }
580+
581+ UR_CHECK_ERROR (cuFuncSetAttribute (
582+ CuFunc, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1 ));
583+
549584 break ;
550585 }
551586 case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: {
@@ -560,20 +595,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
560595 }
561596 }
562597
563- // Early exit for zero size kernel
564- if (*pGlobalWorkSize == 0 ) {
565- return urEnqueueEventsWaitWithBarrier (hQueue, numEventsInWaitList,
566- phEventWaitList, phEvent);
567- }
568-
569- // Set the number of threads per block to the number of threads per warp
570- // by default unless user has provided a better number
571- size_t ThreadsPerBlock[3 ] = {32u , 1u , 1u };
572- size_t BlocksPerGrid[3 ] = {1u , 1u , 1u };
573-
574- uint32_t LocalSize = hKernel->getLocalSize ();
575- CUfunction CuFunc = hKernel->get ();
576-
577598 // This might return UR_RESULT_ERROR_ADAPTER_SPECIFIC, which cannot be handled
578599 // using the standard UR_CHECK_ERROR
579600 if (ur_result_t Ret =
0 commit comments