@@ -245,13 +245,14 @@ setKernelParams(const ur_context_handle_t Context,
245245 return UR_RESULT_SUCCESS;
246246 };
247247
248- size_t KernelLocalWorkGroupSize = 0 ;
248+ size_t KernelLocalWorkGroupSize = 1 ;
249249 for (size_t Dim = 0 ; Dim < WorkDim; Dim++) {
250250 auto Err = IsValid (Dim);
251251 if (Err != UR_RESULT_SUCCESS)
252252 return Err;
253- // If no error then sum the total local work size per dim.
254- KernelLocalWorkGroupSize += LocalWorkSize[Dim];
253+ // If no error then compute the total local work size as a product of
254+ // all dims.
255+ KernelLocalWorkGroupSize *= LocalWorkSize[Dim];
255256 }
256257
257258 if (hasExceededMaxRegistersPerBlock (Device, Kernel,
@@ -493,6 +494,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
493494 return Result;
494495}
495496
// Experimental cooperative-kernel launch entry point.
// As written, this function forwards every argument unchanged to
// urEnqueueKernelLaunch and returns that call's result; it performs no
// cooperative-specific validation or launch handling of its own.
// NOTE(review): CUDA grid-wide synchronization normally requires a launch via
// cuLaunchCooperativeKernel — confirm whether urEnqueueKernelLaunch provides
// that, or whether this wrapper is an interim placeholder.
497+ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp (
498+ ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
499+ const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
500+ const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
501+ const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
502+ return urEnqueueKernelLaunch (hQueue, hKernel, workDim, pGlobalWorkOffset,
503+ pGlobalWorkSize, pLocalWorkSize,
504+ numEventsInWaitList, phEventWaitList, phEvent);
505+ }
506+
496507// / Set parameters for general 3D memory copy.
497508// / If the source and/or destination is on the device, SrcPtr and/or DstPtr
498509// / must be a pointer to a CUdeviceptr
0 commit comments