diff --git a/include/ur_api.h b/include/ur_api.h index 28569597c4..e504a3aa88 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -9536,6 +9536,7 @@ urEnqueueCooperativeKernelLaunchExp( /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hKernel` +/// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == pLocalWorkSize` /// + `NULL == pGroupCountRet` @@ -9543,6 +9544,7 @@ urEnqueueCooperativeKernelLaunchExp( UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + ur_device_handle_t hDevice, ///< [in] handle of the device object uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group ///< work-items const size_t *pLocalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the @@ -11083,6 +11085,7 @@ typedef struct ur_kernel_set_specialization_constants_params_t { /// allowing the callback the ability to modify the parameter's value typedef struct ur_kernel_suggest_max_cooperative_group_count_exp_params_t { ur_kernel_handle_t *phKernel; + ur_device_handle_t *phDevice; uint32_t *pworkDim; const size_t **ppLocalWorkSize; size_t *pdynamicSharedMemorySize; diff --git a/include/ur_ddi.h b/include/ur_ddi.h index eeb323fc58..2384a68ea1 100644 --- a/include/ur_ddi.h +++ b/include/ur_ddi.h @@ -651,6 +651,7 @@ typedef ur_result_t(UR_APICALL *ur_pfnGetKernelProcAddrTable_t)( /// @brief Function-pointer for urKernelSuggestMaxCooperativeGroupCountExp typedef ur_result_t(UR_APICALL *ur_pfnKernelSuggestMaxCooperativeGroupCountExp_t)( ur_kernel_handle_t, + ur_device_handle_t, uint32_t, const size_t *, size_t, diff --git a/include/ur_print.hpp b/include/ur_print.hpp index 5255a20f78..08a2fc6ce2 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -13187,6 +13187,12 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur::details::printPtr(os, *(params->phKernel)); + os << ", "; + os << ".hDevice = "; + + ur::details::printPtr(os, + *(params->phDevice)); + os << ", "; os << ".workDim = "; diff --git a/scripts/core/exp-cooperative-kernels.yml b/scripts/core/exp-cooperative-kernels.yml index ad3ba0ffba..6020ca5f45 100644 --- a/scripts/core/exp-cooperative-kernels.yml +++ b/scripts/core/exp-cooperative-kernels.yml @@ -78,6 +78,9 @@ params: - type: $x_kernel_handle_t name: hKernel desc: "[in] handle of the kernel object" + - type: $x_device_handle_t + name: hDevice + desc: "[in] handle of the device object" - type: uint32_t name: workDim desc: "[in] number of dimensions, from 1 to 3, to specify the work-group work-items" diff --git a/source/adapters/cuda/kernel.cpp b/source/adapters/cuda/kernel.cpp index 46c4907d4b..340e5ff634 100644 --- a/source/adapters/cuda/kernel.cpp +++ b/source/adapters/cuda/kernel.cpp @@ -190,10 +190,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( } UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( - ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pLocalWorkSize, - size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) { + ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim, + const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, + uint32_t *pGroupCountRet) { UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_KERNEL); + std::ignore = hDevice; + size_t localWorkSize = pLocalWorkSize[0]; localWorkSize *= (workDim >= 2 ? pLocalWorkSize[1] : 1); localWorkSize *= (workDim == 3 ? pLocalWorkSize[2] : 1); diff --git a/source/adapters/hip/kernel.cpp b/source/adapters/hip/kernel.cpp index 1ba50c4360..a5aefb1293 100644 --- a/source/adapters/hip/kernel.cpp +++ b/source/adapters/hip/kernel.cpp @@ -169,9 +169,11 @@ urKernelGetNativeHandle(ur_kernel_handle_t, ur_native_handle_t *) { } UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( - ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pLocalWorkSize, - size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) { + ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim, + const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, + uint32_t *pGroupCountRet) { std::ignore = hKernel; + std::ignore = hDevice; std::ignore = workDim; std::ignore = pLocalWorkSize; std::ignore = dynamicSharedMemorySize; diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp index b15b4ce147..db9337289f 100644 --- a/source/adapters/level_zero/kernel.cpp +++ b/source/adapters/level_zero/kernel.cpp @@ -1054,8 +1054,9 @@ ur_result_t urKernelGetNativeHandle( } ur_result_t urKernelSuggestMaxCooperativeGroupCountExp( - ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pLocalWorkSize, - size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) { + ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim, + const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, + uint32_t *pGroupCountRet) { (void)dynamicSharedMemorySize; std::shared_lock Guard(hKernel->Mutex); @@ -1066,8 +1067,10 @@ ur_result_t urKernelSuggestMaxCooperativeGroupCountExp( ZE2UR_CALL(zeKernelSetGroupSize, (hKernel->ZeKernel, WG[0], WG[1], WG[2])); uint32_t TotalGroupCount = 0; + ze_kernel_handle_t ZeKernel; + UR_CALL(getZeKernel(hDevice->ZeDevice, hKernel, &ZeKernel)); ZE2UR_CALL(zeKernelSuggestMaxCooperativeGroupCount, - (hKernel->ZeKernel, &TotalGroupCount)); + (ZeKernel, &TotalGroupCount)); *pGroupCountRet = TotalGroupCount; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/level_zero/ur_interface_loader.hpp b/source/adapters/level_zero/ur_interface_loader.hpp index 5bd7c904f1..f2fd6a46d4 100644 --- a/source/adapters/level_zero/ur_interface_loader.hpp +++ b/source/adapters/level_zero/ur_interface_loader.hpp @@ -691,8 +691,9 @@ ur_result_t urEnqueueCooperativeKernelLaunchExp( const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); ur_result_t urKernelSuggestMaxCooperativeGroupCountExp( - ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pLocalWorkSize, - size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet); + ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim, + const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, + uint32_t *pGroupCountRet); ur_result_t urEnqueueTimestampRecordingExp( ur_queue_handle_t hQueue, bool blocking, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); diff --git a/source/adapters/level_zero/v2/api.cpp b/source/adapters/level_zero/v2/api.cpp index c04d4cf6ca..5fa6478118 100644 --- a/source/adapters/level_zero/v2/api.cpp +++ b/source/adapters/level_zero/v2/api.cpp @@ -560,8 +560,9 @@ ur_result_t urCommandBufferCommandGetInfoExp( } ur_result_t urKernelSuggestMaxCooperativeGroupCountExp( - ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pLocalWorkSize, - size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) { + ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim, + const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, + uint32_t *pGroupCountRet) { logger::error("{} function not implemented!", __FUNCTION__); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/source/adapters/mock/ur_mockddi.cpp b/source/adapters/mock/ur_mockddi.cpp index b60be1d561..b27c4efaa1 100644 --- a/source/adapters/mock/ur_mockddi.cpp +++ b/source/adapters/mock/ur_mockddi.cpp @@ -10057,6 +10057,7 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCountExp __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + ur_device_handle_t hDevice, ///< [in] handle of the device object uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group ///< work-items @@ -10072,7 +10073,11 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_result_t result = UR_RESULT_SUCCESS; ur_kernel_suggest_max_cooperative_group_count_exp_params_t params = { - &hKernel, &workDim, &pLocalWorkSize, &dynamicSharedMemorySize, + &hKernel, + &hDevice, + &workDim, + &pLocalWorkSize, + &dynamicSharedMemorySize, &pGroupCountRet}; auto beforeCallback = reinterpret_cast( diff --git a/source/adapters/opencl/kernel.cpp b/source/adapters/opencl/kernel.cpp index df160b65eb..fb2c735adc 100644 --- a/source/adapters/opencl/kernel.cpp +++ b/source/adapters/opencl/kernel.cpp @@ -390,6 +390,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( [[maybe_unused]] ur_kernel_handle_t hKernel, + [[maybe_unused]] ur_device_handle_t hDevice, [[maybe_unused]] uint32_t workDim, [[maybe_unused]] const size_t *pLocalWorkSize, [[maybe_unused]] size_t dynamicSharedMemorySize, diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index 55f8d00bea..3e8043a258 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -8633,6 +8633,7 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCountExp __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + ur_device_handle_t hDevice, ///< [in] handle of the device object uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group ///< work-items @@ -8654,7 +8655,11 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( } ur_kernel_suggest_max_cooperative_group_count_exp_params_t params = { - &hKernel, &workDim, &pLocalWorkSize, &dynamicSharedMemorySize, + &hKernel, + &hDevice, + &workDim, + &pLocalWorkSize, + &dynamicSharedMemorySize, &pGroupCountRet}; uint64_t instance = getContext()->notify_begin( UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT_EXP, @@ -8664,7 +8669,7 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( logger.info(" ---> urKernelSuggestMaxCooperativeGroupCountExp\n"); ur_result_t result = pfnSuggestMaxCooperativeGroupCountExp( - hKernel, workDim, pLocalWorkSize, dynamicSharedMemorySize, + hKernel, hDevice, workDim, pLocalWorkSize, dynamicSharedMemorySize, pGroupCountRet); getContext()->notify_end( diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index 6e48f79edc..d13df673cd 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -9656,6 +9656,7 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCountExp __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + ur_device_handle_t hDevice, ///< [in] handle of the device object uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group ///< work-items @@ -9681,6 +9682,10 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } + if (NULL == hDevice) { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + if (NULL == pLocalWorkSize) { return UR_RESULT_ERROR_INVALID_NULL_POINTER; } @@ -9695,8 +9700,13 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( getContext()->refCountContext->logInvalidReference(hKernel); } + if (getContext()->enableLifetimeValidation && + !getContext()->refCountContext->isReferenceValid(hDevice)) { + getContext()->refCountContext->logInvalidReference(hDevice); + } + ur_result_t result = pfnSuggestMaxCooperativeGroupCountExp( - hKernel, workDim, pLocalWorkSize, dynamicSharedMemorySize, + hKernel, hDevice, workDim, pLocalWorkSize, dynamicSharedMemorySize, pGroupCountRet); return result; diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index c74b9d6caf..480678d598 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -8844,6 +8844,7 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCountExp __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + ur_device_handle_t hDevice, ///< [in] handle of the device object uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group ///< work-items @@ -8871,9 +8872,12 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( // convert loader handle to platform handle hKernel = reinterpret_cast(hKernel)->handle; + // convert loader handle to platform handle + hDevice = reinterpret_cast(hDevice)->handle; + // forward to device-platform result = pfnSuggestMaxCooperativeGroupCountExp( - hKernel, workDim, pLocalWorkSize, dynamicSharedMemorySize, + hKernel, hDevice, workDim, pLocalWorkSize, dynamicSharedMemorySize, pGroupCountRet); return result; diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index e257366a7f..fc24d9347b 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -8935,12 +8935,14 @@ ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hKernel` +/// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == pLocalWorkSize` /// + `NULL == pGroupCountRet` /// - ::UR_RESULT_ERROR_INVALID_KERNEL ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + ur_device_handle_t hDevice, ///< [in] handle of the device object uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group ///< work-items @@ -8961,7 +8963,7 @@ ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( } return pfnSuggestMaxCooperativeGroupCountExp( - hKernel, workDim, pLocalWorkSize, dynamicSharedMemorySize, + hKernel, hDevice, workDim, pLocalWorkSize, dynamicSharedMemorySize, pGroupCountRet); } catch (...) { return exceptionToResult(std::current_exception()); diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 793045bcb4..eb3f20c77b 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -7578,12 +7578,14 @@ ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hKernel` +/// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == pLocalWorkSize` /// + `NULL == pGroupCountRet` /// - ::UR_RESULT_ERROR_INVALID_KERNEL ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + ur_device_handle_t hDevice, ///< [in] handle of the device object uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group ///< work-items