diff --git a/.github/intel-llvm-mirror-base-commit b/.github/intel-llvm-mirror-base-commit index dd0312365f..0eed1c8ea3 100644 --- a/.github/intel-llvm-mirror-base-commit +++ b/.github/intel-llvm-mirror-base-commit @@ -1 +1 @@ -a50acd0244276fb9efb231abae5ce9d71495768b +f119abe1b69f99602067517a64300a659471134b diff --git a/include/ur_api.h b/include/ur_api.h index 1bba8a950e..5882015c55 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -475,6 +475,8 @@ typedef enum ur_function_t { UR_FUNCTION_MEMORY_EXPORT_EXPORT_MEMORY_HANDLE_EXP = 287, /// Enumerator for ::urBindlessImagesSupportsImportingHandleTypeExp UR_FUNCTION_BINDLESS_IMAGES_SUPPORTS_IMPORTING_HANDLE_TYPE_EXP = 288, + /// Enumerator for ::urEnqueueKernelLaunchWithArgsExp + UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP = 289, /// @cond UR_FUNCTION_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -588,6 +590,8 @@ typedef enum ur_structure_type_t { UR_STRUCTURE_TYPE_EXP_ENQUEUE_NATIVE_COMMAND_PROPERTIES = 0x3000, /// ::ur_exp_enqueue_ext_properties_t UR_STRUCTURE_TYPE_EXP_ENQUEUE_EXT_PROPERTIES = 0x4000, + /// ::ur_exp_kernel_arg_properties_t + UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES = 0x5000, /// @cond UR_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -12843,6 +12847,166 @@ UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( /// propName. size_t *pPropSizeRet); +#if !defined(__GNUC__) +#pragma endregion +#endif +// Intel 'oneAPI' Unified Runtime Experimental API for setting args at kernel +// launch +#if !defined(__GNUC__) +#pragma region enqueue_kernel_launch_with_args_(experimental) +#endif +/////////////////////////////////////////////////////////////////////////////// +/// @brief What kind of kernel arg is this +typedef enum ur_exp_kernel_arg_type_t { + /// Kernel arg is a value. + UR_EXP_KERNEL_ARG_TYPE_VALUE = 0, + /// Kernel arg is a pointer. + UR_EXP_KERNEL_ARG_TYPE_POINTER = 1, + /// Kernel arg is a memory object. + UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ = 2, + /// Kernel arg is a local allocation. + UR_EXP_KERNEL_ARG_TYPE_LOCAL = 3, + /// Kernel arg is a sampler. + UR_EXP_KERNEL_ARG_TYPE_SAMPLER = 4, + /// @cond + UR_EXP_KERNEL_ARG_TYPE_FORCE_UINT32 = 0x7fffffff + /// @endcond + +} ur_exp_kernel_arg_type_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Mem obj/properties tuple +typedef struct ur_exp_kernel_arg_mem_obj_tuple_t { + /// [in] Handle of a memory object + ur_mem_handle_t hMem; + /// [in] Memory flags to associate with `hMem`. Allowed values are: + /// ::UR_MEM_FLAG_READ_WRITE, ::UR_MEM_FLAG_WRITE_ONLY, + /// ::UR_MEM_FLAG_READ_ONLY. + ur_mem_flags_t flags; + +} ur_exp_kernel_arg_mem_obj_tuple_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Typesafe container for a kernel argument value +typedef union ur_exp_kernel_arg_value_t { + /// [in] argument value represented as matching arg type. + /// The data pointed to will be copied and therefore can be reused on return. + const void *value; + /// [in] Allocation obtained by USM allocation or virtual memory mapping + /// operation, or pointer to a literal value. + const void *pointer; + /// [in] Struct containing a memory object and associated flags. + ur_exp_kernel_arg_mem_obj_tuple_t memObjTuple; + /// [in] Handle of a sampler object. + ur_sampler_handle_t sampler; + +} ur_exp_kernel_arg_value_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Kernel arg properties +typedef struct ur_exp_kernel_arg_properties_t { + /// [in] type of this structure, must be + /// ::UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES + ur_structure_type_t stype; + /// [in,out][optional] pointer to extension-specific structure + void *pNext; + /// [in] type of the kernel arg + ur_exp_kernel_arg_type_t type; + /// [in] index of the kernel arg + uint32_t index; + /// [in] size of the kernel arg + size_t size; + /// [in][tagged_by(type)] Union containing the argument value. + ur_exp_kernel_arg_value_t value; + +} ur_exp_kernel_arg_properties_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Enqueue a command to execute a kernel +/// +/// @remarks +/// _Analogues_ +/// - **clEnqueueNDRangeKernel** +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hQueue` +/// + `NULL == hKernel` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pGlobalWorkSize` +/// + `launchPropList == NULL && numPropsInLaunchPropList > 0` +/// + `pArgs == NULL && numArgs > 0` +/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION +/// + `NULL != pArgs && ::UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type` +/// - ::UR_RESULT_ERROR_INVALID_QUEUE +/// - ::UR_RESULT_ERROR_INVALID_KERNEL +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS +/// + An event in `phEventWaitList` has ::UR_EVENT_STATUS_ERROR. +/// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION +/// + `pGlobalWorkSize[0] == 0 || pGlobalWorkSize[1] == 0 || +/// pGlobalWorkSize[2] == 0` +/// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE +/// + `pLocalWorkSize && (pLocalWorkSize[0] == 0 || pLocalWorkSize[1] == +/// 0 || pLocalWorkSize[2] == 0)` +/// - ::UR_RESULT_ERROR_INVALID_VALUE +/// - ::UR_RESULT_ERROR_INVALID_KERNEL_ARGS - "The kernel argument values +/// have not been specified." +/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +/// - ::UR_RESULT_ERROR_INVALID_OPERATION +/// + If any property in `launchPropList` isn't supported by the device. +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] handle of the kernel object + ur_kernel_handle_t hKernel, + /// [in] number of dimensions, from 1 to 3, to specify the global and + /// work-group work-items + uint32_t workDim, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of workDim unsigned values that specify the + /// number of global work-items in workDim that will execute the kernel + /// function + const size_t *pGlobalWorkSize, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the number of local work-items forming a work-group that will + /// execute the kernel function. + /// If nullptr, the runtime implementation will choose the work-group size. + const size_t *pLocalWorkSize, + /// [in] Number of entries in pArgs + uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. + const ur_exp_kernel_arg_properties_t *pArgs, + /// [in] size of the launch prop list + uint32_t numPropsInLaunchPropList, + /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list + /// of launch properties + const ur_kernel_launch_property_t *launchPropList, + /// [in] size of the event wait list + uint32_t numEventsInWaitList, + /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of + /// events that must be complete before the kernel execution. + /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait + /// event. + const ur_event_handle_t *phEventWaitList, + /// [out][optional][alloc] return an event object that identifies this + /// particular kernel execution instance. If phEventWaitList and phEvent + /// are not NULL, phEvent must not refer to an element of the + /// phEventWaitList array. + ur_event_handle_t *phEvent); + #if !defined(__GNUC__) #pragma endregion #endif @@ -14445,6 +14609,26 @@ typedef struct ur_enqueue_write_host_pipe_params_t { ur_event_handle_t **pphEvent; } ur_enqueue_write_host_pipe_params_t; +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function parameters for urEnqueueKernelLaunchWithArgsExp +/// @details Each entry is a pointer to the parameter passed to the function; +/// allowing the callback the ability to modify the parameter's value +typedef struct ur_enqueue_kernel_launch_with_args_exp_params_t { + ur_queue_handle_t *phQueue; + ur_kernel_handle_t *phKernel; + uint32_t *pworkDim; + const size_t **ppGlobalWorkOffset; + const size_t **ppGlobalWorkSize; + const size_t **ppLocalWorkSize; + uint32_t *pnumArgs; + const ur_exp_kernel_arg_properties_t **ppArgs; + uint32_t *pnumPropsInLaunchPropList; + const ur_kernel_launch_property_t **plaunchPropList; + uint32_t *pnumEventsInWaitList; + const ur_event_handle_t **pphEventWaitList; + ur_event_handle_t **pphEvent; +} ur_enqueue_kernel_launch_with_args_exp_params_t; + /////////////////////////////////////////////////////////////////////////////// /// @brief Function parameters for urEnqueueEventsWaitWithBarrierExt /// @details Each entry is a pointer to the parameter passed to the function; diff --git a/include/ur_api_funcs.def b/include/ur_api_funcs.def index f0c92445b9..97092258a5 100644 --- a/include/ur_api_funcs.def +++ b/include/ur_api_funcs.def @@ -133,6 +133,7 @@ _UR_API(urEnqueueDeviceGlobalVariableRead) _UR_API(urEnqueueReadHostPipe) _UR_API(urEnqueueWriteHostPipe) _UR_API(urEnqueueEventsWaitWithBarrierExt) +_UR_API(urEnqueueKernelLaunchWithArgsExp) _UR_API(urEnqueueUSMDeviceAllocExp) _UR_API(urEnqueueUSMSharedAllocExp) _UR_API(urEnqueueUSMHostAllocExp) diff --git a/include/ur_ddi.h b/include/ur_ddi.h index 8ab686aa58..b1033a027a 100644 --- a/include/ur_ddi.h +++ b/include/ur_ddi.h @@ -1097,6 +1097,15 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( typedef ur_result_t(UR_APICALL *ur_pfnGetEnqueueProcAddrTable_t)( ur_api_version_t, ur_enqueue_dditable_t *); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urEnqueueKernelLaunchWithArgsExp +typedef ur_result_t(UR_APICALL *ur_pfnEnqueueKernelLaunchWithArgsExp_t)( + ur_queue_handle_t, ur_kernel_handle_t, uint32_t, const size_t *, + const size_t *, const size_t *, uint32_t, + const ur_exp_kernel_arg_properties_t *, uint32_t, + const ur_kernel_launch_property_t *, uint32_t, const ur_event_handle_t *, + ur_event_handle_t *); + /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urEnqueueUSMDeviceAllocExp typedef ur_result_t(UR_APICALL *ur_pfnEnqueueUSMDeviceAllocExp_t)( @@ -1147,6 +1156,7 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueNativeCommandExp_t)( /////////////////////////////////////////////////////////////////////////////// /// @brief Table of EnqueueExp functions pointers typedef struct ur_enqueue_exp_dditable_t { + ur_pfnEnqueueKernelLaunchWithArgsExp_t pfnKernelLaunchWithArgsExp; ur_pfnEnqueueUSMDeviceAllocExp_t pfnUSMDeviceAllocExp; ur_pfnEnqueueUSMSharedAllocExp_t pfnUSMSharedAllocExp; ur_pfnEnqueueUSMHostAllocExp_t pfnUSMHostAllocExp; diff --git a/include/ur_print.h b/include/ur_print.h index 8130df0c5b..3e1f03a3aa 100644 --- a/include/ur_print.h +++ b/include/ur_print.h @@ -1415,6 +1415,36 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintExpPeerInfo(enum ur_exp_peer_info_t value, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_exp_kernel_arg_type_t enum +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL +urPrintExpKernelArgType(enum ur_exp_kernel_arg_type_t value, char *buffer, + const size_t buff_size, size_t *out_size); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_exp_kernel_arg_mem_obj_tuple_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintExpKernelArgMemObjTuple( + const struct ur_exp_kernel_arg_mem_obj_tuple_t params, char *buffer, + const size_t buff_size, size_t *out_size); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_exp_kernel_arg_properties_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintExpKernelArgProperties( + const struct ur_exp_kernel_arg_properties_t params, char *buffer, + const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_exp_enqueue_ext_flag_t enum /// @returns @@ -2684,6 +2714,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueWriteHostPipeParams( const struct ur_enqueue_write_host_pipe_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_enqueue_kernel_launch_with_args_exp_params_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueKernelLaunchWithArgsExpParams( + const struct ur_enqueue_kernel_launch_with_args_exp_params_t *params, + char *buffer, const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_enqueue_events_wait_with_barrier_ext_params_t struct /// @returns diff --git a/include/ur_print.hpp b/include/ur_print.hpp index 91c9973a3a..93cc0d5f2b 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -50,6 +50,8 @@ inline ur_result_t printFlag(std::ostream &os, uint32_t flag); template inline ur_result_t printTagged(std::ostream &os, const void *ptr, T value, size_t size); +template +inline ur_result_t printArray(std::ostream &os, const T *ptr); inline ur_result_t printStruct(std::ostream &os, const void *ptr); @@ -263,6 +265,10 @@ template <> inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_exp_peer_info_t value, size_t size); +inline ur_result_t printUnion(std::ostream &os, + const union ur_exp_kernel_arg_value_t params, + const enum ur_exp_kernel_arg_type_t tag); + template <> inline ur_result_t printFlag(std::ostream &os, uint32_t flag); @@ -592,6 +598,14 @@ operator<<(std::ostream &os, [[maybe_unused]] const struct ur_exp_command_buffer_update_kernel_launch_desc_t params); inline std::ostream &operator<<(std::ostream &os, enum ur_exp_peer_info_t value); +inline std::ostream &operator<<(std::ostream &os, + enum ur_exp_kernel_arg_type_t value); +inline std::ostream &operator<<( + std::ostream &os, + [[maybe_unused]] const struct ur_exp_kernel_arg_mem_obj_tuple_t params); +inline std::ostream & +operator<<(std::ostream &os, + [[maybe_unused]] const struct ur_exp_kernel_arg_properties_t params); inline std::ostream &operator<<(std::ostream &os, enum ur_exp_enqueue_ext_flag_t value); inline std::ostream &operator<<( @@ -1274,6 +1288,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_BINDLESS_IMAGES_SUPPORTS_IMPORTING_HANDLE_TYPE_EXP: os << "UR_FUNCTION_BINDLESS_IMAGES_SUPPORTS_IMPORTING_HANDLE_TYPE_EXP"; break; + case UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP: + os << "UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP"; + break; default: os << "unknown enumerator"; break; @@ -1443,6 +1460,9 @@ inline std::ostream &operator<<(std::ostream &os, case UR_STRUCTURE_TYPE_EXP_ENQUEUE_EXT_PROPERTIES: os << "UR_STRUCTURE_TYPE_EXP_ENQUEUE_EXT_PROPERTIES"; break; + case UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES: + os << "UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES"; + break; default: os << "unknown enumerator"; break; @@ -1759,6 +1779,12 @@ inline ur_result_t printStruct(std::ostream &os, const void *ptr) { (const ur_exp_enqueue_ext_properties_t *)ptr; printPtr(os, pstruct); } break; + + case UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES: { + const ur_exp_kernel_arg_properties_t *pstruct = + (const ur_exp_kernel_arg_properties_t *)ptr; + printPtr(os, pstruct); + } break; default: os << "unknown enumerator"; return UR_RESULT_ERROR_INVALID_ENUMERATION; @@ -10894,13 +10920,7 @@ printUnion(std::ostream &os, case UR_KERNEL_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: os << ".clusterDim = {"; - for (auto i = 0; i < 3; i++) { - if (i != 0) { - os << ", "; - } - - os << (params.clusterDim[i]); - } + ur::details::printArray<3>(os, params.clusterDim); os << "}"; break; @@ -11545,13 +11565,7 @@ operator<<(std::ostream &os, const struct ur_exp_sampler_addr_modes_t params) { os << ", "; os << ".addrModes = {"; - for (auto i = 0; i < 3; i++) { - if (i != 0) { - os << ", "; - } - - os << (params.addrModes[i]); - } + ur::details::printArray<3>(os, params.addrModes); os << "}"; os << "}"; @@ -12270,6 +12284,141 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, } } // namespace ur::details +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_exp_kernel_arg_type_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, + enum ur_exp_kernel_arg_type_t value) { + switch (value) { + case UR_EXP_KERNEL_ARG_TYPE_VALUE: + os << "UR_EXP_KERNEL_ARG_TYPE_VALUE"; + break; + case UR_EXP_KERNEL_ARG_TYPE_POINTER: + os << "UR_EXP_KERNEL_ARG_TYPE_POINTER"; + break; + case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: + os << "UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ"; + break; + case UR_EXP_KERNEL_ARG_TYPE_LOCAL: + os << "UR_EXP_KERNEL_ARG_TYPE_LOCAL"; + break; + case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: + os << "UR_EXP_KERNEL_ARG_TYPE_SAMPLER"; + break; + default: + os << "unknown enumerator"; + break; + } + return os; +} +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_exp_kernel_arg_mem_obj_tuple_t type +/// @returns +/// std::ostream & +inline std::ostream & +operator<<(std::ostream &os, + const struct ur_exp_kernel_arg_mem_obj_tuple_t params) { + os << "(struct ur_exp_kernel_arg_mem_obj_tuple_t){"; + + os << ".hMem = "; + + ur::details::printPtr(os, (params.hMem)); + + os << ", "; + os << ".flags = "; + + ur::details::printFlag(os, (params.flags)); + + os << "}"; + return os; +} +namespace ur::details { + +/////////////////////////////////////////////////////////////////////////////// +// @brief Print ur_exp_kernel_arg_value_t union +inline ur_result_t printUnion(std::ostream &os, + const union ur_exp_kernel_arg_value_t params, + const enum ur_exp_kernel_arg_type_t tag) { + os << "(union ur_exp_kernel_arg_value_t){"; + + switch (tag) { + case UR_EXP_KERNEL_ARG_TYPE_VALUE: + + os << ".value = "; + + ur::details::printPtr(os, (params.value)); + + break; + case UR_EXP_KERNEL_ARG_TYPE_POINTER: + + os << ".pointer = "; + + ur::details::printPtr(os, (params.pointer)); + + break; + case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: + + os << ".memObjTuple = "; + + os << (params.memObjTuple); + + break; + case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: + + os << ".sampler = "; + + ur::details::printPtr(os, (params.sampler)); + + break; + default: + os << ""; + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + os << "}"; + return UR_RESULT_SUCCESS; +} +} // namespace ur::details +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_exp_kernel_arg_properties_t type +/// @returns +/// std::ostream & +inline std::ostream & +operator<<(std::ostream &os, + const struct ur_exp_kernel_arg_properties_t params) { + os << "(struct ur_exp_kernel_arg_properties_t){"; + + os << ".stype = "; + + os << (params.stype); + + os << ", "; + os << ".pNext = "; + + ur::details::printStruct(os, (params.pNext)); + + os << ", "; + os << ".type = "; + + os << (params.type); + + os << ", "; + os << ".index = "; + + os << (params.index); + + os << ", "; + os << ".size = "; + + os << (params.size); + + os << ", "; + os << ".value = "; + ur::details::printUnion(os, (params.value), params.type); + + os << "}"; + return os; +} /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_exp_enqueue_ext_flag_t type /// @returns @@ -16919,6 +17068,114 @@ inline std::ostream &operator<<( return os; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the +/// ur_enqueue_kernel_launch_with_args_exp_params_t type +/// @returns +/// std::ostream & +inline std::ostream & +operator<<(std::ostream &os, [[maybe_unused]] const struct + ur_enqueue_kernel_launch_with_args_exp_params_t *params) { + + os << ".hQueue = "; + + ur::details::printPtr(os, *(params->phQueue)); + + os << ", "; + os << ".hKernel = "; + + ur::details::printPtr(os, *(params->phKernel)); + + os << ", "; + os << ".workDim = "; + + os << *(params->pworkDim); + + os << ", "; + os << ".pGlobalWorkOffset = "; + + ur::details::printPtr(os, *(params->ppGlobalWorkOffset)); + + os << ", "; + os << ".pGlobalWorkSize = "; + + ur::details::printPtr(os, *(params->ppGlobalWorkSize)); + + os << ", "; + os << ".pLocalWorkSize = "; + + ur::details::printPtr(os, *(params->ppLocalWorkSize)); + + os << ", "; + os << ".numArgs = "; + + os << *(params->pnumArgs); + + os << ", "; + os << ".pArgs = "; + ur::details::printPtr(os, reinterpret_cast(*(params->ppArgs))); + if (*(params->ppArgs) != NULL) { + os << " {"; + for (size_t i = 0; i < *params->pnumArgs; ++i) { + if (i != 0) { + os << ", "; + } + + os << (*(params->ppArgs))[i]; + } + os << "}"; + } + + os << ", "; + os << ".numPropsInLaunchPropList = "; + + os << *(params->pnumPropsInLaunchPropList); + + os << ", "; + os << ".launchPropList = "; + ur::details::printPtr( + os, reinterpret_cast(*(params->plaunchPropList))); + if (*(params->plaunchPropList) != NULL) { + os << " {"; + for (size_t i = 0; i < *params->pnumPropsInLaunchPropList; ++i) { + if (i != 0) { + os << ", "; + } + + os << (*(params->plaunchPropList))[i]; + } + os << "}"; + } + + os << ", "; + os << ".numEventsInWaitList = "; + + os << *(params->pnumEventsInWaitList); + + os << ", "; + os << ".phEventWaitList = "; + ur::details::printPtr( + os, reinterpret_cast(*(params->pphEventWaitList))); + if (*(params->pphEventWaitList) != NULL) { + os << " {"; + for (size_t i = 0; i < *params->pnumEventsInWaitList; ++i) { + if (i != 0) { + os << ", "; + } + + ur::details::printPtr(os, (*(params->pphEventWaitList))[i]); + } + os << "}"; + } + + os << ", "; + os << ".phEvent = "; + + ur::details::printPtr(os, *(params->pphEvent)); + + return os; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the /// ur_enqueue_events_wait_with_barrier_ext_params_t type @@ -21047,6 +21304,25 @@ inline ur_result_t printPtr(std::ostream &os, const T *ptr) { return UR_RESULT_SUCCESS; } + +/////////////////////////////////////////////////////////////////////////////// +// @brief Print array of literals +template +inline ur_result_t printArray(std::ostream &os, const T *ptr) { + if (ptr == NULL) { + return printPtr(os, ptr); + } + + for (size_t i = 0; i < size; i++) { + if (i != 0) { + os << ", "; + } + + os << ptr[i]; + } + + return UR_RESULT_SUCCESS; +} } // namespace ur::details namespace ur::extras { @@ -21441,6 +21717,10 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, case UR_FUNCTION_ENQUEUE_WRITE_HOST_PIPE: { os << (const struct ur_enqueue_write_host_pipe_params_t *)params; } break; + case UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP: { + os << (const struct ur_enqueue_kernel_launch_with_args_exp_params_t *) + params; + } break; case UR_FUNCTION_ENQUEUE_EVENTS_WAIT_WITH_BARRIER_EXT: { os << (const struct ur_enqueue_events_wait_with_barrier_ext_params_t *) params; diff --git a/scripts/core/EXP-ENQUEUE-KERNEL-LAUNCH-WITH-ARGS.rst b/scripts/core/EXP-ENQUEUE-KERNEL-LAUNCH-WITH-ARGS.rst new file mode 100644 index 0000000000..703cd1e935 --- /dev/null +++ b/scripts/core/EXP-ENQUEUE-KERNEL-LAUNCH-WITH-ARGS.rst @@ -0,0 +1,77 @@ +<% + OneApi=tags['$OneApi'] + x=tags['$x'] + X=x.upper() +%> + +.. _experimental-enqueue-kernel-launch-with-args: + +================================================================================ +Enqueue Kernel Launch With Args +================================================================================ + +.. warning:: + + Experimental features: + + * May be replaced, updated, or removed at any time. + * Do not require maintaining API/ABI stability of their own additions over + time. + * Do not require conformance testing of their own additions. + + + +Motivation +-------------------------------------------------------------------------------- + +If an application is setting a kernel's args and launching that kernel in the +same place, we can eliminate some overhead by allowing this to be accomplished +with one API call, rather than requiring one call for each argument and one to +launch. This also aligns with developments in the Level Zero backend, as well +as how CUDA and HIP handle kernel args. + +API +-------------------------------------------------------------------------------- + +Enums +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +* ${x}_structure_type_t + ${X}_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES + +* ${x}_exp_kernel_arg_type_t + ${X}_EXP_KERNEL_ARG_TYPE_VALUE + ${X}_EXP_KERNEL_ARG_TYPE_POINTER + ${X}_EXP_KERNEL_ARG_TYPE_MEM_OBJ + ${X}_EXP_KERNEL_ARG_TYPE_LOCAL + ${X}_EXP_KERNEL_ARG_TYPE_SAMPLER + +Types +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +* ${x}_exp_kernel_arg_mem_obj_tuple_t +* ${x}_exp_kernel_arg_value_t +* ${x}_exp_kernel_arg_properties_t + +Functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +* ${x}EnqueueKernelLaunchWithArgsExp + +Changelog +-------------------------------------------------------------------------------- + ++-----------+---------------------------------------------+ +| Revision | Changes | ++===========+=============================================+ +| 1.0 | Initial Draft | ++-----------+---------------------------------------------+ + +Support +-------------------------------------------------------------------------------- + +Adapters must support this feature. A naive implementation can easily be +constructed as a wrapper around the existing APIs for setting kernel args and +launching. + +Contributors +-------------------------------------------------------------------------------- + +* Aaron Greig `aaron.greig@codeplay.com `_ diff --git a/scripts/core/exp-enqueue-kernel-launch-with-args.yml b/scripts/core/exp-enqueue-kernel-launch-with-args.yml new file mode 100644 index 0000000000..6656b6a6d0 --- /dev/null +++ b/scripts/core/exp-enqueue-kernel-launch-with-args.yml @@ -0,0 +1,170 @@ +# +# Copyright (C) 2025 Intel Corporation +# +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# See YaML.md for syntax definition +# +--- #-------------------------------------------------------------------------- +type: header +desc: "Intel $OneApi Unified Runtime Experimental API for setting args at kernel launch" +ordinal: "100" +--- #-------------------------------------------------------------------------- +type: enum +desc: "What kind of kernel arg is this" +class: $xKernel +name: $x_exp_kernel_arg_type_t +etors: + - name: VALUE + desc: "Kernel arg is a value." + - name: POINTER + desc: "Kernel arg is a pointer." + - name: MEM_OBJ + desc: "Kernel arg is a memory object." + - name: LOCAL + desc: "Kernel arg is a local allocation." + - name: SAMPLER + desc: "Kernel arg is a sampler." +--- #-------------------------------------------------------------------------- +type: struct +desc: "Mem obj/properties tuple" +name: $x_exp_kernel_arg_mem_obj_tuple_t +members: + - type: $x_mem_handle_t + name: hMem + desc: "[in] Handle of a memory object" + - type: $x_mem_flags_t + name: flags + desc: "[in] Memory flags to associate with `hMem`. Allowed values are: $X_MEM_FLAG_READ_WRITE, $X_MEM_FLAG_WRITE_ONLY, $X_MEM_FLAG_READ_ONLY." +--- #-------------------------------------------------------------------------- +# We have redundant entries in the union (value + pointer) to make printing work +# as it relies on the tags and we can currently only have one tag per member. +type: union +desc: "Typesafe container for a kernel argument value" +name: $x_exp_kernel_arg_value_t +tag: $x_exp_kernel_arg_type_t +members: + - type: "const void*" + name: value + desc: | + [in] argument value represented as matching arg type. + The data pointed to will be copied and therefore can be reused on return. + tag: $X_EXP_KERNEL_ARG_TYPE_VALUE + - type: "const void*" + name: pointer + desc: "[in] Allocation obtained by USM allocation or virtual memory mapping operation, or pointer to a literal value." + tag: $X_EXP_KERNEL_ARG_TYPE_POINTER + - type: $x_exp_kernel_arg_mem_obj_tuple_t + name: memObjTuple + desc: "[in] Struct containing a memory object and associated flags." + tag: $X_EXP_KERNEL_ARG_TYPE_MEM_OBJ + - type: $x_sampler_handle_t + name: sampler + desc: "[in] Handle of a sampler object." + tag: $X_EXP_KERNEL_ARG_TYPE_SAMPLER +--- #-------------------------------------------------------------------------- +type: struct +desc: "Kernel arg properties" +name: $x_exp_kernel_arg_properties_t +base: $x_base_properties_t +members: + - type: $x_exp_kernel_arg_type_t + name: type + desc: "[in] type of the kernel arg" + - type: uint32_t + name: index + desc: "[in] index of the kernel arg" + - type: size_t + name: size + desc: "[in] size of the kernel arg" + - type: $x_exp_kernel_arg_value_t + name: value + desc: "[in][tagged_by(type)] Union containing the argument value." +--- #-------------------------------------------------------------------------- +type: enum +extend: true +desc: "Structure Type experimental enumerations." +name: $x_structure_type_t +etors: + - name: EXP_KERNEL_ARG_PROPERTIES + desc: $x_exp_kernel_arg_properties_t + value: "0x5000" +--- #-------------------------------------------------------------------------- +type: function +desc: "Enqueue a command to execute a kernel" +class: $xEnqueue +name: KernelLaunchWithArgsExp +ordinal: "0" +analogue: + - "**clEnqueueNDRangeKernel**" +params: + - type: $x_queue_handle_t + name: hQueue + desc: "[in] handle of the queue object" + - type: $x_kernel_handle_t + name: hKernel + desc: "[in] handle of the kernel object" + - type: uint32_t + name: workDim + desc: "[in] number of dimensions, from 1 to 3, to specify the global and work-group work-items" + - type: "const size_t*" + name: pGlobalWorkOffset + desc: "[in][optional] pointer to an array of workDim unsigned values that specify the offset used to calculate the global ID of a work-item" + - type: "const size_t*" + name: pGlobalWorkSize + desc: "[in] pointer to an array of workDim unsigned values that specify the number of global work-items in workDim that will execute the kernel function" + - type: "const size_t*" + name: pLocalWorkSize + desc: | + [in][optional] pointer to an array of workDim unsigned values that specify the number of local work-items forming a work-group that will execute the kernel function. + If nullptr, the runtime implementation will choose the work-group size. + - type: uint32_t + name: numArgs + desc: "[in] Number of entries in pArgs" + - type: "const $x_exp_kernel_arg_properties_t*" + name: pArgs + desc: "[in][optional][range(0, numArgs)] pointer to a list of kernel arg properties." + - type: uint32_t + name: numPropsInLaunchPropList + desc: "[in] size of the launch prop list" + - type: const $x_kernel_launch_property_t* + name: launchPropList + desc: "[in][optional][range(0, numPropsInLaunchPropList)] pointer to a list of launch properties" + - type: uint32_t + name: numEventsInWaitList + desc: "[in] size of the event wait list" + - type: "const $x_event_handle_t*" + name: phEventWaitList + desc: | + [in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. + If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. + - type: $x_event_handle_t* + name: phEvent + desc: | + [out][optional][alloc] return an event object that identifies this particular kernel execution instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. +returns: + - $X_RESULT_ERROR_INVALID_QUEUE + - $X_RESULT_ERROR_INVALID_KERNEL + - $X_RESULT_ERROR_INVALID_EVENT + - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: + - "`phEventWaitList == NULL && numEventsInWaitList > 0`" + - "`phEventWaitList != NULL && numEventsInWaitList == 0`" + - "If event objects in phEventWaitList are not valid events." + - $X_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS: + - "An event in `phEventWaitList` has $X_EVENT_STATUS_ERROR." + - $X_RESULT_ERROR_INVALID_WORK_DIMENSION: + - "`pGlobalWorkSize[0] == 0 || pGlobalWorkSize[1] == 0 || pGlobalWorkSize[2] == 0`" + - $X_RESULT_ERROR_INVALID_WORK_GROUP_SIZE: + - "`pLocalWorkSize && (pLocalWorkSize[0] == 0 || pLocalWorkSize[1] == 0 || pLocalWorkSize[2] == 0)`" + - $X_RESULT_ERROR_INVALID_VALUE + - $X_RESULT_ERROR_INVALID_KERNEL_ARGS + - "The kernel argument values have not been specified." + - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY + - $X_RESULT_ERROR_OUT_OF_RESOURCES + - $X_RESULT_ERROR_INVALID_OPERATION: + - "If any property in `launchPropList` isn't supported by the device." + - $X_RESULT_ERROR_INVALID_NULL_POINTER: + - "`launchPropList == NULL && numPropsInLaunchPropList > 0`" + - "`pArgs == NULL && numArgs > 0`" diff --git a/scripts/core/registry.yml b/scripts/core/registry.yml index 349ac97a27..0646945b08 100644 --- a/scripts/core/registry.yml +++ b/scripts/core/registry.yml @@ -670,6 +670,9 @@ etors: - name: BINDLESS_IMAGES_SUPPORTS_IMPORTING_HANDLE_TYPE_EXP desc: Enumerator for $xBindlessImagesSupportsImportingHandleTypeExp value: '288' +- name: ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP + desc: Enumerator for $xEnqueueKernelLaunchWithArgsExp + value: '289' --- type: enum desc: Defines structure types diff --git a/scripts/parse_specs.py b/scripts/parse_specs.py index 9fdb69eedc..8adbc6de19 100644 --- a/scripts/parse_specs.py +++ b/scripts/parse_specs.py @@ -909,7 +909,9 @@ def _append(lst, key, val): rets[idx][key].append(val) def append_nullchecks(param, accessor: str): - if type_traits.is_pointer(param["type"]): + if type_traits.is_pointer(param["type"]) or type_traits.is_array( + param["type"] + ): _append( rets, "$X_RESULT_ERROR_INVALID_NULL_POINTER", diff --git a/scripts/templates/helper.py b/scripts/templates/helper.py index 00de01e347..5b4fc0c18c 100644 --- a/scripts/templates/helper.py +++ b/scripts/templates/helper.py @@ -987,12 +987,17 @@ def make_param_lines( words = [] if "type*" in format: - words.append(tname + "*") + ptname = tname + "*" + if type_traits.is_array(item["type"]): + ptname += "*" + words.append(ptname) name = "p" + name elif "type" in format: words.append(tname) if "name" in format: words.append(name) + if type_traits.is_array(item["type"]) and "type" in format: + words.append(f"[{type_traits.get_array_length(item['type'])}]") prologue = " ".join(words) if "delim" in format: diff --git a/scripts/templates/print.hpp.mako b/scripts/templates/print.hpp.mako index 4481847130..ed94cd9227 100644 --- a/scripts/templates/print.hpp.mako +++ b/scripts/templates/print.hpp.mako @@ -99,14 +99,7 @@ def findMemberType(_item): ${x}::details::printUnion(os, ${deref}(params${access}${item['name']}), params${access}${th.param_traits.tagged_member(item)}); %elif th.type_traits.is_array(item['type']): os << ".${iname} = {"; - for(auto i = 0; i < ${th.type_traits.get_array_length(item['type'])}; i++){ - if(i != 0){ - os << ", "; - } - <%call expr="member(iname, itype, True)"> - ${deref}(params${access}${item['name']}[i]) - - } + ${x}::details::printArray<${th.type_traits.get_array_length(item['type'])}>(os, ${deref}params${access}${pname}); os << "}"; %elif typename is not None: os << ".${iname} = "; @@ -140,6 +133,7 @@ inline constexpr bool is_handle_v = is_handle::value; template inline ${x}_result_t printPtr(std::ostream &os, const T *ptr); template inline ${x}_result_t printFlag(std::ostream &os, uint32_t flag); template inline ${x}_result_t printTagged(std::ostream &os, const void *ptr, T value, size_t size); +template inline ur_result_t printArray(std::ostream &os, const T *ptr); %for spec in specs: %for obj in spec['objects']: @@ -564,6 +558,25 @@ template inline ${x}_result_t printPtr(std::ostream &os, const T *p return ${X}_RESULT_SUCCESS; } + +/////////////////////////////////////////////////////////////////////////////// +// @brief Print array of literals +template +inline ur_result_t printArray(std::ostream &os, const T *ptr) { + if(ptr == NULL) { + return printPtr(os, ptr); + } + + for (size_t i = 0; i < size; i++) { + if (i != 0) { + os << ", "; + } + + os << ptr[i]; + } + + return ${X}_RESULT_SUCCESS; +} } // namespace ${x}::details namespace ${x}::extras { diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 8eb00ccab2..091e8e9d53 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -15,6 +15,7 @@ #include "kernel.hpp" #include "memory.hpp" #include "queue.hpp" +#include "sampler.hpp" #include #include @@ -619,6 +620,60 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( #endif // CUDA_VERSION >= 11080 } +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numArgs, + const ur_exp_kernel_arg_properties_t *pArgs, + uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + try { + for (uint32_t i = 0; i < numArgs; i++) { + switch (pArgs[i].type) { + case UR_EXP_KERNEL_ARG_TYPE_LOCAL: { + hKernel->setKernelLocalArg(pArgs[i].index, pArgs[i].size); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_VALUE: { + hKernel->setKernelArg(pArgs[i].index, pArgs[i].size, + pArgs[i].value.value); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_POINTER: { + // setKernelArg is expecting a pointer to our argument + hKernel->setKernelArg(pArgs[i].index, pArgs[i].size, + &pArgs[i].value.pointer); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: { + ur_kernel_arg_mem_obj_properties_t Props = { + UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES, nullptr, + pArgs[i].value.memObjTuple.flags}; + UR_CALL(urKernelSetArgMemObj(hKernel, pArgs[i].index, &Props, + pArgs[i].value.memObjTuple.hMem)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: { + uint32_t SamplerProps = pArgs[i].value.sampler->Props; + hKernel->setKernelArg(pArgs[i].index, sizeof(uint32_t), + (void *)&SamplerProps); + break; + } + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + } catch (ur_result_t Err) { + return Err; + } + return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, + pGlobalWorkSize, pLocalWorkSize, + numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent); +} + /// Set parameters for general 3D memory copy. /// If the source and/or destination is on the device, SrcPtr and/or DstPtr /// must be a pointer to a CUdeviceptr diff --git a/source/adapters/cuda/ur_interface_loader.cpp b/source/adapters/cuda/ur_interface_loader.cpp index 8430df0ab0..a9b072472b 100644 --- a/source/adapters/cuda/ur_interface_loader.cpp +++ b/source/adapters/cuda/ur_interface_loader.cpp @@ -454,6 +454,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnUSMHostAllocExp = urEnqueueUSMHostAllocExp; pDdiTable->pfnUSMFreeExp = urEnqueueUSMFreeExp; pDdiTable->pfnCommandBufferExp = urEnqueueCommandBufferExp; + pDdiTable->pfnKernelLaunchWithArgsExp = urEnqueueKernelLaunchWithArgsExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index dc0fac8050..54ea1ca91a 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -16,6 +16,7 @@ #include "logger/ur_logger.hpp" #include "memory.hpp" #include "queue.hpp" +#include "sampler.hpp" #include "ur_api.h" #include @@ -340,6 +341,60 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numArgs, + const ur_exp_kernel_arg_properties_t *pArgs, + uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + try { + for (uint32_t i = 0; i < numArgs; i++) { + switch (pArgs[i].type) { + case UR_EXP_KERNEL_ARG_TYPE_LOCAL: { + hKernel->setKernelLocalArg(pArgs[i].index, pArgs[i].size); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_VALUE: { + hKernel->setKernelArg(pArgs[i].index, pArgs[i].size, + pArgs[i].value.value); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_POINTER: { + // setKernelArg is expecting a pointer to our argument + hKernel->setKernelArg(pArgs[i].index, pArgs[i].size, + &pArgs[i].value.pointer); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: { + ur_kernel_arg_mem_obj_properties_t Props = { + UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES, nullptr, + pArgs[i].value.memObjTuple.flags}; + UR_CALL(urKernelSetArgMemObj(hKernel, pArgs[i].index, &Props, + pArgs[i].value.memObjTuple.hMem)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: { + uint32_t SamplerProps = pArgs[i].value.sampler->Props; + hKernel->setKernelArg(pArgs[i].index, sizeof(uint32_t), + (void *)&SamplerProps); + break; + } + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + } catch (ur_result_t Err) { + return Err; + } + return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, + pGlobalWorkSize, pLocalWorkSize, + numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent); +} + /// Enqueues a wait on the given queue for all events. /// See \ref enqueueEventWait /// diff --git a/source/adapters/hip/ur_interface_loader.cpp b/source/adapters/hip/ur_interface_loader.cpp index dfb4382cad..d8ec6bb3b5 100644 --- a/source/adapters/hip/ur_interface_loader.cpp +++ b/source/adapters/hip/ur_interface_loader.cpp @@ -447,6 +447,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; pDdiTable->pfnNativeCommandExp = urEnqueueNativeCommandExp; pDdiTable->pfnCommandBufferExp = urEnqueueCommandBufferExp; + pDdiTable->pfnKernelLaunchWithArgsExp = urEnqueueKernelLaunchWithArgsExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp index b6d3d2e64c..06d1366a11 100644 --- a/source/adapters/level_zero/kernel.cpp +++ b/source/adapters/level_zero/kernel.cpp @@ -56,6 +56,173 @@ ur_result_t urKernelGetSuggestedLocalWorkSize( return UR_RESULT_SUCCESS; } +inline ur_result_t KernelSetArgValueHelper( + ur_kernel_handle_t Kernel, + /// [in] argument index in range [0, num args - 1] + uint32_t ArgIndex, + /// [in] size of argument type + size_t ArgSize, + /// [in] argument value represented as matching arg type. + const void *PArgValue) { + // OpenCL: "the arg_value pointer can be NULL or point to a NULL value + // in which case a NULL value will be used as the value for the argument + // declared as a pointer to global or constant memory in the kernel" + // + // We don't know the type of the argument but it seems that the only time + // SYCL RT would send a pointer to NULL in 'arg_value' is when the argument + // is a NULL pointer. Treat a pointer to NULL in 'arg_value' as a NULL. + if (ArgSize == sizeof(void *) && PArgValue && + *(void **)(const_cast(PArgValue)) == nullptr) { + PArgValue = nullptr; + } + + if (ArgIndex > Kernel->ZeKernelProperties->numKernelArgs - 1) { + return UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX; + } + + ze_result_t ZeResult = ZE_RESULT_SUCCESS; + if (Kernel->ZeKernelMap.empty()) { + auto ZeKernel = Kernel->ZeKernel; + ZeResult = ZE_CALL_NOCHECK(zeKernelSetArgumentValue, + (ZeKernel, ArgIndex, ArgSize, PArgValue)); + } else { + for (auto It : Kernel->ZeKernelMap) { + auto ZeKernel = It.second; + ZeResult = ZE_CALL_NOCHECK(zeKernelSetArgumentValue, + (ZeKernel, ArgIndex, ArgSize, PArgValue)); + } + } + + if (ZeResult == ZE_RESULT_ERROR_INVALID_ARGUMENT) { + return UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE; + } + + return ze2urResult(ZeResult); +} + +inline ur_result_t KernelSetArgMemObjHelper( + /// [in] handle of the kernel object + ur_kernel_handle_t Kernel, + /// [in] argument index in range [0, num args - 1] + uint32_t ArgIndex, + /// [in][optional] pointer to Memory object properties. + const ur_kernel_arg_mem_obj_properties_t *Properties, + /// [in][optional] handle of Memory object. + ur_mem_handle_t ArgValue) { + // The ArgValue may be a NULL pointer in which case a NULL value is used for + // the kernel argument declared as a pointer to global or constant memory. + + if (ArgIndex > Kernel->ZeKernelProperties->numKernelArgs - 1) { + return UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX; + } + + ur_mem_handle_t_ *UrMem = ur_cast(ArgValue); + + ur_mem_handle_t_::access_mode_t UrAccessMode = ur_mem_handle_t_::read_write; + if (Properties) { + switch (Properties->memoryAccess) { + case UR_MEM_FLAG_READ_WRITE: + UrAccessMode = ur_mem_handle_t_::read_write; + break; + case UR_MEM_FLAG_WRITE_ONLY: + UrAccessMode = ur_mem_handle_t_::write_only; + break; + case UR_MEM_FLAG_READ_ONLY: + UrAccessMode = ur_mem_handle_t_::read_only; + break; + case 0: + break; + default: + return UR_RESULT_ERROR_INVALID_ARGUMENT; + } + } + auto Arg = UrMem ? UrMem : nullptr; + Kernel->PendingArguments.push_back( + {ArgIndex, sizeof(void *), Arg, UrAccessMode}); + + return UR_RESULT_SUCCESS; +} + +ur_result_t urEnqueueKernelLaunchWithArgsExp( + /// [in] handle of the queue object + ur_queue_handle_t Queue, + /// [in] handle of the kernel object + ur_kernel_handle_t Kernel, + /// [in] number of dimensions, from 1 to 3, to specify the global and + /// work-group work-items + uint32_t workDim, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the offset used to calculate the global ID of a work-item + const size_t *GlobalWorkOffset, + /// [in] pointer to an array of workDim unsigned values that specify the + /// number of global work-items in workDim that will execute the kernel + /// function + const size_t *GlobalWorkSize, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the number of local work-items forming a work-group that will + /// execute the kernel function. + /// If nullptr, the runtime implementation will choose the work-group size. + const size_t *LocalWorkSize, + /// [in] size of the event wait list + uint32_t NumArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. + const ur_exp_kernel_arg_properties_t *Args, + /// [in] size of the launch prop list + uint32_t NumPropsInLaunchPropList, + /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch + /// properties + const ur_kernel_launch_property_t *LaunchPropList, + uint32_t NumEventsInWaitList, + /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of + /// events that must be complete before the kernel execution. If + /// nullptr, the numEventsInWaitList must be 0, indicating that no wait + /// event. + const ur_event_handle_t *EventWaitList, + /// [in,out][optional] return an event object that identifies this + /// particular kernel execution instance. + ur_event_handle_t *OutEvent) { + { + std::scoped_lock Guard(Kernel->Mutex); + for (uint32_t i = 0; i < NumArgs; i++) { + switch (Args[i].type) { + case UR_EXP_KERNEL_ARG_TYPE_LOCAL: + UR_CALL(KernelSetArgValueHelper(Kernel, Args[i].index, Args[i].size, + nullptr)); + break; + case UR_EXP_KERNEL_ARG_TYPE_VALUE: + UR_CALL(KernelSetArgValueHelper(Kernel, Args[i].index, Args[i].size, + Args[i].value.value)); + break; + case UR_EXP_KERNEL_ARG_TYPE_POINTER: + UR_CALL(KernelSetArgValueHelper(Kernel, Args[i].index, Args[i].size, + &Args[i].value.pointer)); + break; + case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: { + ur_kernel_arg_mem_obj_properties_t Properties = { + UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES, nullptr, + Args[i].value.memObjTuple.flags}; + UR_CALL(KernelSetArgMemObjHelper(Kernel, Args[i].index, &Properties, + Args[i].value.memObjTuple.hMem)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: { + UR_CALL(KernelSetArgValueHelper(Kernel, Args[i].index, Args[i].size, + &Args[i].value.sampler->ZeSampler)); + break; + } + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + } + // Normalize so each dimension has at least one work item + return level_zero::urEnqueueKernelLaunch( + Queue, Kernel, workDim, GlobalWorkOffset, GlobalWorkSize, LocalWorkSize, + NumPropsInLaunchPropList, LaunchPropList, NumEventsInWaitList, + EventWaitList, OutEvent); +} + ur_result_t urEnqueueKernelLaunch( /// [in] handle of the queue object ur_queue_handle_t Queue, diff --git a/source/adapters/level_zero/ur_interface_loader.cpp b/source/adapters/level_zero/ur_interface_loader.cpp index 13d7274e7a..4276f97f5d 100644 --- a/source/adapters/level_zero/ur_interface_loader.cpp +++ b/source/adapters/level_zero/ur_interface_loader.cpp @@ -225,6 +225,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( return result; } + pDdiTable->pfnKernelLaunchWithArgsExp = + ur::level_zero::urEnqueueKernelLaunchWithArgsExp; pDdiTable->pfnUSMDeviceAllocExp = ur::level_zero::urEnqueueUSMDeviceAllocExp; pDdiTable->pfnUSMSharedAllocExp = ur::level_zero::urEnqueueUSMSharedAllocExp; pDdiTable->pfnUSMHostAllocExp = ur::level_zero::urEnqueueUSMHostAllocExp; diff --git a/source/adapters/level_zero/ur_interface_loader.hpp b/source/adapters/level_zero/ur_interface_loader.hpp index df8e93c1f7..b0d683e7a5 100644 --- a/source/adapters/level_zero/ur_interface_loader.hpp +++ b/source/adapters/level_zero/ur_interface_loader.hpp @@ -804,6 +804,15 @@ ur_result_t urUsmP2PPeerAccessGetInfoExp(ur_device_handle_t commandDevice, ur_exp_peer_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet); +ur_result_t urEnqueueKernelLaunchWithArgsExp( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numArgs, + const ur_exp_kernel_arg_properties_t *pArgs, + uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); ur_result_t urEnqueueEventsWaitWithBarrierExt( ur_queue_handle_t hQueue, const ur_exp_enqueue_ext_properties_t *pProperties, diff --git a/source/adapters/level_zero/v2/command_list_manager.cpp b/source/adapters/level_zero/v2/command_list_manager.cpp index 728db1360b..04e202265d 100644 --- a/source/adapters/level_zero/v2/command_list_manager.cpp +++ b/source/adapters/level_zero/v2/command_list_manager.cpp @@ -11,6 +11,7 @@ #include "command_list_manager.hpp" #include "../helpers/kernel_helpers.hpp" #include "../helpers/memory_helpers.hpp" +#include "../sampler.hpp" #include "../ur_interface_loader.hpp" #include "command_buffer.hpp" #include "context.hpp" @@ -975,3 +976,60 @@ ur_result_t ur_command_list_manager::releaseSubmittedKernels() { submittedKernels.clear(); return UR_RESULT_SUCCESS; } + +ur_result_t ur_command_list_manager::appendKernelLaunchWithArgsExp( + ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numArgs, + const ur_exp_kernel_arg_properties_t *pArgs, + uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t phEvent) { + TRACK_SCOPE_LATENCY( + "ur_queue_immediate_in_order_t::enqueueKernelLaunchWithArgsExp"); + { + std::scoped_lock guard(hKernel->Mutex); + for (uint32_t argIndex = 0; argIndex < numArgs; argIndex++) { + switch (pArgs[argIndex].type) { + case UR_EXP_KERNEL_ARG_TYPE_LOCAL: + UR_CALL(hKernel->setArgValue(pArgs[argIndex].index, + pArgs[argIndex].size, nullptr, nullptr)); + break; + case UR_EXP_KERNEL_ARG_TYPE_VALUE: + UR_CALL(hKernel->setArgValue(pArgs[argIndex].index, + pArgs[argIndex].size, nullptr, + pArgs[argIndex].value.value)); + break; + case UR_EXP_KERNEL_ARG_TYPE_POINTER: + UR_CALL(hKernel->setArgPointer(pArgs[argIndex].index, nullptr, + pArgs[argIndex].value.pointer)); + break; + case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: + // TODO: import helper for converting ur flags to internal equivalent + UR_CALL(hKernel->addPendingMemoryAllocation( + {pArgs[argIndex].value.memObjTuple.hMem, + ur_mem_buffer_t::device_access_mode_t::read_write, + pArgs[argIndex].index})); + break; + case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: { + UR_CALL( + hKernel->setArgValue(argIndex, sizeof(void *), nullptr, + &pArgs[argIndex].value.sampler->ZeSampler)); + break; + } + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + } + + UR_CALL(appendKernelLaunch(hKernel, workDim, pGlobalWorkOffset, + pGlobalWorkSize, pLocalWorkSize, + numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent)); + + recordSubmittedKernel(hKernel); + + return UR_RESULT_SUCCESS; +} diff --git a/source/adapters/level_zero/v2/command_list_manager.hpp b/source/adapters/level_zero/v2/command_list_manager.hpp index a7eafa8f9c..2a18f9b919 100644 --- a/source/adapters/level_zero/v2/command_list_manager.hpp +++ b/source/adapters/level_zero/v2/command_list_manager.hpp @@ -231,6 +231,16 @@ struct ur_command_list_manager { const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent); + ur_result_t appendKernelLaunchWithArgsExp( + ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numArgs, + const ur_exp_kernel_arg_properties_t *pArgs, + uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t phEvent); + private: ur_result_t appendGenericCommandListsExp( uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, diff --git a/source/adapters/level_zero/v2/kernel.cpp b/source/adapters/level_zero/v2/kernel.cpp index 173b51ffc4..f48a41154e 100644 --- a/source/adapters/level_zero/v2/kernel.cpp +++ b/source/adapters/level_zero/v2/kernel.cpp @@ -417,19 +417,17 @@ ur_result_t urKernelSetArgPointer( return exceptionToResult(std::current_exception()); } -static ur_mem_buffer_t::device_access_mode_t memAccessFromKernelProperties( - const ur_kernel_arg_mem_obj_properties_t *pProperties) { - if (pProperties) { - switch (pProperties->memoryAccess) { - case UR_MEM_FLAG_READ_WRITE: - return ur_mem_buffer_t::device_access_mode_t::read_write; - case UR_MEM_FLAG_WRITE_ONLY: - return ur_mem_buffer_t::device_access_mode_t::write_only; - case UR_MEM_FLAG_READ_ONLY: - return ur_mem_buffer_t::device_access_mode_t::read_only; - default: - return ur_mem_buffer_t::device_access_mode_t::read_write; - } +static ur_mem_buffer_t::device_access_mode_t +memAccessFromKernelProperties(const ur_mem_flags_t &Flags) { + switch (Flags) { + case UR_MEM_FLAG_READ_WRITE: + return ur_mem_buffer_t::device_access_mode_t::read_write; + case UR_MEM_FLAG_WRITE_ONLY: + return ur_mem_buffer_t::device_access_mode_t::write_only; + case UR_MEM_FLAG_READ_ONLY: + return ur_mem_buffer_t::device_access_mode_t::read_only; + default: + return ur_mem_buffer_t::device_access_mode_t::read_write; } return ur_mem_buffer_t::device_access_mode_t::read_write; } @@ -443,7 +441,10 @@ urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, std::scoped_lock guard(hKernel->Mutex); UR_CALL(hKernel->addPendingMemoryAllocation( - {hArgValue, memAccessFromKernelProperties(pProperties), argIndex})); + {hArgValue, + memAccessFromKernelProperties(pProperties ? pProperties->memoryAccess + : 0), + argIndex})); return UR_RESULT_SUCCESS; } catch (...) { diff --git a/source/adapters/level_zero/v2/queue_api.cpp b/source/adapters/level_zero/v2/queue_api.cpp index d043a68dca..660ed54406 100644 --- a/source/adapters/level_zero/v2/queue_api.cpp +++ b/source/adapters/level_zero/v2/queue_api.cpp @@ -440,6 +440,22 @@ ur_result_t urEnqueueTimestampRecordingExp( } catch (...) { return exceptionToResult(std::current_exception()); } +ur_result_t urEnqueueKernelLaunchWithArgsExp( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numArgs, + const ur_exp_kernel_arg_properties_t *pArgs, + uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) try { + return hQueue->get().enqueueKernelLaunchWithArgsExp( + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + numArgs, pArgs, numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent); +} catch (...) { + return exceptionToResult(std::current_exception()); +} ur_result_t urEnqueueEventsWaitWithBarrierExt( ur_queue_handle_t hQueue, const ur_exp_enqueue_ext_properties_t *pProperties, diff --git a/source/adapters/level_zero/v2/queue_api.hpp b/source/adapters/level_zero/v2/queue_api.hpp index b710f9d56b..47425c5772 100644 --- a/source/adapters/level_zero/v2/queue_api.hpp +++ b/source/adapters/level_zero/v2/queue_api.hpp @@ -162,6 +162,11 @@ struct ur_queue_t_ { virtual ur_result_t enqueueTimestampRecordingExp(bool, uint32_t, const ur_event_handle_t *, ur_event_handle_t *) = 0; + virtual ur_result_t enqueueKernelLaunchWithArgsExp( + ur_kernel_handle_t, uint32_t, const size_t *, const size_t *, + const size_t *, uint32_t, const ur_exp_kernel_arg_properties_t *, + uint32_t, const ur_kernel_launch_property_t *, uint32_t, + const ur_event_handle_t *, ur_event_handle_t *) = 0; virtual ur_result_t enqueueEventsWaitWithBarrierExt(const ur_exp_enqueue_ext_properties_t *, uint32_t, const ur_event_handle_t *, diff --git a/source/adapters/level_zero/v2/queue_create.cpp b/source/adapters/level_zero/v2/queue_create.cpp index 2ed41e8c53..a8270b192d 100644 --- a/source/adapters/level_zero/v2/queue_create.cpp +++ b/source/adapters/level_zero/v2/queue_create.cpp @@ -109,6 +109,19 @@ ur_result_t urQueueCreateWithNativeHandle( } } + ze_bool_t isImmediate = false; + ZE2UR_CALL( + zeCommandListIsImmediate, + (reinterpret_cast(hNativeQueue), &isImmediate)); + + if (!isImmediate) { + UR_LOG(ERR, "urQueueCreateWithNativeHandle: " + "Native handle is not an immediate command " + "list; only immediate command lists are " + "supported."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + auto commandListHandle = v2::raii::command_list_unique_handle( reinterpret_cast(hNativeQueue), [ownNativeHandle](ze_command_list_handle_t hZeCommandList) { diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp index 74b37d1b40..7b5f36da47 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp @@ -453,6 +453,22 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { createEventIfRequested(eventPool.get(), phEvent, this)); } + ur_result_t enqueueKernelLaunchWithArgsExp( + ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numArgs, + const ur_exp_kernel_arg_properties_t *pArgs, + uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->appendKernelLaunchWithArgsExp( + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + numArgs, pArgs, numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + ur::RefCount RefCount; }; diff --git a/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp b/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp index 07e8743154..5712375a84 100644 --- a/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp +++ b/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp @@ -505,6 +505,24 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { createEventIfRequested(eventPool.get(), phEvent, this)); } + ur_result_t enqueueKernelLaunchWithArgsExp( + ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numArgs, + const ur_exp_kernel_arg_properties_t *pArgs, + uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId] + .appendKernelLaunchWithArgsExp( + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, numArgs, pArgs, numPropsInLaunchPropList, + launchPropList, numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + ur::RefCount RefCount; }; diff --git a/source/adapters/mock/ur_mockddi.cpp b/source/adapters/mock/ur_mockddi.cpp index 39d67fff43..74cb1accfa 100644 --- a/source/adapters/mock/ur_mockddi.cpp +++ b/source/adapters/mock/ur_mockddi.cpp @@ -11915,6 +11915,107 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueKernelLaunchWithArgsExp +__urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] handle of the kernel object + ur_kernel_handle_t hKernel, + /// [in] number of dimensions, from 1 to 3, to specify the global and + /// work-group work-items + uint32_t workDim, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of workDim unsigned values that specify the + /// number of global work-items in workDim that will execute the kernel + /// function + const size_t *pGlobalWorkSize, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the number of local work-items forming a work-group that will + /// execute the kernel function. + /// If nullptr, the runtime implementation will choose the work-group size. + const size_t *pLocalWorkSize, + /// [in] Number of entries in pArgs + uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. + const ur_exp_kernel_arg_properties_t *pArgs, + /// [in] size of the launch prop list + uint32_t numPropsInLaunchPropList, + /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list + /// of launch properties + const ur_kernel_launch_property_t *launchPropList, + /// [in] size of the event wait list + uint32_t numEventsInWaitList, + /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of + /// events that must be complete before the kernel execution. + /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait + /// event. + const ur_event_handle_t *phEventWaitList, + /// [out][optional][alloc] return an event object that identifies this + /// particular kernel execution instance. If phEventWaitList and phEvent + /// are not NULL, phEvent must not refer to an element of the + /// phEventWaitList array. + ur_event_handle_t *phEvent) try { + ur_result_t result = UR_RESULT_SUCCESS; + + ur_enqueue_kernel_launch_with_args_exp_params_t params = { + &hQueue, + &hKernel, + &workDim, + &pGlobalWorkOffset, + &pGlobalWorkSize, + &pLocalWorkSize, + &numArgs, + &pArgs, + &numPropsInLaunchPropList, + &launchPropList, + &numEventsInWaitList, + &phEventWaitList, + &phEvent}; + + auto beforeCallback = reinterpret_cast( + mock::getCallbacks().get_before_callback( + "urEnqueueKernelLaunchWithArgsExp")); + if (beforeCallback) { + result = beforeCallback(¶ms); + if (result != UR_RESULT_SUCCESS) { + return result; + } + } + + auto replaceCallback = reinterpret_cast( + mock::getCallbacks().get_replace_callback( + "urEnqueueKernelLaunchWithArgsExp")); + if (replaceCallback) { + result = replaceCallback(¶ms); + } else { + + // optional output handle + if (phEvent) { + *phEvent = mock::createDummyHandle(); + } + result = UR_RESULT_SUCCESS; + } + + if (result != UR_RESULT_SUCCESS) { + return result; + } + + auto afterCallback = reinterpret_cast( + mock::getCallbacks().get_after_callback( + "urEnqueueKernelLaunchWithArgsExp")); + if (afterCallback) { + return afterCallback(¶ms); + } + + return result; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urEnqueueEventsWaitWithBarrierExt __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrierExt( @@ -12420,6 +12521,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( ur_result_t result = UR_RESULT_SUCCESS; + pDdiTable->pfnKernelLaunchWithArgsExp = + driver::urEnqueueKernelLaunchWithArgsExp; + pDdiTable->pfnUSMDeviceAllocExp = driver::urEnqueueUSMDeviceAllocExp; pDdiTable->pfnUSMSharedAllocExp = driver::urEnqueueUSMSharedAllocExp; diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp index 5fecdc5b8f..4c780031f8 100644 --- a/source/adapters/native_cpu/enqueue.cpp +++ b/source/adapters/native_cpu/enqueue.cpp @@ -621,3 +621,45 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueNativeCommandExp( const ur_event_handle_t *, ur_event_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numArgs, + const ur_exp_kernel_arg_properties_t *pArgs, + uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + for (uint32_t argIndex = 0; argIndex < numArgs; argIndex++) { + switch (pArgs[argIndex].type) { + case UR_EXP_KERNEL_ARG_TYPE_VALUE: + UR_CALL(hKernel->addArg(pArgs[argIndex].value.value, + pArgs[argIndex].index, pArgs[argIndex].size)); + break; + case UR_EXP_KERNEL_ARG_TYPE_POINTER: + UR_CALL( + hKernel->addPtrArg(const_cast(pArgs[argIndex].value.pointer), + pArgs[argIndex].index)); + break; + case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: { + auto MemObj = pArgs[argIndex].value.memObjTuple.hMem; + UR_CALL(hKernel->addMemObjArg(MemObj, pArgs[argIndex].index)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_LOCAL: + UR_CALL( + hKernel->addLocalArg(pArgs[argIndex].index, pArgs[argIndex].size)); + break; + case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + break; + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, + pGlobalWorkSize, pLocalWorkSize, + numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent); +} diff --git a/source/adapters/native_cpu/kernel.cpp b/source/adapters/native_cpu/kernel.cpp index ac11331357..f67c7653d0 100644 --- a/source/adapters/native_cpu/kernel.cpp +++ b/source/adapters/native_cpu/kernel.cpp @@ -61,21 +61,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue( // TODO: error checking UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(argSize, UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE); - hKernel->addArg(pArgValue, argIndex, argSize); - - return UR_RESULT_SUCCESS; + return hKernel->addArg(pArgValue, argIndex, argSize); } UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgLocal( ur_kernel_handle_t hKernel, uint32_t argIndex, size_t argSize, const ur_kernel_arg_local_properties_t * /*pProperties*/) { - // emplace a placeholder kernel arg, gets replaced with a pointer to the - // memory pool before enqueueing the kernel. - hKernel->addPtrArg(nullptr, argIndex); - hKernel->_localArgInfo.emplace_back(argIndex, argSize); - return UR_RESULT_SUCCESS; + return hKernel->addLocalArg(argIndex, argSize); } UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel, @@ -211,11 +204,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer( const void *pArgValue) { UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(pArgValue, UR_RESULT_ERROR_INVALID_NULL_POINTER); - - hKernel->addPtrArg(const_cast(pArgValue), argIndex); - return UR_RESULT_SUCCESS; + return hKernel->addPtrArg(const_cast(pArgValue), argIndex); } UR_APIEXPORT ur_result_t UR_APICALL @@ -242,16 +232,7 @@ urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - // Taken from ur/adapters/cuda/kernel.cpp - // zero-sized buffers are expected to be null. - if (hArgValue == nullptr) { - hKernel->addPtrArg(nullptr, argIndex); - return UR_RESULT_SUCCESS; - } - - hKernel->addArgReference(hArgValue); - hKernel->addPtrArg(hArgValue->_mem, argIndex); - return UR_RESULT_SUCCESS; + return hKernel->addMemObjArg(hArgValue, argIndex); } UR_APIEXPORT ur_result_t UR_APICALL urKernelSetSpecializationConstants( diff --git a/source/adapters/native_cpu/kernel.hpp b/source/adapters/native_cpu/kernel.hpp index 8daf23feb6..285b1c00d0 100644 --- a/source/adapters/native_cpu/kernel.hpp +++ b/source/adapters/native_cpu/kernel.hpp @@ -181,17 +181,44 @@ struct ur_kernel_handle_t_ : RefCounted { return Result; } - void addArg(const void *Ptr, size_t Index, size_t Size) { + inline ur_result_t addArg(const void *Ptr, size_t Index, size_t Size) { + UR_ASSERT(Size, UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE); Args.addArg(Index, Size, Ptr); + return UR_RESULT_SUCCESS; } - void addPtrArg(void *Ptr, size_t Index) { Args.addPtrArg(Index, Ptr); } + inline ur_result_t addPtrArg(void *Ptr, size_t Index) { + UR_ASSERT(Ptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); + Args.addPtrArg(Index, Ptr); + return UR_RESULT_SUCCESS; + } void addArgReference(ur_mem_handle_t Arg) { Arg->incrementReferenceCount(); ReferencedArgs.push_back(Arg); } + inline ur_result_t addMemObjArg(ur_mem_handle_t ArgValue, size_t Index) { + // Taken from ur/adapters/cuda/kernel.cpp + // zero-sized buffers are expected to be null. + if (ArgValue == nullptr) { + addPtrArg(nullptr, Index); + return UR_RESULT_SUCCESS; + } + + addArgReference(ArgValue); + addPtrArg(ArgValue->_mem, Index); + return UR_RESULT_SUCCESS; + } + + inline ur_result_t addLocalArg(size_t Index, size_t Size) { + // emplace a placeholder kernel arg, gets replaced with a pointer to the + // memory pool before enqueueing the kernel. + Args.addPtrArg(Index, nullptr); + _localArgInfo.emplace_back(Index, Size); + return UR_RESULT_SUCCESS; + } + private: void removeArgReferences() { for (auto arg : ReferencedArgs) diff --git a/source/adapters/native_cpu/ur_interface_loader.cpp b/source/adapters/native_cpu/ur_interface_loader.cpp index 3f6fe061b4..7bc2f999a0 100644 --- a/source/adapters/native_cpu/ur_interface_loader.cpp +++ b/source/adapters/native_cpu/ur_interface_loader.cpp @@ -431,6 +431,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; pDdiTable->pfnNativeCommandExp = urEnqueueNativeCommandExp; pDdiTable->pfnCommandBufferExp = urEnqueueCommandBufferExp; + pDdiTable->pfnKernelLaunchWithArgsExp = urEnqueueKernelLaunchWithArgsExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/offload/enqueue.cpp b/source/adapters/offload/enqueue.cpp index b1a1edac52..cd89280c5a 100644 --- a/source/adapters/offload/enqueue.cpp +++ b/source/adapters/offload/enqueue.cpp @@ -274,3 +274,40 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( return Result; } + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numArgs, + const ur_exp_kernel_arg_properties_t *pArgs, + uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + for (uint32_t i = 0; i < numArgs; i++) { + switch (pArgs[i].type) { + case UR_EXP_KERNEL_ARG_TYPE_POINTER: + hKernel->Args.addArg(pArgs[i].index, sizeof(pArgs[i].value.pointer), + &pArgs[i].value.pointer); + break; + case UR_EXP_KERNEL_ARG_TYPE_VALUE: + hKernel->Args.addArg(pArgs[i].index, pArgs[i].size, pArgs[i].value.value); + break; + case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: + hKernel->Args.addMemObjArg(pArgs[i].index, + pArgs[i].value.memObjTuple.hMem, + pArgs[i].value.memObjTuple.flags); + break; + case UR_EXP_KERNEL_ARG_TYPE_LOCAL: + case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + + return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, + pGlobalWorkSize, pLocalWorkSize, + numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent); +} diff --git a/source/adapters/offload/kernel.cpp b/source/adapters/offload/kernel.cpp index 58c4f6cf7f..02a7ee3a3f 100644 --- a/source/adapters/offload/kernel.cpp +++ b/source/adapters/offload/kernel.cpp @@ -9,7 +9,6 @@ //===----------------------------------------------------------------------===// #include "kernel.hpp" -#include "memory.hpp" #include "program.hpp" #include "ur2offload.hpp" #include @@ -88,19 +87,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, const ur_kernel_arg_mem_obj_properties_t *Properties, ur_mem_handle_t hArgValue) { - // Handle zero-sized buffers - if (hArgValue == nullptr) { - hKernel->Args.addArg(argIndex, 0, nullptr); - return UR_RESULT_SUCCESS; - } - ur_mem_flags_t MemAccess = Properties ? Properties->memoryAccess : static_cast(UR_MEM_FLAG_READ_WRITE); hKernel->Args.addMemObjArg(argIndex, hArgValue, MemAccess); - auto Ptr = std::get(hArgValue->Mem).Ptr; - hKernel->Args.addArg(argIndex, sizeof(void *), &Ptr); return UR_RESULT_SUCCESS; } diff --git a/source/adapters/offload/kernel.hpp b/source/adapters/offload/kernel.hpp index 83866b5974..a5e7f16f2b 100644 --- a/source/adapters/offload/kernel.hpp +++ b/source/adapters/offload/kernel.hpp @@ -18,6 +18,7 @@ #include #include "common.hpp" +#include "memory.hpp" struct ur_kernel_handle_t_ : RefCounted { @@ -56,7 +57,12 @@ struct ur_kernel_handle_t_ : RefCounted { } void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) { - assert(hMem && "Invalid mem handle"); + // Handle zero-sized buffers + if (hMem == nullptr) { + addArg(Index, 0, nullptr); + return; + } + // If a memobj is already set at this index, update the entry rather // than adding a duplicate one for (auto &Arg : MemObjArgs) { @@ -66,6 +72,9 @@ struct ur_kernel_handle_t_ : RefCounted { } } MemObjArgs.push_back(MemObjArg{hMem, Index, Flags}); + + auto Ptr = std::get(hMem->Mem).Ptr; + addArg(Index, sizeof(void *), &Ptr); } const args_ptr_t &getPointers() const noexcept { return Pointers; } diff --git a/source/adapters/offload/ur_interface_loader.cpp b/source/adapters/offload/ur_interface_loader.cpp index 02de9df99f..e8a4fa60f7 100644 --- a/source/adapters/offload/ur_interface_loader.cpp +++ b/source/adapters/offload/ur_interface_loader.cpp @@ -383,6 +383,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnTimestampRecordingExp = nullptr; pDdiTable->pfnNativeCommandExp = nullptr; + pDdiTable->pfnKernelLaunchWithArgsExp = urEnqueueKernelLaunchWithArgsExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/opencl/enqueue.cpp b/source/adapters/opencl/enqueue.cpp index 63b7b45426..1bd75b6b56 100644 --- a/source/adapters/opencl/enqueue.cpp +++ b/source/adapters/opencl/enqueue.cpp @@ -16,6 +16,10 @@ #include "memory.hpp" #include "program.hpp" #include "queue.hpp" +#include "sampler.hpp" + +#include +#include cl_map_flags convertURMapFlagsToCL(ur_map_flags_t URFlags) { cl_map_flags CLFlags = 0; @@ -501,3 +505,102 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueWriteHostPipe( return UR_RESULT_SUCCESS; } + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numArgs, + const ur_exp_kernel_arg_properties_t *pArgs, + uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + for (uint32_t propIndex = 0; propIndex < numPropsInLaunchPropList; + propIndex++) { + // Adapters that don't support cooperative kernels are currently expected + // to ignore COOPERATIVE launch properties. Ideally we should avoid passing + // these at the SYCL RT level instead, see + // https://github.com/intel/llvm/issues/18421 + if (launchPropList[propIndex].id == UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE || + launchPropList[propIndex].id == + UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE) { + continue; + } + return UR_RESULT_ERROR_INVALID_OPERATION; + } + + clSetKernelArgMemPointerINTEL_fn SetKernelArgMemPointerPtr = nullptr; + UR_RETURN_ON_FAILURE( + cl_ext::getExtFuncFromContext( + hQueue->Context->CLContext, + ur::cl::getAdapter()->fnCache.clSetKernelArgMemPointerINTELCache, + cl_ext::SetKernelArgMemPointerName, &SetKernelArgMemPointerPtr)); + + for (uint32_t i = 0; i < numArgs; i++) { + switch (pArgs[i].type) { + case UR_EXP_KERNEL_ARG_TYPE_LOCAL: + CL_RETURN_ON_FAILURE(clSetKernelArg(hKernel->CLKernel, + static_cast(pArgs[i].index), + pArgs[i].size, nullptr)); + break; + case UR_EXP_KERNEL_ARG_TYPE_VALUE: + CL_RETURN_ON_FAILURE(clSetKernelArg(hKernel->CLKernel, + static_cast(pArgs[i].index), + pArgs[i].size, pArgs[i].value.value)); + break; + case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: { + cl_mem mem = pArgs[i].value.memObjTuple.hMem + ? pArgs[i].value.memObjTuple.hMem->CLMemory + : nullptr; + CL_RETURN_ON_FAILURE(clSetKernelArg(hKernel->CLKernel, + static_cast(pArgs[i].index), + pArgs[i].size, &mem)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_POINTER: + CL_RETURN_ON_FAILURE(SetKernelArgMemPointerPtr( + hKernel->CLKernel, static_cast(pArgs[i].index), + pArgs[i].value.pointer)); + break; + case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: { + CL_RETURN_ON_FAILURE(clSetKernelArg( + hKernel->CLKernel, static_cast(pArgs[i].index), + pArgs[i].size, &pArgs[i].value.sampler->CLSampler)); + break; + } + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + + std::vector compiledLocalWorksize; + if (!pLocalWorkSize) { + cl_device_id device = nullptr; + CL_RETURN_ON_FAILURE(clGetCommandQueueInfo( + hQueue->CLQueue, CL_QUEUE_DEVICE, sizeof(device), &device, nullptr)); + // This query always returns size_t[3], if nothing was specified it + // returns all zeroes. + size_t queriedLocalWorkSize[3] = {0, 0, 0}; + CL_RETURN_ON_FAILURE(clGetKernelWorkGroupInfo( + hKernel->CLKernel, device, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, + sizeof(size_t[3]), queriedLocalWorkSize, nullptr)); + if (queriedLocalWorkSize[0] != 0) { + for (uint32_t i = 0; i < 3; i++) { + compiledLocalWorksize.push_back(queriedLocalWorkSize[i]); + } + } + } + + cl_event Event; + std::vector CLWaitEvents(numEventsInWaitList); + MapUREventsToCL(numEventsInWaitList, phEventWaitList, CLWaitEvents); + CL_RETURN_ON_FAILURE(clEnqueueNDRangeKernel( + hQueue->CLQueue, hKernel->CLKernel, workDim, pGlobalWorkOffset, + pGlobalWorkSize, + compiledLocalWorksize.empty() ? pLocalWorkSize + : compiledLocalWorksize.data(), + numEventsInWaitList, CLWaitEvents.data(), ifUrEvent(phEvent, Event))); + + UR_RETURN_ON_FAILURE(createUREvent(Event, hQueue->Context, hQueue, phEvent)); + return UR_RESULT_SUCCESS; +} diff --git a/source/adapters/opencl/ur_interface_loader.cpp b/source/adapters/opencl/ur_interface_loader.cpp index c619fa36b1..18cc6a7965 100644 --- a/source/adapters/opencl/ur_interface_loader.cpp +++ b/source/adapters/opencl/ur_interface_loader.cpp @@ -434,6 +434,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; pDdiTable->pfnNativeCommandExp = urEnqueueNativeCommandExp; pDdiTable->pfnCommandBufferExp = urEnqueueCommandBufferExp; + pDdiTable->pfnKernelLaunchWithArgsExp = urEnqueueKernelLaunchWithArgsExp; return UR_RESULT_SUCCESS; } diff --git a/source/common/stype_map_helpers.def b/source/common/stype_map_helpers.def index 7970582639..efd69e6ae4 100644 --- a/source/common/stype_map_helpers.def +++ b/source/common/stype_map_helpers.def @@ -168,3 +168,6 @@ struct stype_map template <> struct stype_map : stype_map_impl {}; +template <> +struct stype_map + : stype_map_impl {}; diff --git a/source/loader/layers/sanitizer/asan/asan_ddi.cpp b/source/loader/layers/sanitizer/asan/asan_ddi.cpp index 899ff6a850..a8d26f5498 100644 --- a/source/loader/layers/sanitizer/asan/asan_ddi.cpp +++ b/source/loader/layers/sanitizer/asan/asan_ddi.cpp @@ -1637,6 +1637,119 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetInfo( return pfnGetInfo(hDevice, propName, propSize, pPropValue, pPropSizeRet); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueKernelLaunch +ur_result_t urEnqueueKernelLaunchWithArgsExp( + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] handle of the kernel object + ur_kernel_handle_t hKernel, + /// [in] number of dimensions, from 1 to 3, to specify the global and + /// work-group work-items + uint32_t workDim, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of workDim unsigned values that specify the + /// number of global work-items in workDim that will execute the kernel + /// function + const size_t *pGlobalWorkSize, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the number of local work-items forming a work-group that will + /// execute the kernel function. + /// If nullptr, the runtime implementation will choose the work-group size. + const size_t *pLocalWorkSize, + /// [in] Number of entries in pArgs + uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. + const ur_exp_kernel_arg_properties_t *pArgs, + /// [in] size of the launch prop list + uint32_t numPropsInLaunchPropList, + /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch + /// properties + const ur_kernel_launch_property_t *launchPropList, + /// [in] size of the event wait list + uint32_t numEventsInWaitList, + /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of + /// events that must be complete before the kernel execution. If + /// nullptr, the numEventsInWaitList must be 0, indicating that no wait + /// event. + const ur_event_handle_t *phEventWaitList, + /// [out][optional] return an event object that identifies this + /// particular kernel execution instance. + ur_event_handle_t *phEvent) { + // This mutex is to prevent concurrent kernel launches across different queues + // as the DeviceASAN local/private shadow memory does not support concurrent + // kernel launches now. + std::scoped_lock Guard( + getAsanInterceptor()->KernelLaunchMutex); + + UR_LOG_L(getContext()->logger, DEBUG, + "==== urEnqueueKernelLaunchWithArgsExp"); + + // We need to set all the args now rather than letting LaunchWithArgs handle + // them. This is because some implementations of + // urKernelGetSuggestedLocalWorkSize, which is used in preLaunchKernel, rely + // on all the args being set. + for (uint32_t ArgPropIndex = 0; ArgPropIndex < numArgs; ArgPropIndex++) { + switch (pArgs[ArgPropIndex].type) { + case UR_EXP_KERNEL_ARG_TYPE_LOCAL: { + UR_CALL(ur_sanitizer_layer::asan::urKernelSetArgLocal( + hKernel, pArgs[ArgPropIndex].index, pArgs[ArgPropIndex].size, + nullptr)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_POINTER: { + UR_CALL(ur_sanitizer_layer::asan::urKernelSetArgPointer( + hKernel, pArgs[ArgPropIndex].index, nullptr, + pArgs[ArgPropIndex].value.pointer)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_VALUE: { + UR_CALL(ur_sanitizer_layer::asan::urKernelSetArgValue( + hKernel, pArgs[ArgPropIndex].index, pArgs[ArgPropIndex].size, nullptr, + pArgs[ArgPropIndex].value.value)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: { + ur_kernel_arg_mem_obj_properties_t Properties = { + UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES, nullptr, + pArgs[ArgPropIndex].value.memObjTuple.flags}; + UR_CALL(ur_sanitizer_layer::asan::urKernelSetArgMemObj( + hKernel, pArgs[ArgPropIndex].index, &Properties, + pArgs[ArgPropIndex].value.memObjTuple.hMem)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: { + auto pfnKernelSetArgSampler = + getContext()->urDdiTable.Kernel.pfnSetArgSampler; + UR_CALL(pfnKernelSetArgSampler(hKernel, pArgs[ArgPropIndex].index, + nullptr, + pArgs[ArgPropIndex].value.sampler)); + break; + } + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + + LaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), pGlobalWorkSize, + pLocalWorkSize, pGlobalWorkOffset, 3); + UR_CALL(LaunchInfo.Data.syncToDevice(hQueue)); + + UR_CALL(getAsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo)); + + UR_CALL(getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp( + hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, + LaunchInfo.LocalWorkSize.data(), 0, nullptr, numPropsInLaunchPropList, + launchPropList, numEventsInWaitList, phEventWaitList, phEvent)); + + UR_CALL(getAsanInterceptor()->postLaunchKernel(hKernel, hQueue, LaunchInfo)); + + return UR_RESULT_SUCCESS; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Exported function for filling application's Adapter table /// with current process' addresses @@ -1952,6 +2065,22 @@ __urdlllocal ur_result_t UR_APICALL urGetDeviceProcAddrTable( return result; } +/// @brief Exported function for filling application's ProgramExp table +/// with current process' addresses +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +ur_result_t urGetEnqueueExpProcAddrTable( + /// [in,out] pointer to table of DDI function pointers + ur_enqueue_exp_dditable_t *pDdiTable) { + ur_result_t result = UR_RESULT_SUCCESS; + + pDdiTable->pfnKernelLaunchWithArgsExp = + ur_sanitizer_layer::asan::urEnqueueKernelLaunchWithArgsExp; + + return result; +} template struct NotSupportedApi; @@ -2148,6 +2277,11 @@ ur_result_t initAsanDDITable(ur_dditable_t *dditable) { UR_API_VERSION_CURRENT, &dditable->VirtualMem); } + if (UR_RESULT_SUCCESS == result) { + result = ur_sanitizer_layer::asan::urGetEnqueueExpProcAddrTable( + &dditable->EnqueueExp); + } + if (result != UR_RESULT_SUCCESS) { UR_LOG_L(getContext()->logger, ERR, "Initialize ASAN DDI table failed: {}", result); diff --git a/source/loader/layers/sanitizer/msan/msan_ddi.cpp b/source/loader/layers/sanitizer/msan/msan_ddi.cpp index e2bbb166a5..13868606dc 100644 --- a/source/loader/layers/sanitizer/msan/msan_ddi.cpp +++ b/source/loader/layers/sanitizer/msan/msan_ddi.cpp @@ -1809,6 +1809,122 @@ ur_result_t urEnqueueUSMMemcpy2D( return UR_RESULT_SUCCESS; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueKernelLaunch +ur_result_t urEnqueueKernelLaunchWithArgsExp( + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] handle of the kernel object + ur_kernel_handle_t hKernel, + /// [in] number of dimensions, from 1 to 3, to specify the global and + /// work-group work-items + uint32_t workDim, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of workDim unsigned values that specify the + /// number of global work-items in workDim that will execute the kernel + /// function + const size_t *pGlobalWorkSize, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the number of local work-items forming a work-group that will + /// execute the kernel function. + /// If nullptr, the runtime implementation will choose the work-group size. + const size_t *pLocalWorkSize, + /// [in] Number of entries in pArgs + uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. + const ur_exp_kernel_arg_properties_t *pArgs, + /// [in] size of the launch prop list + uint32_t numPropsInLaunchPropList, + /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch + /// properties + const ur_kernel_launch_property_t *launchPropList, + /// [in] size of the event wait list + uint32_t numEventsInWaitList, + /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of + /// events that must be complete before the kernel execution. If + /// nullptr, the numEventsInWaitList must be 0, indicating that no wait + /// event. + const ur_event_handle_t *phEventWaitList, + /// [out][optional] return an event object that identifies this + /// particular kernel execution instance. + ur_event_handle_t *phEvent) { + // This mutex is to prevent concurrent kernel launches across different queues + // as the DeviceMSAN local/private shadow memory does not support concurrent + // kernel launches now. + std::scoped_lock Guard( + getMsanInterceptor()->KernelLaunchMutex); + + UR_LOG_L(getContext()->logger, DEBUG, + "==== urEnqueueKernelLaunchWithArgsExp"); + + // We need to set all the args now rather than letting LaunchWithArgs handle + // them. This is because some implementations of + // urKernelGetSuggestedLocalWorkSize, which is used in preLaunchKernel, rely + // on all the args being set. + for (uint32_t ArgPropIndex = 0; ArgPropIndex < numArgs; ArgPropIndex++) { + switch (pArgs[ArgPropIndex].type) { + case UR_EXP_KERNEL_ARG_TYPE_LOCAL: { + UR_CALL(ur_sanitizer_layer::msan::urKernelSetArgLocal( + hKernel, pArgs[ArgPropIndex].index, pArgs[ArgPropIndex].size, + nullptr)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_POINTER: { + auto pfnKernelSetArgPointer = + getContext()->urDdiTable.Kernel.pfnSetArgPointer; + UR_CALL(pfnKernelSetArgPointer(hKernel, pArgs[ArgPropIndex].index, + nullptr, + pArgs[ArgPropIndex].value.pointer)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_VALUE: { + UR_CALL(ur_sanitizer_layer::msan::urKernelSetArgValue( + hKernel, pArgs[ArgPropIndex].index, pArgs[ArgPropIndex].size, nullptr, + pArgs[ArgPropIndex].value.value)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: { + ur_kernel_arg_mem_obj_properties_t Properties = { + UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES, nullptr, + pArgs[ArgPropIndex].value.memObjTuple.flags}; + UR_CALL(ur_sanitizer_layer::msan::urKernelSetArgMemObj( + hKernel, pArgs[ArgPropIndex].index, &Properties, + pArgs[ArgPropIndex].value.memObjTuple.hMem)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: { + auto pfnKernelSetArgSampler = + getContext()->urDdiTable.Kernel.pfnSetArgSampler; + UR_CALL(pfnKernelSetArgSampler(hKernel, pArgs[ArgPropIndex].index, + nullptr, + pArgs[ArgPropIndex].value.sampler)); + break; + } + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + + USMLaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), + pGlobalWorkSize, pLocalWorkSize, pGlobalWorkOffset, + 3); + UR_CALL(LaunchInfo.initialize()); + + UR_CALL(getMsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo)); + + UR_CALL(getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp( + hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, + LaunchInfo.LocalWorkSize.data(), 0, nullptr, numPropsInLaunchPropList, + launchPropList, numEventsInWaitList, phEventWaitList, phEvent)); + + UR_CALL(getMsanInterceptor()->postLaunchKernel(hKernel, hQueue, LaunchInfo)); + + return UR_RESULT_SUCCESS; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Exported function for filling application's Adapter table /// with current process' addresses @@ -1987,6 +2103,22 @@ ur_result_t urGetUSMProcAddrTable( return result; } +/// @brief Exported function for filling application's ProgramExp table +/// with current process' addresses +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +ur_result_t urGetEnqueueExpProcAddrTable( + /// [in,out] pointer to table of DDI function pointers + ur_enqueue_exp_dditable_t *pDdiTable) { + ur_result_t result = UR_RESULT_SUCCESS; + + pDdiTable->pfnKernelLaunchWithArgsExp = + ur_sanitizer_layer::msan::urEnqueueKernelLaunchWithArgsExp; + + return result; +} ur_result_t urCheckVersion(ur_api_version_t version) { if (UR_MAJOR_VERSION(ur_sanitizer_layer::getContext()->version) != @@ -2052,6 +2184,11 @@ ur_result_t initMsanDDITable(ur_dditable_t *dditable) { result = ur_sanitizer_layer::msan::urGetUSMProcAddrTable(&dditable->USM); } + if (UR_RESULT_SUCCESS == result) { + result = ur_sanitizer_layer::msan::urGetEnqueueExpProcAddrTable( + &dditable->EnqueueExp); + } + if (result != UR_RESULT_SUCCESS) { UR_LOG_L(getContext()->logger, ERR, "Initialize MSAN DDI table failed: {}", result); diff --git a/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp b/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp index 61849ac0b3..7934dbf596 100644 --- a/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp +++ b/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp @@ -1337,6 +1337,120 @@ ur_result_t urEnqueueKernelLaunch( return UR_RESULT_SUCCESS; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueKernelLaunch +ur_result_t urEnqueueKernelLaunchWithArgsExp( + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] handle of the kernel object + ur_kernel_handle_t hKernel, + /// [in] number of dimensions, from 1 to 3, to specify the global and + /// work-group work-items + uint32_t workDim, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of workDim unsigned values that specify the + /// number of global work-items in workDim that will execute the kernel + /// function + const size_t *pGlobalWorkSize, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the number of local work-items forming a work-group that will + /// execute the kernel function. + /// If nullptr, the runtime implementation will choose the work-group size. + const size_t *pLocalWorkSize, + /// [in] Number of entries in pArgs + uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. + const ur_exp_kernel_arg_properties_t *pArgs, + /// [in] size of the launch prop list + uint32_t numPropsInLaunchPropList, + /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch + /// properties + const ur_kernel_launch_property_t *launchPropList, + /// [in] size of the event wait list + uint32_t numEventsInWaitList, + /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of + /// events that must be complete before the kernel execution. If + /// nullptr, the numEventsInWaitList must be 0, indicating that no wait + /// event. + const ur_event_handle_t *phEventWaitList, + /// [out][optional] return an event object that identifies this + /// particular kernel execution instance. + ur_event_handle_t *phEvent) { + // This mutex is to prevent concurrent kernel launches across different queues + // as the DeviceTSAN local shadow memory does not support concurrent + // kernel launches now. + std::scoped_lock Guard( + getTsanInterceptor()->KernelLaunchMutex); + + UR_LOG_L(getContext()->logger, DEBUG, + "==== urEnqueueKernelLaunchWithArgsExp"); + + // We need to set all the args now rather than letting LaunchWithArgs handle + // them. This is because some implementations of + // urKernelGetSuggestedLocalWorkSize, which is used in preLaunchKernel, rely + // on all the args being set. + for (uint32_t ArgPropIndex = 0; ArgPropIndex < numArgs; ArgPropIndex++) { + switch (pArgs[ArgPropIndex].type) { + case UR_EXP_KERNEL_ARG_TYPE_LOCAL: { + UR_CALL(ur_sanitizer_layer::tsan::urKernelSetArgLocal( + hKernel, pArgs[ArgPropIndex].index, pArgs[ArgPropIndex].size, + nullptr)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_POINTER: { + auto pfnKernelSetArgPointer = + getContext()->urDdiTable.Kernel.pfnSetArgPointer; + UR_CALL(pfnKernelSetArgPointer(hKernel, pArgs[ArgPropIndex].index, + nullptr, + pArgs[ArgPropIndex].value.pointer)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_VALUE: { + UR_CALL(ur_sanitizer_layer::tsan::urKernelSetArgValue( + hKernel, pArgs[ArgPropIndex].index, pArgs[ArgPropIndex].size, nullptr, + pArgs[ArgPropIndex].value.value)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: { + ur_kernel_arg_mem_obj_properties_t Properties = { + UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES, nullptr, + pArgs[ArgPropIndex].value.memObjTuple.flags}; + UR_CALL(ur_sanitizer_layer::tsan::urKernelSetArgMemObj( + hKernel, pArgs[ArgPropIndex].index, &Properties, + pArgs[ArgPropIndex].value.memObjTuple.hMem)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: { + auto pfnKernelSetArgSampler = + getContext()->urDdiTable.Kernel.pfnSetArgSampler; + UR_CALL(pfnKernelSetArgSampler(hKernel, pArgs[ArgPropIndex].index, + nullptr, + pArgs[ArgPropIndex].value.sampler)); + break; + } + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + + LaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), pGlobalWorkSize, + pLocalWorkSize, pGlobalWorkOffset, 3); + + UR_CALL(getTsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo)); + + UR_CALL(getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp( + hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, 0, nullptr, numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent)); + + UR_CALL(getTsanInterceptor()->postLaunchKernel(hKernel, hQueue, LaunchInfo)); + + return UR_RESULT_SUCCESS; +} + ur_result_t urCheckVersion(ur_api_version_t version) { if (UR_MAJOR_VERSION(ur_sanitizer_layer::getContext()->version) != UR_MAJOR_VERSION(version) || @@ -1547,6 +1661,22 @@ __urdlllocal ur_result_t UR_APICALL urGetEnqueueProcAddrTable( return UR_RESULT_SUCCESS; } +/// @brief Exported function for filling application's ProgramExp table +/// with current process' addresses +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +ur_result_t urGetEnqueueExpProcAddrTable( + /// [in,out] pointer to table of DDI function pointers + ur_enqueue_exp_dditable_t *pDdiTable) { + ur_result_t result = UR_RESULT_SUCCESS; + + pDdiTable->pfnKernelLaunchWithArgsExp = + ur_sanitizer_layer::tsan::urEnqueueKernelLaunchWithArgsExp; + + return result; +} } // namespace tsan ur_result_t initTsanDDITable(ur_dditable_t *dditable) { @@ -1596,6 +1726,11 @@ ur_result_t initTsanDDITable(ur_dditable_t *dditable) { ur_sanitizer_layer::tsan::urGetEnqueueProcAddrTable(&dditable->Enqueue); } + if (UR_RESULT_SUCCESS == result) { + result = ur_sanitizer_layer::tsan::urGetEnqueueExpProcAddrTable( + &dditable->EnqueueExp); + } + if (result != UR_RESULT_SUCCESS) { UR_LOG_L(getContext()->logger, ERR, "Initialize TSAN DDI table failed: {}", result); diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index e96e1cbffd..1cac607be8 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -10093,6 +10093,98 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueKernelLaunchWithArgsExp +__urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] handle of the kernel object + ur_kernel_handle_t hKernel, + /// [in] number of dimensions, from 1 to 3, to specify the global and + /// work-group work-items + uint32_t workDim, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of workDim unsigned values that specify the + /// number of global work-items in workDim that will execute the kernel + /// function + const size_t *pGlobalWorkSize, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the number of local work-items forming a work-group that will + /// execute the kernel function. + /// If nullptr, the runtime implementation will choose the work-group size. + const size_t *pLocalWorkSize, + /// [in] Number of entries in pArgs + uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. + const ur_exp_kernel_arg_properties_t *pArgs, + /// [in] size of the launch prop list + uint32_t numPropsInLaunchPropList, + /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list + /// of launch properties + const ur_kernel_launch_property_t *launchPropList, + /// [in] size of the event wait list + uint32_t numEventsInWaitList, + /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of + /// events that must be complete before the kernel execution. + /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait + /// event. + const ur_event_handle_t *phEventWaitList, + /// [out][optional][alloc] return an event object that identifies this + /// particular kernel execution instance. If phEventWaitList and phEvent + /// are not NULL, phEvent must not refer to an element of the + /// phEventWaitList array. + ur_event_handle_t *phEvent) { + auto pfnKernelLaunchWithArgsExp = + getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp; + + if (nullptr == pfnKernelLaunchWithArgsExp) + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + + ur_enqueue_kernel_launch_with_args_exp_params_t params = { + &hQueue, + &hKernel, + &workDim, + &pGlobalWorkOffset, + &pGlobalWorkSize, + &pLocalWorkSize, + &numArgs, + &pArgs, + &numPropsInLaunchPropList, + &launchPropList, + &numEventsInWaitList, + &phEventWaitList, + &phEvent}; + uint64_t instance = getContext()->notify_begin( + UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP, + "urEnqueueKernelLaunchWithArgsExp", ¶ms); + + auto &logger = getContext()->logger; + UR_LOG_L(logger, INFO, " ---> urEnqueueKernelLaunchWithArgsExp\n"); + + ur_result_t result = pfnKernelLaunchWithArgsExp( + hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, numArgs, pArgs, numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent); + + getContext()->notify_end(UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP, + "urEnqueueKernelLaunchWithArgsExp", ¶ms, &result, + instance); + + if (logger.getLevel() <= UR_LOGGER_LEVEL_INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP, ¶ms); + UR_LOG_L(logger, INFO, + " <--- urEnqueueKernelLaunchWithArgsExp({}) -> {};\n", + args_str.str(), result); + } + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urEnqueueEventsWaitWithBarrierExt __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrierExt( @@ -10703,6 +10795,10 @@ __urdlllocal ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( ur_result_t result = UR_RESULT_SUCCESS; + dditable.pfnKernelLaunchWithArgsExp = pDdiTable->pfnKernelLaunchWithArgsExp; + pDdiTable->pfnKernelLaunchWithArgsExp = + ur_tracing_layer::urEnqueueKernelLaunchWithArgsExp; + dditable.pfnUSMDeviceAllocExp = pDdiTable->pfnUSMDeviceAllocExp; pDdiTable->pfnUSMDeviceAllocExp = ur_tracing_layer::urEnqueueUSMDeviceAllocExp; diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index 6f33aaa856..9dd572ecd3 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -10867,6 +10867,122 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueKernelLaunchWithArgsExp +__urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] handle of the kernel object + ur_kernel_handle_t hKernel, + /// [in] number of dimensions, from 1 to 3, to specify the global and + /// work-group work-items + uint32_t workDim, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of workDim unsigned values that specify the + /// number of global work-items in workDim that will execute the kernel + /// function + const size_t *pGlobalWorkSize, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the number of local work-items forming a work-group that will + /// execute the kernel function. + /// If nullptr, the runtime implementation will choose the work-group size. + const size_t *pLocalWorkSize, + /// [in] Number of entries in pArgs + uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. + const ur_exp_kernel_arg_properties_t *pArgs, + /// [in] size of the launch prop list + uint32_t numPropsInLaunchPropList, + /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list + /// of launch properties + const ur_kernel_launch_property_t *launchPropList, + /// [in] size of the event wait list + uint32_t numEventsInWaitList, + /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of + /// events that must be complete before the kernel execution. + /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait + /// event. + const ur_event_handle_t *phEventWaitList, + /// [out][optional][alloc] return an event object that identifies this + /// particular kernel execution instance. If phEventWaitList and phEvent + /// are not NULL, phEvent must not refer to an element of the + /// phEventWaitList array. + ur_event_handle_t *phEvent) { + auto pfnKernelLaunchWithArgsExp = + getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp; + + if (nullptr == pfnKernelLaunchWithArgsExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + if (getContext()->enableParameterValidation) { + if (NULL == pGlobalWorkSize) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + + if (launchPropList == NULL && numPropsInLaunchPropList > 0) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + + if (pArgs == NULL && numArgs > 0) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + + if (NULL == hQueue) + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + + if (NULL == hKernel) + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + + if (NULL != pArgs && UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type) + return UR_RESULT_ERROR_INVALID_ENUMERATION; + + if (phEventWaitList == NULL && numEventsInWaitList > 0) + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + + if (phEventWaitList != NULL && numEventsInWaitList == 0) + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + + if (pGlobalWorkSize[0] == 0 || pGlobalWorkSize[1] == 0 || + pGlobalWorkSize[2] == 0) + return UR_RESULT_ERROR_INVALID_WORK_DIMENSION; + + if (pLocalWorkSize && (pLocalWorkSize[0] == 0 || pLocalWorkSize[1] == 0 || + pLocalWorkSize[2] == 0)) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + + if (phEventWaitList != NULL && numEventsInWaitList > 0) { + for (uint32_t i = 0; i < numEventsInWaitList; ++i) { + if (phEventWaitList[i] == NULL) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + } + } + } + + if (getContext()->enableLifetimeValidation && + !getContext()->refCountContext->isReferenceValid(hQueue)) { + URLOG_CTX_INVALID_REFERENCE(hQueue); + } + + if (getContext()->enableLifetimeValidation && + !getContext()->refCountContext->isReferenceValid(hKernel)) { + URLOG_CTX_INVALID_REFERENCE(hKernel); + } + + ur_result_t result = pfnKernelLaunchWithArgsExp( + hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, numArgs, pArgs, numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent); + + if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS && + phEvent) { + getContext()->refCountContext->createRefCount(*phEvent); + } + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urEnqueueEventsWaitWithBarrierExt __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrierExt( @@ -11502,6 +11618,10 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( ur_result_t result = UR_RESULT_SUCCESS; + dditable.pfnKernelLaunchWithArgsExp = pDdiTable->pfnKernelLaunchWithArgsExp; + pDdiTable->pfnKernelLaunchWithArgsExp = + ur_validation_layer::urEnqueueKernelLaunchWithArgsExp; + dditable.pfnUSMDeviceAllocExp = pDdiTable->pfnUSMDeviceAllocExp; pDdiTable->pfnUSMDeviceAllocExp = ur_validation_layer::urEnqueueUSMDeviceAllocExp; diff --git a/source/loader/loader.def.in b/source/loader/loader.def.in index 3ad4714931..516f465840 100644 --- a/source/loader/loader.def.in +++ b/source/loader/loader.def.in @@ -76,6 +76,7 @@ EXPORTS urEnqueueEventsWaitWithBarrier urEnqueueEventsWaitWithBarrierExt urEnqueueKernelLaunch + urEnqueueKernelLaunchWithArgsExp urEnqueueMemBufferCopy urEnqueueMemBufferCopyRect urEnqueueMemBufferFill @@ -288,6 +289,7 @@ EXPORTS urPrintEnqueueEventsWaitWithBarrierExtParams urPrintEnqueueEventsWaitWithBarrierParams urPrintEnqueueKernelLaunchParams + urPrintEnqueueKernelLaunchWithArgsExpParams urPrintEnqueueMemBufferCopyParams urPrintEnqueueMemBufferCopyRectParams urPrintEnqueueMemBufferFillParams @@ -348,6 +350,9 @@ EXPORTS urPrintExpImageCopyFlags urPrintExpImageCopyRegion urPrintExpImageMemType + urPrintExpKernelArgMemObjTuple + urPrintExpKernelArgProperties + urPrintExpKernelArgType urPrintExpPeerInfo urPrintExpSamplerAddrModes urPrintExpSamplerCubemapFilterMode diff --git a/source/loader/loader.map.in b/source/loader/loader.map.in index fde803f9aa..a0e5b81244 100644 --- a/source/loader/loader.map.in +++ b/source/loader/loader.map.in @@ -76,6 +76,7 @@ urEnqueueEventsWaitWithBarrier; urEnqueueEventsWaitWithBarrierExt; urEnqueueKernelLaunch; + urEnqueueKernelLaunchWithArgsExp; urEnqueueMemBufferCopy; urEnqueueMemBufferCopyRect; urEnqueueMemBufferFill; @@ -288,6 +289,7 @@ urPrintEnqueueEventsWaitWithBarrierExtParams; urPrintEnqueueEventsWaitWithBarrierParams; urPrintEnqueueKernelLaunchParams; + urPrintEnqueueKernelLaunchWithArgsExpParams; urPrintEnqueueMemBufferCopyParams; urPrintEnqueueMemBufferCopyRectParams; urPrintEnqueueMemBufferFillParams; @@ -348,6 +350,9 @@ urPrintExpImageCopyFlags; urPrintExpImageCopyRegion; urPrintExpImageMemType; + urPrintExpKernelArgMemObjTuple; + urPrintExpKernelArgProperties; + urPrintExpKernelArgType; urPrintExpPeerInfo; urPrintExpSamplerAddrModes; urPrintExpSamplerCubemapFilterMode; diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index 5c2c3a41af..0a09a3072c 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -5734,6 +5734,65 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( pPropValue, pPropSizeRet); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueKernelLaunchWithArgsExp +__urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] handle of the kernel object + ur_kernel_handle_t hKernel, + /// [in] number of dimensions, from 1 to 3, to specify the global and + /// work-group work-items + uint32_t workDim, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of workDim unsigned values that specify the + /// number of global work-items in workDim that will execute the kernel + /// function + const size_t *pGlobalWorkSize, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the number of local work-items forming a work-group that will + /// execute the kernel function. + /// If nullptr, the runtime implementation will choose the work-group size. + const size_t *pLocalWorkSize, + /// [in] Number of entries in pArgs + uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. + const ur_exp_kernel_arg_properties_t *pArgs, + /// [in] size of the launch prop list + uint32_t numPropsInLaunchPropList, + /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list + /// of launch properties + const ur_kernel_launch_property_t *launchPropList, + /// [in] size of the event wait list + uint32_t numEventsInWaitList, + /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of + /// events that must be complete before the kernel execution. + /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait + /// event. + const ur_event_handle_t *phEventWaitList, + /// [out][optional][alloc] return an event object that identifies this + /// particular kernel execution instance. If phEventWaitList and phEvent + /// are not NULL, phEvent must not refer to an element of the + /// phEventWaitList array. + ur_event_handle_t *phEvent) { + + auto *dditable = *reinterpret_cast(hQueue); + + auto *pfnKernelLaunchWithArgsExp = + dditable->EnqueueExp.pfnKernelLaunchWithArgsExp; + if (nullptr == pfnKernelLaunchWithArgsExp) + return UR_RESULT_ERROR_UNINITIALIZED; + + // forward to device-platform + return pfnKernelLaunchWithArgsExp( + hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, numArgs, pArgs, numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urEnqueueEventsWaitWithBarrierExt __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrierExt( @@ -6244,6 +6303,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( if (ur_loader::getContext()->platforms.size() != 1 || ur_loader::getContext()->forceIntercept) { // return pointers to loader's DDIs + pDdiTable->pfnKernelLaunchWithArgsExp = + ur_loader::urEnqueueKernelLaunchWithArgsExp; pDdiTable->pfnUSMDeviceAllocExp = ur_loader::urEnqueueUSMDeviceAllocExp; pDdiTable->pfnUSMSharedAllocExp = ur_loader::urEnqueueUSMSharedAllocExp; pDdiTable->pfnUSMHostAllocExp = ur_loader::urEnqueueUSMHostAllocExp; diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index a31b639ae5..59edc89920 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -10560,6 +10560,104 @@ ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Enqueue a command to execute a kernel +/// +/// @remarks +/// _Analogues_ +/// - **clEnqueueNDRangeKernel** +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hQueue` +/// + `NULL == hKernel` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pGlobalWorkSize` +/// + `launchPropList == NULL && numPropsInLaunchPropList > 0` +/// + `pArgs == NULL && numArgs > 0` +/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION +/// + `NULL != pArgs && ::UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type` +/// - ::UR_RESULT_ERROR_INVALID_QUEUE +/// - ::UR_RESULT_ERROR_INVALID_KERNEL +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS +/// + An event in `phEventWaitList` has ::UR_EVENT_STATUS_ERROR. +/// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION +/// + `pGlobalWorkSize[0] == 0 || pGlobalWorkSize[1] == 0 || +/// pGlobalWorkSize[2] == 0` +/// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE +/// + `pLocalWorkSize && (pLocalWorkSize[0] == 0 || pLocalWorkSize[1] == +/// 0 || pLocalWorkSize[2] == 0)` +/// - ::UR_RESULT_ERROR_INVALID_VALUE +/// - ::UR_RESULT_ERROR_INVALID_KERNEL_ARGS - "The kernel argument values +/// have not been specified." +/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +/// - ::UR_RESULT_ERROR_INVALID_OPERATION +/// + If any property in `launchPropList` isn't supported by the device. +ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] handle of the kernel object + ur_kernel_handle_t hKernel, + /// [in] number of dimensions, from 1 to 3, to specify the global and + /// work-group work-items + uint32_t workDim, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of workDim unsigned values that specify the + /// number of global work-items in workDim that will execute the kernel + /// function + const size_t *pGlobalWorkSize, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the number of local work-items forming a work-group that will + /// execute the kernel function. + /// If nullptr, the runtime implementation will choose the work-group size. + const size_t *pLocalWorkSize, + /// [in] Number of entries in pArgs + uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. + const ur_exp_kernel_arg_properties_t *pArgs, + /// [in] size of the launch prop list + uint32_t numPropsInLaunchPropList, + /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list + /// of launch properties + const ur_kernel_launch_property_t *launchPropList, + /// [in] size of the event wait list + uint32_t numEventsInWaitList, + /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of + /// events that must be complete before the kernel execution. + /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait + /// event. + const ur_event_handle_t *phEventWaitList, + /// [out][optional][alloc] return an event object that identifies this + /// particular kernel execution instance. If phEventWaitList and phEvent + /// are not NULL, phEvent must not refer to an element of the + /// phEventWaitList array. + ur_event_handle_t *phEvent) try { + auto pfnKernelLaunchWithArgsExp = + ur_lib::getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp; + if (nullptr == pfnKernelLaunchWithArgsExp) + return UR_RESULT_ERROR_UNINITIALIZED; + + return pfnKernelLaunchWithArgsExp( + hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, numArgs, pArgs, numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent); +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Enqueue a barrier command which waits a list of events to complete /// before it completes, with optional extended properties diff --git a/source/loader/ur_print.cpp b/source/loader/ur_print.cpp index f3d5c96e37..0fee8b9ee2 100644 --- a/source/loader/ur_print.cpp +++ b/source/loader/ur_print.cpp @@ -1138,6 +1138,30 @@ ur_result_t urPrintExpPeerInfo(enum ur_exp_peer_info_t value, char *buffer, return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t urPrintExpKernelArgType(enum ur_exp_kernel_arg_type_t value, + char *buffer, const size_t buff_size, + size_t *out_size) { + std::stringstream ss; + ss << value; + return str_copy(&ss, buffer, buff_size, out_size); +} + +ur_result_t urPrintExpKernelArgMemObjTuple( + const struct ur_exp_kernel_arg_mem_obj_tuple_t params, char *buffer, + const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + +ur_result_t urPrintExpKernelArgProperties( + const struct ur_exp_kernel_arg_properties_t params, char *buffer, + const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintExpEnqueueExtFlags(enum ur_exp_enqueue_ext_flag_t value, char *buffer, const size_t buff_size, size_t *out_size) { @@ -1869,6 +1893,14 @@ ur_result_t urPrintEnqueueWriteHostPipeParams( return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t urPrintEnqueueKernelLaunchWithArgsExpParams( + const struct ur_enqueue_kernel_launch_with_args_exp_params_t *params, + char *buffer, const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintEnqueueEventsWaitWithBarrierExtParams( const struct ur_enqueue_events_wait_with_barrier_ext_params_t *params, char *buffer, const size_t buff_size, size_t *out_size) { diff --git a/source/ur_api.cpp b/source/ur_api.cpp index da84b7f50f..771e27c3b8 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -9190,6 +9190,95 @@ ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Enqueue a command to execute a kernel +/// +/// @remarks +/// _Analogues_ +/// - **clEnqueueNDRangeKernel** +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hQueue` +/// + `NULL == hKernel` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pGlobalWorkSize` +/// + `launchPropList == NULL && numPropsInLaunchPropList > 0` +/// + `pArgs == NULL && numArgs > 0` +/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION +/// + `NULL != pArgs && ::UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type` +/// - ::UR_RESULT_ERROR_INVALID_QUEUE +/// - ::UR_RESULT_ERROR_INVALID_KERNEL +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS +/// + An event in `phEventWaitList` has ::UR_EVENT_STATUS_ERROR. +/// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION +/// + `pGlobalWorkSize[0] == 0 || pGlobalWorkSize[1] == 0 || +/// pGlobalWorkSize[2] == 0` +/// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE +/// + `pLocalWorkSize && (pLocalWorkSize[0] == 0 || pLocalWorkSize[1] == +/// 0 || pLocalWorkSize[2] == 0)` +/// - ::UR_RESULT_ERROR_INVALID_VALUE +/// - ::UR_RESULT_ERROR_INVALID_KERNEL_ARGS - "The kernel argument values +/// have not been specified." +/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +/// - ::UR_RESULT_ERROR_INVALID_OPERATION +/// + If any property in `launchPropList` isn't supported by the device. +ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] handle of the kernel object + ur_kernel_handle_t hKernel, + /// [in] number of dimensions, from 1 to 3, to specify the global and + /// work-group work-items + uint32_t workDim, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of workDim unsigned values that specify the + /// number of global work-items in workDim that will execute the kernel + /// function + const size_t *pGlobalWorkSize, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the number of local work-items forming a work-group that will + /// execute the kernel function. + /// If nullptr, the runtime implementation will choose the work-group size. + const size_t *pLocalWorkSize, + /// [in] Number of entries in pArgs + uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. + const ur_exp_kernel_arg_properties_t *pArgs, + /// [in] size of the launch prop list + uint32_t numPropsInLaunchPropList, + /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list + /// of launch properties + const ur_kernel_launch_property_t *launchPropList, + /// [in] size of the event wait list + uint32_t numEventsInWaitList, + /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of + /// events that must be complete before the kernel execution. + /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait + /// event. + const ur_event_handle_t *phEventWaitList, + /// [out][optional][alloc] return an event object that identifies this + /// particular kernel execution instance. If phEventWaitList and phEvent + /// are not NULL, phEvent must not refer to an element of the + /// phEventWaitList array. + ur_event_handle_t *phEvent) { + ur_result_t result = UR_RESULT_SUCCESS; + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Enqueue a barrier command which waits a list of events to complete /// before it completes, with optional extended properties diff --git a/test/conformance/CMakeLists.txt b/test/conformance/CMakeLists.txt index 5d579dbbf5..c1ca49f8e9 100644 --- a/test/conformance/CMakeLists.txt +++ b/test/conformance/CMakeLists.txt @@ -79,6 +79,7 @@ set(TEST_SUBDIRECTORIES_DPCXX "integration" "exp_command_buffer" "exp_enqueue_native" + "exp_enqueue_kernel_launch_with_args" "exp_usm_p2p" "memory-migrate" "usm" diff --git a/test/conformance/enqueue/urEnqueueEventsWaitMultiDevice.cpp b/test/conformance/enqueue/urEnqueueEventsWaitMultiDevice.cpp index 25afde7228..c29eb16b45 100644 --- a/test/conformance/enqueue/urEnqueueEventsWaitMultiDevice.cpp +++ b/test/conformance/enqueue/urEnqueueEventsWaitMultiDevice.cpp @@ -29,6 +29,8 @@ struct urEnqueueEventsWaitMultiDeviceTest : uur::urMultiQueueMultiDeviceTest<2> { void SetUp() override { UUR_KNOWN_FAILURE_ON(uur::NativeCPU{}); + // https://github.com/intel/llvm/issues/19607 + UUR_KNOWN_FAILURE_ON(uur::LevelZeroV2{}); UUR_RETURN_ON_FATAL_FAILURE(uur::urMultiQueueMultiDeviceTest<2>::SetUp()); @@ -125,6 +127,9 @@ struct urEnqueueEventsWaitMultiDeviceMTTest } void SetUp() override { + // https://github.com/intel/llvm/issues/19607 + UUR_KNOWN_FAILURE_ON(uur::LevelZeroV2{}); + UUR_KNOWN_FAILURE_ON(uur::LevelZero{}, uur::NativeCPU{}); UUR_RETURN_ON_FATAL_FAILURE( diff --git a/test/conformance/enqueue/urEnqueueKernelLaunchAndMemcpyInOrder.cpp b/test/conformance/enqueue/urEnqueueKernelLaunchAndMemcpyInOrder.cpp index cc32589c77..1abd81c8ea 100644 --- a/test/conformance/enqueue/urEnqueueKernelLaunchAndMemcpyInOrder.cpp +++ b/test/conformance/enqueue/urEnqueueKernelLaunchAndMemcpyInOrder.cpp @@ -367,6 +367,9 @@ UUR_PLATFORM_TEST_SUITE_WITH_PARAM( // Enqueue kernelLaunch concurrently from multiple threads TEST_P(urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest, Success) { + // https://github.com/intel/llvm/issues/19607 + UUR_KNOWN_FAILURE_ON(uur::LevelZeroV2{}); + if (!queuePerThread) { UUR_KNOWN_FAILURE_ON(uur::LevelZero{}, uur::LevelZeroV2{}); } diff --git a/test/conformance/enqueue/urEnqueueMemBufferFill.cpp b/test/conformance/enqueue/urEnqueueMemBufferFill.cpp index 68f20e91d7..f9c2fc175b 100644 --- a/test/conformance/enqueue/urEnqueueMemBufferFill.cpp +++ b/test/conformance/enqueue/urEnqueueMemBufferFill.cpp @@ -14,6 +14,8 @@ struct testParametersFill { struct urEnqueueMemBufferFillTest : uur::urQueueTestWithParam { void SetUp() override { + // https://github.com/intel/llvm/issues/19604 + UUR_KNOWN_FAILURE_ON(uur::LevelZeroV2{}); UUR_RETURN_ON_FATAL_FAILURE( urQueueTestWithParam::SetUp()); size = std::get<1>(GetParam()).size; diff --git a/test/conformance/enqueue/urEnqueueUSMFill.cpp b/test/conformance/enqueue/urEnqueueUSMFill.cpp index db7901a5eb..ba18c8a5bf 100644 --- a/test/conformance/enqueue/urEnqueueUSMFill.cpp +++ b/test/conformance/enqueue/urEnqueueUSMFill.cpp @@ -99,6 +99,8 @@ UUR_DEVICE_TEST_SUITE_WITH_PARAM( printFillTestString); TEST_P(urEnqueueUSMFillTestWithParam, Success) { + // https://github.com/intel/llvm/issues/19604 + UUR_KNOWN_FAILURE_ON(uur::LevelZeroV2{}); UUR_KNOWN_FAILURE_ON(uur::NativeCPU{}); ur_event_handle_t event = nullptr; diff --git a/test/conformance/exp_command_buffer/regression/usm_copy.cpp b/test/conformance/exp_command_buffer/regression/usm_copy.cpp index 21616b0b83..681b292d20 100644 --- a/test/conformance/exp_command_buffer/regression/usm_copy.cpp +++ b/test/conformance/exp_command_buffer/regression/usm_copy.cpp @@ -78,6 +78,9 @@ struct urCommandBufferUSMCopyInOrderTest UUR_INSTANTIATE_DEVICE_TEST_SUITE(urCommandBufferUSMCopyInOrderTest); TEST_P(urCommandBufferUSMCopyInOrderTest, Success) { + // https://github.com/intel/llvm/issues/19604 + UUR_KNOWN_FAILURE_ON(uur::LevelZeroV2{}); + // Do an eager kernel enqueue without wait on completion // D[0] = A * D[1] + D[2] // D[0] = 42 * 1 + 2 diff --git a/test/conformance/exp_command_buffer/update/kernel_handle_update.cpp b/test/conformance/exp_command_buffer/update/kernel_handle_update.cpp index 31fc80046e..d9d1c2a842 100644 --- a/test/conformance/exp_command_buffer/update/kernel_handle_update.cpp +++ b/test/conformance/exp_command_buffer/update/kernel_handle_update.cpp @@ -331,6 +331,9 @@ TEST_P(urCommandBufferKernelHandleUpdateTest, UpdateAgain) { /* Test that it is possible to change the kernel handle in a command and later * restore it to the original handle */ TEST_P(urCommandBufferKernelHandleUpdateTest, RestoreOriginalKernel) { + // https://github.com/intel/llvm/issues/19605 + UUR_KNOWN_FAILURE_ON(uur::LevelZeroV2{}); + UUR_KNOWN_FAILURE_ON(uur::LevelZero{}); std::vector KernelAlternatives = { FillUSM2DKernel->Kernel}; diff --git a/test/conformance/exp_enqueue_kernel_launch_with_args/CMakeLists.txt b/test/conformance/exp_enqueue_kernel_launch_with_args/CMakeLists.txt new file mode 100644 index 0000000000..d03e5ef7c0 --- /dev/null +++ b/test/conformance/exp_enqueue_kernel_launch_with_args/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (C) 2025 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +add_conformance_kernels_test( + exp_kernel_launch_with_args + urEnqueueKernelLaunchWithArgsExp.cpp +) diff --git a/test/conformance/exp_enqueue_kernel_launch_with_args/urEnqueueKernelLaunchWithArgsExp.cpp b/test/conformance/exp_enqueue_kernel_launch_with_args/urEnqueueKernelLaunchWithArgsExp.cpp new file mode 100644 index 0000000000..093bc56004 --- /dev/null +++ b/test/conformance/exp_enqueue_kernel_launch_with_args/urEnqueueKernelLaunchWithArgsExp.cpp @@ -0,0 +1,303 @@ +// Copyright (C) 2025 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +#include + +// This test runs a kernel with a mix of local memory, pointer and value args. +struct urEnqueueKernelLaunchWithArgsTest : uur::urKernelExecutionTest { + void SetUp() override { + program_name = "saxpy_usm_local_mem"; + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); + + ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, + sizeof(backend), &backend, nullptr)); + + // HIP has extra args for local memory so we define an offset for arg + // indices here for updating + hip_arg_offset = backend == UR_BACKEND_HIP ? 3 : 0; + ur_device_usm_access_capability_flags_t shared_usm_flags; + ASSERT_SUCCESS( + uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); + if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { + GTEST_SKIP() << "Shared USM is not supported."; + } + + const size_t allocation_size = + sizeof(uint32_t) * global_size[0] * local_size[0]; + for (auto &shared_ptr : shared_ptrs) { + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + allocation_size, &shared_ptr)); + ASSERT_NE(shared_ptr, nullptr); + + std::vector pattern(allocation_size); + uur::generateMemFillPattern(pattern); + std::memcpy(shared_ptr, pattern.data(), allocation_size); + } + uint32_t current_index = 0; + // Index 0 is local_mem_a arg + args.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, + nullptr, + UR_EXP_KERNEL_ARG_TYPE_LOCAL, + current_index++, + local_mem_a_size, + {nullptr}}); + + // Hip has extra args for local mem at index 1-3 + ur_exp_kernel_arg_value_t argValue = {}; + if (backend == UR_BACKEND_HIP) { + argValue.value = &hip_local_offset; + ur_exp_kernel_arg_properties_t local_offset = { + UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, + nullptr, + UR_EXP_KERNEL_ARG_TYPE_VALUE, + current_index++, + sizeof(hip_local_offset), + argValue}; + args.push_back(local_offset); + local_offset.index = current_index++; + args.push_back(local_offset); + local_offset.index = current_index++; + args.push_back(local_offset); + } + + // Index 1 is local_mem_b arg + args.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, + nullptr, + UR_EXP_KERNEL_ARG_TYPE_LOCAL, + current_index++, + local_mem_b_size, + {nullptr}}); + + if (backend == UR_BACKEND_HIP) { + argValue.value = &hip_local_offset; + ur_exp_kernel_arg_properties_t local_offset = { + UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, + nullptr, + UR_EXP_KERNEL_ARG_TYPE_VALUE, + current_index++, + sizeof(hip_local_offset), + argValue}; + args.push_back(local_offset); + local_offset.index = current_index++; + args.push_back(local_offset); + local_offset.index = current_index++; + args.push_back(local_offset); + } + + // Index 2 is output + argValue.pointer = shared_ptrs[0]; + args.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr, + UR_EXP_KERNEL_ARG_TYPE_POINTER, current_index++, + sizeof(shared_ptrs[0]), argValue}); + // Index 3 is A + argValue.value = &A; + args.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr, + UR_EXP_KERNEL_ARG_TYPE_VALUE, current_index++, sizeof(A), + argValue}); + // Index 4 is X + argValue.pointer = shared_ptrs[1]; + args.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr, + UR_EXP_KERNEL_ARG_TYPE_POINTER, current_index++, + sizeof(shared_ptrs[1]), argValue}); + // Index 5 is Y + argValue.pointer = shared_ptrs[2]; + args.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr, + UR_EXP_KERNEL_ARG_TYPE_POINTER, current_index++, + sizeof(shared_ptrs[2]), argValue}); + } + + void Validate(uint32_t *output, uint32_t *X, uint32_t *Y, uint32_t A, + size_t length, size_t local_size) { + for (size_t i = 0; i < length; i++) { + uint32_t result = A * X[i] + Y[i] + local_size; + ASSERT_EQ(result, output[i]); + } + } + + virtual void TearDown() override { + for (auto &shared_ptr : shared_ptrs) { + if (shared_ptr) { + EXPECT_SUCCESS(urUSMFree(context, shared_ptr)); + } + } + + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::TearDown()); + } + + static constexpr size_t local_size[3] = {4, 1, 1}; + static constexpr size_t local_mem_a_size = local_size[0] * sizeof(uint32_t); + static constexpr size_t local_mem_b_size = local_mem_a_size * 2; + static constexpr size_t global_size[3] = {16, 1, 1}; + static constexpr size_t global_offset[3] = {0, 0, 0}; + static constexpr uint32_t workDim = 3; + static constexpr uint32_t A = 42; + std::array shared_ptrs = {nullptr, nullptr, nullptr, nullptr, + nullptr}; + + uint32_t hip_arg_offset = 0; + static constexpr uint64_t hip_local_offset = 0; + ur_backend_t backend{}; + std::vector args; +}; +UUR_INSTANTIATE_DEVICE_TEST_SUITE(urEnqueueKernelLaunchWithArgsTest); + +TEST_P(urEnqueueKernelLaunchWithArgsTest, Success) { + ASSERT_SUCCESS(urEnqueueKernelLaunchWithArgsExp( + queue, kernel, workDim, global_offset, global_size, local_size, + args.size(), args.data(), 0, nullptr, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptrs[0]; + uint32_t *X = (uint32_t *)shared_ptrs[1]; + uint32_t *Y = (uint32_t *)shared_ptrs[2]; + Validate(output, X, Y, A, global_size[0], local_size[0]); +} + +TEST_P(urEnqueueKernelLaunchWithArgsTest, InvalidNullHandleQueue) { + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE, + urEnqueueKernelLaunchWithArgsExp( + nullptr, kernel, workDim, global_offset, global_size, + local_size, args.size(), args.data(), 0, nullptr, 0, + nullptr, nullptr)); +} + +TEST_P(urEnqueueKernelLaunchWithArgsTest, InvalidNullHandleKernel) { + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE, + urEnqueueKernelLaunchWithArgsExp( + queue, nullptr, workDim, global_offset, global_size, + local_size, args.size(), args.data(), 0, nullptr, 0, + nullptr, nullptr)); +} + +TEST_P(urEnqueueKernelLaunchWithArgsTest, InvalidNullPointerGlobalSize) { + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, + urEnqueueKernelLaunchWithArgsExp( + queue, kernel, workDim, global_offset, nullptr, + local_size, args.size(), args.data(), 0, nullptr, 0, + nullptr, nullptr)); +} + +TEST_P(urEnqueueKernelLaunchWithArgsTest, InvalidNullPointerProperties) { + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, + urEnqueueKernelLaunchWithArgsExp( + queue, kernel, workDim, global_offset, global_size, + local_size, args.size(), args.data(), 1, nullptr, 0, + nullptr, nullptr)); +} + +TEST_P(urEnqueueKernelLaunchWithArgsTest, InvalidNullPointerArgs) { + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, + urEnqueueKernelLaunchWithArgsExp( + queue, kernel, workDim, global_offset, global_size, + local_size, args.size(), nullptr, 0, nullptr, 0, nullptr, + nullptr)); +} + +TEST_P(urEnqueueKernelLaunchWithArgsTest, InvalidEventWaitList) { + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST, + urEnqueueKernelLaunchWithArgsExp( + queue, kernel, workDim, global_offset, global_size, + local_size, args.size(), args.data(), 0, nullptr, 1, + nullptr, nullptr)); + ur_event_handle_t event = nullptr; + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST, + urEnqueueKernelLaunchWithArgsExp( + queue, kernel, workDim, global_offset, global_size, + local_size, args.size(), args.data(), 0, nullptr, 0, + &event, nullptr)); +} + +// This test runs a kernel with a buffer (MEM_OBJ) arg. +struct urEnqueueKernelLaunchWithArgsMemObjTest : uur::urKernelExecutionTest { + void SetUp() override { + program_name = "fill"; + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); + + ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, + sizeof(backend), &backend, nullptr)); + + ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, + sizeof(val) * global_size[0], nullptr, + &buffer)); + + char zero = 0; + ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, buffer, &zero, sizeof(zero), 0, + buffer_size, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + // First argument is buffer to fill + unsigned current_arg_index = 0; + ur_exp_kernel_arg_mem_obj_tuple_t buffer_and_properties = {buffer, 0}; + ur_exp_kernel_arg_properties_t arg = { + UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, + nullptr, + UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ, + current_arg_index++, + sizeof(buffer), + {nullptr}}; + arg.value.memObjTuple = buffer_and_properties; + args.push_back(arg); + + // Add accessor arguments depending on backend. + // HIP has 3 offset parameters and other backends only have 1. + if (backend == UR_BACKEND_HIP) { + arg.type = UR_EXP_KERNEL_ARG_TYPE_VALUE; + arg.size = sizeof(hip_local_offset); + arg.value.value = &hip_local_offset; + arg.index = current_arg_index++; + args.push_back(arg); + arg.index = current_arg_index++; + args.push_back(arg); + arg.index = current_arg_index++; + args.push_back(arg); + } else { + arg.type = UR_EXP_KERNEL_ARG_TYPE_VALUE; + arg.index = current_arg_index++; + arg.size = sizeof(accessor); + arg.value.value = &accessor; + args.push_back(arg); + } + + // Second user defined argument is scalar to fill with. + arg.type = UR_EXP_KERNEL_ARG_TYPE_VALUE; + arg.index = current_arg_index++; + arg.size = sizeof(val); + arg.value.value = &val; + args.push_back(arg); + } + + void TearDown() override { + if (buffer) { + EXPECT_SUCCESS(urMemRelease(buffer)); + } + + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::TearDown()); + } + + static constexpr uint32_t val = 42; + static constexpr size_t global_size[3] = {32, 1, 1}; + static constexpr uint32_t workDim = 3; + static constexpr size_t buffer_size = sizeof(val) * global_size[0]; + static constexpr uint64_t hip_local_offset = 0; + ur_backend_t backend{}; + ur_mem_handle_t buffer = nullptr; + // This is the accessor offset struct sycl kernels expect to accompany buffer args. + struct { + size_t offsets[1] = {0}; + } accessor; + std::vector args; +}; +UUR_INSTANTIATE_DEVICE_TEST_SUITE(urEnqueueKernelLaunchWithArgsMemObjTest); + +TEST_P(urEnqueueKernelLaunchWithArgsMemObjTest, Success) { + ASSERT_SUCCESS(urEnqueueKernelLaunchWithArgsExp( + queue, kernel, workDim, nullptr, global_size, nullptr, args.size(), + args.data(), 0, nullptr, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + ValidateBuffer(buffer, buffer_size, val); +} diff --git a/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp b/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp index 3c776cebc0..b49c4dbf09 100644 --- a/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp +++ b/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp @@ -9,6 +9,9 @@ struct urUSMContextMemcpyExpTest : uur::urQueueTest { void SetUp() override { + // https://github.com/intel/llvm/issues/19604 + // this test uses urEnqueueUSMFill which looks to be bugged with latest driver + UUR_KNOWN_FAILURE_ON(uur::LevelZeroV2{}); UUR_RETURN_ON_FATAL_FAILURE(urQueueTest::SetUp()); bool context_memcpy_support = false;