diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 2b18a4fb6e28f..eac772e4ce378 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -467,13 +467,14 @@ EventImplPtr queue_impl::submit_kernel_scheduler_bypass( BinImage = detail::retrieveKernelBinary(*this, KData.getKernelName()); assert(BinImage && "Failed to obtain a binary image."); } - enqueueImpKernel(*this, KData.getNDRDesc(), KData.getArgs(), - KernelBundleImpPtr, KernelImplPtr, - *KData.getDeviceKernelInfoPtr(), RawEvents, - ResultEvent.get(), nullptr, KData.getKernelCacheConfig(), - KData.isCooperative(), KData.usesClusterLaunch(), - KData.getKernelWorkGroupMemorySize(), BinImage, - KData.getKernelFuncPtr()); + enqueueImpKernel( + *this, KData.getNDRDesc(), KData.getArgs(), KernelBundleImpPtr, + KernelImplPtr, *KData.getDeviceKernelInfoPtr(), RawEvents, + ResultEvent.get(), nullptr, KData.getKernelCacheConfig(), + KData.isCooperative(), KData.usesClusterLaunch(), + KData.getKernelWorkGroupMemorySize(), BinImage, + KData.getKernelFuncPtr(), KData.getKernelNumArgs(), + KData.getKernelParamDescGetter(), KData.hasSpecialCaptures()); #ifdef XPTI_ENABLE_INSTRUMENTATION if (xptiEnabled) { // Emit signal only when event is created diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index b38706b9d9d37..d451c1e035576 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2303,14 +2303,14 @@ ur_mem_flags_t AccessModeToUr(access::mode AccessorMode) { } } -// Sets arguments for a given kernel and device based on the argument type. -// Refactored from SetKernelParamsAndLaunch to allow it to be used in the graphs -// extension. -static void SetArgBasedOnType( - adapter_impl &Adapter, ur_kernel_handle_t Kernel, +// Gets UR argument struct for a given kernel and device based on the argument +// type. Refactored from SetKernelParamsAndLaunch to allow it to be used in +// the graphs extension (LaunchWithArgs for graphs is planned future work). +static void GetUrArgsBasedOnType( device_image_impl *DeviceImageImpl, const std::function &getMemAllocationFunc, - context_impl &ContextImpl, detail::ArgDesc &Arg, size_t NextTrueIndex) { + context_impl &ContextImpl, detail::ArgDesc &Arg, size_t NextTrueIndex, + std::vector &UrArgs) { switch (Arg.MType) { case kernel_param_kind_t::kind_dynamic_work_group_memory: break; @@ -2330,52 +2330,61 @@ static void SetArgBasedOnType( getMemAllocationFunc ? reinterpret_cast(getMemAllocationFunc(Req)) : nullptr; - ur_kernel_arg_mem_obj_properties_t MemObjData{}; - MemObjData.stype = UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES; - MemObjData.memoryAccess = AccessModeToUr(Req->MAccessMode); - Adapter.call(Kernel, NextTrueIndex, - &MemObjData, MemArg); + ur_exp_kernel_arg_value_t Value = {}; + Value.memObjTuple = {MemArg, AccessModeToUr(Req->MAccessMode)}; + UrArgs.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr, + UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ, + static_cast(NextTrueIndex), sizeof(MemArg), + Value}); break; } case kernel_param_kind_t::kind_std_layout: { + ur_exp_kernel_arg_type_t Type; if (Arg.MPtr) { - Adapter.call( - Kernel, NextTrueIndex, Arg.MSize, nullptr, Arg.MPtr); + Type = UR_EXP_KERNEL_ARG_TYPE_VALUE; } else { - Adapter.call(Kernel, NextTrueIndex, - Arg.MSize, nullptr); + Type = UR_EXP_KERNEL_ARG_TYPE_LOCAL; } + ur_exp_kernel_arg_value_t Value = {}; + Value.value = {Arg.MPtr}; + UrArgs.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr, + Type, static_cast(NextTrueIndex), + static_cast(Arg.MSize), Value}); break; } case kernel_param_kind_t::kind_sampler: { sampler *SamplerPtr = (sampler *)Arg.MPtr; - ur_sampler_handle_t Sampler = - (ur_sampler_handle_t)detail::getSyclObjImpl(*SamplerPtr) - ->getOrCreateSampler(ContextImpl); - Adapter.call(Kernel, NextTrueIndex, - nullptr, Sampler); + ur_exp_kernel_arg_value_t Value = {}; + Value.sampler = (ur_sampler_handle_t)detail::getSyclObjImpl(*SamplerPtr) + ->getOrCreateSampler(ContextImpl); + UrArgs.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr, + UR_EXP_KERNEL_ARG_TYPE_SAMPLER, + static_cast(NextTrueIndex), + sizeof(ur_sampler_handle_t), Value}); break; } case kernel_param_kind_t::kind_pointer: { - // We need to de-rerence this to get the actual USM allocation - that's the + ur_exp_kernel_arg_value_t Value = {}; + // We need to de-rerence to get the actual USM allocation - that's the // pointer UR is expecting. - const void *Ptr = *static_cast(Arg.MPtr); - Adapter.call(Kernel, NextTrueIndex, - nullptr, Ptr); + Value.pointer = *static_cast(Arg.MPtr); + UrArgs.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr, + UR_EXP_KERNEL_ARG_TYPE_POINTER, + static_cast(NextTrueIndex), sizeof(Arg.MPtr), + Value}); break; } case kernel_param_kind_t::kind_specialization_constants_buffer: { assert(DeviceImageImpl != nullptr); ur_mem_handle_t SpecConstsBuffer = DeviceImageImpl->get_spec_const_buffer_ref(); - - ur_kernel_arg_mem_obj_properties_t MemObjProps{}; - MemObjProps.pNext = nullptr; - MemObjProps.stype = UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES; - MemObjProps.memoryAccess = UR_MEM_FLAG_READ_ONLY; - Adapter.call( - Kernel, NextTrueIndex, &MemObjProps, SpecConstsBuffer); + ur_exp_kernel_arg_value_t Value = {}; + Value.memObjTuple = {SpecConstsBuffer, UR_MEM_FLAG_READ_ONLY}; + UrArgs.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr, + UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ, + static_cast(NextTrueIndex), + sizeof(SpecConstsBuffer), Value}); break; } case kernel_param_kind_t::kind_invalid: @@ -2394,58 +2403,75 @@ static ur_result_t SetKernelParamsAndLaunch( const std::function &getMemAllocationFunc, bool IsCooperative, bool KernelUsesClusterLaunch, uint32_t WorkGroupMemorySize, const RTDeviceBinaryImage *BinImage, - DeviceKernelInfo &DeviceKernelInfo, void *KernelFuncPtr = nullptr) { + KernelNameStrRefT KernelName, void *KernelFuncPtr = nullptr, + int KernelNumArgs = 0, + detail::kernel_param_desc_t (*KernelParamDescGetter)(int) = nullptr, + bool KernelHasSpecialCaptures = true) { adapter_impl &Adapter = Queue.getAdapter(); if (SYCLConfig::get()) { std::vector Empty; Kernel = Scheduler::getInstance().completeSpecConstMaterialization( - Queue, BinImage, DeviceKernelInfo.Name, + Queue, BinImage, KernelName, DeviceImageImpl ? DeviceImageImpl->get_spec_const_blob_ref() : Empty); } - if (KernelFuncPtr && !DeviceKernelInfo.HasSpecialCaptures) { - auto setFunc = [&Adapter, Kernel, + std::vector UrArgs; + UrArgs.reserve(Args.size()); + + if (KernelFuncPtr && !KernelHasSpecialCaptures) { + auto setFunc = [&UrArgs, KernelFuncPtr](const detail::kernel_param_desc_t &ParamDesc, size_t NextTrueIndex) { const void *ArgPtr = (const char *)KernelFuncPtr + ParamDesc.offset; switch (ParamDesc.kind) { case kernel_param_kind_t::kind_std_layout: { int Size = ParamDesc.info; - Adapter.call(Kernel, NextTrueIndex, - Size, nullptr, ArgPtr); + ur_exp_kernel_arg_value_t Value = {}; + Value.value = ArgPtr; + UrArgs.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr, + UR_EXP_KERNEL_ARG_TYPE_VALUE, + static_cast(NextTrueIndex), + static_cast(Size), Value}); break; } case kernel_param_kind_t::kind_pointer: { - const void *Ptr = *static_cast(ArgPtr); - Adapter.call(Kernel, NextTrueIndex, - nullptr, Ptr); + ur_exp_kernel_arg_value_t Value = {}; + Value.pointer = *static_cast(ArgPtr); + UrArgs.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr, + UR_EXP_KERNEL_ARG_TYPE_POINTER, + static_cast(NextTrueIndex), + sizeof(Value.pointer), Value}); break; } default: throw std::runtime_error("Direct kernel argument copy failed."); } }; - applyFuncOnFilteredArgs(EliminatedArgMask, DeviceKernelInfo.NumParams, - DeviceKernelInfo.ParamDescGetter, setFunc); + applyFuncOnFilteredArgs(EliminatedArgMask, KernelNumArgs, + KernelParamDescGetter, setFunc); } else { - auto setFunc = [&Adapter, Kernel, &DeviceImageImpl, &getMemAllocationFunc, - &Queue](detail::ArgDesc &Arg, size_t NextTrueIndex) { - SetArgBasedOnType(Adapter, Kernel, DeviceImageImpl, getMemAllocationFunc, - Queue.getContextImpl(), Arg, NextTrueIndex); + auto setFunc = [&DeviceImageImpl, &getMemAllocationFunc, &Queue, + &UrArgs](detail::ArgDesc &Arg, size_t NextTrueIndex) { + GetUrArgsBasedOnType(DeviceImageImpl, getMemAllocationFunc, + Queue.getContextImpl(), Arg, NextTrueIndex, UrArgs); }; applyFuncOnFilteredArgs(EliminatedArgMask, Args, setFunc); } - const std::optional &ImplicitLocalArg = - DeviceKernelInfo.getImplicitLocalArgPos(); + std::optional ImplicitLocalArg = + ProgramManager::getInstance().kernelImplicitLocalArgPos(KernelName); // Set the implicit local memory buffer to support // get_work_group_scratch_memory. This is for backend not supporting // CUDA-style local memory setting. Note that we may have -1 as a position, // this indicates the buffer is actually unused and was elided. if (ImplicitLocalArg.has_value() && ImplicitLocalArg.value() != -1) { - Adapter.call( - Kernel, ImplicitLocalArg.value(), WorkGroupMemorySize, nullptr); + UrArgs.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, + nullptr, + UR_EXP_KERNEL_ARG_TYPE_LOCAL, + static_cast(ImplicitLocalArg.value()), + WorkGroupMemorySize, + {nullptr}}); } adjustNDRangePerKernel(NDRDesc, Kernel, Queue.getDeviceImpl()); @@ -2468,16 +2494,14 @@ static ur_result_t SetKernelParamsAndLaunch( /* pPropSizeRet = */ nullptr); const bool EnforcedLocalSize = - (RequiredWGSize[0] != 0 && - (NDRDesc.Dims < 2 || RequiredWGSize[1] != 0) && - (NDRDesc.Dims < 3 || RequiredWGSize[2] != 0)); + (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 || + RequiredWGSize[2] != 0); if (EnforcedLocalSize) LocalSize = RequiredWGSize; } - - const bool HasOffset = NDRDesc.GlobalOffset[0] != 0 && - (NDRDesc.Dims < 2 || NDRDesc.GlobalOffset[1] != 0) && - (NDRDesc.Dims < 3 || NDRDesc.GlobalOffset[2] != 0); + const bool HasOffset = NDRDesc.GlobalOffset[0] != 0 || + NDRDesc.GlobalOffset[1] != 0 || + NDRDesc.GlobalOffset[2] != 0; std::vector property_list; @@ -2505,13 +2529,15 @@ static ur_result_t SetKernelParamsAndLaunch( {{WorkGroupMemorySize}}}); } ur_event_handle_t UREvent = nullptr; - ur_result_t Error = Adapter.call_nocheck( - Queue.getHandleRef(), Kernel, NDRDesc.Dims, - HasOffset ? &NDRDesc.GlobalOffset[0] : nullptr, &NDRDesc.GlobalSize[0], - LocalSize, property_list.size(), - property_list.empty() ? nullptr : property_list.data(), RawEvents.size(), - RawEvents.empty() ? nullptr : &RawEvents[0], - OutEventImpl ? &UREvent : nullptr); + ur_result_t Error = + Adapter.call_nocheck( + Queue.getHandleRef(), Kernel, NDRDesc.Dims, + HasOffset ? &NDRDesc.GlobalOffset[0] : nullptr, + &NDRDesc.GlobalSize[0], LocalSize, UrArgs.size(), UrArgs.data(), + property_list.size(), + property_list.empty() ? nullptr : property_list.data(), + RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0], + OutEventImpl ? &UREvent : nullptr); if (Error == UR_RESULT_SUCCESS && OutEventImpl) { OutEventImpl->setHandle(UREvent); } @@ -2519,6 +2545,88 @@ static ur_result_t SetKernelParamsAndLaunch( return Error; } +// Sets arguments for a given kernel and device based on the argument type. +// This is a legacy path which the graphs extension still uses. +static void SetArgBasedOnType( + adapter_impl &Adapter, ur_kernel_handle_t Kernel, + device_image_impl *DeviceImageImpl, + const std::function &getMemAllocationFunc, + context_impl &ContextImpl, detail::ArgDesc &Arg, size_t NextTrueIndex) { + switch (Arg.MType) { + case kernel_param_kind_t::kind_dynamic_work_group_memory: + break; + case kernel_param_kind_t::kind_work_group_memory: + break; + case kernel_param_kind_t::kind_stream: + break; + case kernel_param_kind_t::kind_dynamic_accessor: + case kernel_param_kind_t::kind_accessor: { + Requirement *Req = (Requirement *)(Arg.MPtr); + + // getMemAllocationFunc is nullptr when there are no requirements. However, + // we may pass default constructed accessors to a command, which don't add + // requirements. In such case, getMemAllocationFunc is nullptr, but it's a + // valid case, so we need to properly handle it. + ur_mem_handle_t MemArg = + getMemAllocationFunc + ? reinterpret_cast(getMemAllocationFunc(Req)) + : nullptr; + ur_kernel_arg_mem_obj_properties_t MemObjData{}; + MemObjData.stype = UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES; + MemObjData.memoryAccess = AccessModeToUr(Req->MAccessMode); + Adapter.call(Kernel, NextTrueIndex, + &MemObjData, MemArg); + break; + } + case kernel_param_kind_t::kind_std_layout: { + if (Arg.MPtr) { + Adapter.call( + Kernel, NextTrueIndex, Arg.MSize, nullptr, Arg.MPtr); + } else { + Adapter.call(Kernel, NextTrueIndex, + Arg.MSize, nullptr); + } + + break; + } + case kernel_param_kind_t::kind_sampler: { + sampler *SamplerPtr = (sampler *)Arg.MPtr; + ur_sampler_handle_t Sampler = + (ur_sampler_handle_t)detail::getSyclObjImpl(*SamplerPtr) + ->getOrCreateSampler(ContextImpl); + Adapter.call(Kernel, NextTrueIndex, + nullptr, Sampler); + break; + } + case kernel_param_kind_t::kind_pointer: { + // We need to de-rerence this to get the actual USM allocation - that's the + // pointer UR is expecting. + const void *Ptr = *static_cast(Arg.MPtr); + Adapter.call(Kernel, NextTrueIndex, + nullptr, Ptr); + break; + } + case kernel_param_kind_t::kind_specialization_constants_buffer: { + assert(DeviceImageImpl != nullptr); + ur_mem_handle_t SpecConstsBuffer = + DeviceImageImpl->get_spec_const_buffer_ref(); + + ur_kernel_arg_mem_obj_properties_t MemObjProps{}; + MemObjProps.pNext = nullptr; + MemObjProps.stype = UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES; + MemObjProps.memoryAccess = UR_MEM_FLAG_READ_ONLY; + Adapter.call( + Kernel, NextTrueIndex, &MemObjProps, SpecConstsBuffer); + break; + } + case kernel_param_kind_t::kind_invalid: + throw sycl::exception(sycl::make_error_code(sycl::errc::runtime), + "Invalid kernel param kind " + + codeToString(UR_RESULT_ERROR_INVALID_VALUE)); + break; + } +} + static std::tuple getCGKernelInfo(const CGExecKernel &CommandGroup, context_impl &ContextImpl, @@ -2668,7 +2776,9 @@ void enqueueImpKernel( const std::function &getMemAllocationFunc, ur_kernel_cache_config_t KernelCacheConfig, const bool KernelIsCooperative, const bool KernelUsesClusterLaunch, const size_t WorkGroupMemorySize, - const RTDeviceBinaryImage *BinImage, void *KernelFuncPtr) { + const RTDeviceBinaryImage *BinImage, void *KernelFuncPtr, int KernelNumArgs, + detail::kernel_param_desc_t (*KernelParamDescGetter)(int), + bool KernelHasSpecialCaptures) { // Run OpenCL kernel context_impl &ContextImpl = Queue.getContextImpl(); device_impl &DeviceImpl = Queue.getDeviceImpl(); @@ -2755,7 +2865,8 @@ void enqueueImpKernel( Queue, Args, DeviceImageImpl, Kernel, NDRDesc, EventsWaitList, OutEventImpl, EliminatedArgMask, getMemAllocationFunc, KernelIsCooperative, KernelUsesClusterLaunch, WorkGroupMemorySize, - BinImage, DeviceKernelInfo, KernelFuncPtr); + BinImage, DeviceKernelInfo.Name, KernelFuncPtr, KernelNumArgs, + KernelParamDescGetter, KernelHasSpecialCaptures); } if (UR_RESULT_SUCCESS != Error) { // If we have got non-success error code, let's analyze it to emit nice diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index d47a5d9d9131f..7c9600eb4a5c5 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -633,7 +633,9 @@ void enqueueImpKernel( ur_kernel_cache_config_t KernelCacheConfig, bool KernelIsCooperative, const bool KernelUsesClusterLaunch, const size_t WorkGroupMemorySize, const RTDeviceBinaryImage *BinImage = nullptr, - void *KernelFuncPtr = nullptr); + void *KernelFuncPtr = nullptr, int KernelNumArgs = 0, + detail::kernel_param_desc_t (*KernelParamDescGetter)(int) = nullptr, + bool KernelHasSpecialCaptures = true); /// The exec CG command enqueues execution of kernel or explicit memory /// operation. diff --git a/sycl/test-e2e/Adapters/level_zero/batch_barrier.cpp b/sycl/test-e2e/Adapters/level_zero/batch_barrier.cpp index 9b4a05391e3ca..6ede03dca7a84 100644 --- a/sycl/test-e2e/Adapters/level_zero/batch_barrier.cpp +++ b/sycl/test-e2e/Adapters/level_zero/batch_barrier.cpp @@ -24,7 +24,7 @@ int main(int argc, char *argv[]) { queue q; submit_kernel(q); // starts a batch - // CHECK: ---> urEnqueueKernelLaunch + // CHECK: ---> urEnqueueKernelLaunchWithArgsExp // CHECK-NOT: zeCommandQueueExecuteCommandLists // Initializing Level Zero driver is required if this test is linked @@ -42,7 +42,7 @@ int main(int argc, char *argv[]) { // CHECK-NOT: zeCommandQueueExecuteCommandLists submit_kernel(q); - // CHECK: ---> urEnqueueKernelLaunch + // CHECK: ---> urEnqueueKernelLaunchWithArgsExp // CHECK-NOT: zeCommandQueueExecuteCommandLists // interop should close the batch diff --git a/sycl/test-e2e/Adapters/level_zero/batch_test.cpp b/sycl/test-e2e/Adapters/level_zero/batch_test.cpp index 8f6e4e0f6a563..1db60a36e1265 100644 --- a/sycl/test-e2e/Adapters/level_zero/batch_test.cpp +++ b/sycl/test-e2e/Adapters/level_zero/batch_test.cpp @@ -55,55 +55,54 @@ // variable SYCL_PI_LEVEL_ZEOR+BATCH_SIZE=N. // This test enqueues 8 kernels and then does a wait. And it does this 3 times. // Expected output is that for batching =1 you will see zeCommandListClose, -// and zeCommandQueueExecuteCommandLists after every urEnqueueKernelLaunch. -// For batching=3 you will see that after 3rd and 6th enqueues, and then after -// urQueueFinish. For 5, after 5th urEnqueue, and then after urQueueFinish. For -// 4 you will see these after 4th and 8th Enqueue, and for 8, only after the -// 8th enqueue. And lastly for 9, you will see the Close and Execute calls -// only after the urQueueFinish. -// Since the test does this 3 times, this pattern will repeat 2 more times, -// and then the test will print Test Passed 8 times, once for each kernel -// validation check. +// and zeCommandQueueExecuteCommandLists after every +// urEnqueueKernelLaunchWithArgsExp. For batching=3 you will see that after 3rd +// and 6th enqueues, and then after urQueueFinish. For 5, after 5th urEnqueue, +// and then after urQueueFinish. For 4 you will see these after 4th and 8th +// Enqueue, and for 8, only after the 8th enqueue. And lastly for 9, you will +// see the Close and Execute calls only after the urQueueFinish. Since the test +// does this 3 times, this pattern will repeat 2 more times, and then the test +// will print Test Passed 8 times, once for each kernel validation check. // Pattern starts first set of kernel executions. -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( // CKB3: zeCommandListClose( // CKB3: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( // CKB4: zeCommandListClose( // CKB4: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( // CKB5: zeCommandListClose( // CKB5: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( // CKB3: zeCommandListClose( // CKB3: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( // CKB7: zeCommandListClose( // CKB7: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( @@ -121,45 +120,45 @@ // CKB9: zeCommandListClose( // CKB9: zeCommandQueueExecuteCommandLists( // Pattern starts 2nd set of kernel executions -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( // CKB3: zeCommandListClose( // CKB3: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( // CKB4: zeCommandListClose( // CKB4: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( // CKB5: zeCommandListClose( // CKB5: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( // CKB3: zeCommandListClose( // CKB3: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( // CKB7: zeCommandListClose( // CKB7: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( @@ -177,45 +176,45 @@ // CKB9: zeCommandListClose( // CKB9: zeCommandQueueExecuteCommandLists( // Pattern starts 3rd set of kernel executions -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( // CKB3: zeCommandListClose( // CKB3: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( // CKB4: zeCommandListClose( // CKB4: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( // CKB5: zeCommandListClose( // CKB5: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( // CKB3: zeCommandListClose( // CKB3: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( // CKB7: zeCommandListClose( // CKB7: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( diff --git a/sycl/test-e2e/Adapters/level_zero/batch_test_copy_with_compute.cpp b/sycl/test-e2e/Adapters/level_zero/batch_test_copy_with_compute.cpp index 8470c7abd2bfa..85634616c67a9 100644 --- a/sycl/test-e2e/Adapters/level_zero/batch_test_copy_with_compute.cpp +++ b/sycl/test-e2e/Adapters/level_zero/batch_test_copy_with_compute.cpp @@ -31,55 +31,53 @@ // variable SYCL_PI_LEVEL_ZERO_{COPY_}BATCH_SIZE=N. // This test enqueues 8 kernels and then does a wait. And it does this 3 times. // Expected output is that for batching =1 you will see zeCommandListClose, -// and zeCommandQueueExecuteCommandLists after every urEnqueueKernelLaunch. -// For batching=3 you will see that after 3rd and 6th enqueues, and then after -// urEventWait. For 5, after 5th urEnqueue, and then after urEventWait. For -// 4 you will see these after 4th and 8th Enqueue, and for 8, only after the -// 8th enqueue. And lastly for 9, you will see the Close and Execute calls -// only after the urEventWait. -// Since the test does this 3 times, this pattern will repeat 2 more times, -// and then the test will print Test Passed 8 times, once for each kernel -// validation check. -// Pattern starts first set of kernel executions. -// CKALL: ---> urEnqueueKernelLaunch +// and zeCommandQueueExecuteCommandLists after every +// urEnqueueKernelLaunchWithArgsExp. For batching=3 you will see that after 3rd +// and 6th enqueues, and then after urEventWait. For 5, after 5th urEnqueue, and +// then after urEventWait. For 4 you will see these after 4th and 8th Enqueue, +// and for 8, only after the 8th enqueue. And lastly for 9, you will see the +// Close and Execute calls only after the urEventWait. Since the test does this +// 3 times, this pattern will repeat 2 more times, and then the test will print +// Test Passed 8 times, once for each kernel validation check. Pattern starts +// first set of kernel executions. CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( // CKB3: zeCommandListClose( // CKB3: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( // CKB4: zeCommandListClose( // CKB4: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( // CKB5: zeCommandListClose( // CKB5: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( // CKB3: zeCommandListClose( // CKB3: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( // CKB7: zeCommandListClose( // CKB7: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( @@ -97,45 +95,45 @@ // CKB9: zeCommandListClose( // CKB9: zeCommandQueueExecuteCommandLists( // Pattern starts 2nd set of kernel executions -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( // CKB3: zeCommandListClose( // CKB3: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( // CKB4: zeCommandListClose( // CKB4: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( // CKB5: zeCommandListClose( // CKB5: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( // CKB3: zeCommandListClose( // CKB3: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( // CKB7: zeCommandListClose( // CKB7: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( @@ -153,45 +151,45 @@ // CKB9: zeCommandListClose( // CKB9: zeCommandQueueExecuteCommandLists( // Pattern starts 3rd set of kernel executions -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( // CKB3: zeCommandListClose( // CKB3: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( // CKB4: zeCommandListClose( // CKB4: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( // CKB5: zeCommandListClose( // CKB5: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( // CKB3: zeCommandListClose( // CKB3: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( // CKB7: zeCommandListClose( // CKB7: zeCommandQueueExecuteCommandLists( -// CKALL: ---> urEnqueueKernelLaunch +// CKALL: ---> urEnqueueKernelLaunchWithArgsExp // CKALL: zeCommandListAppendLaunchKernel( // CKB1: zeCommandListClose( // CKB1: zeCommandQueueExecuteCommandLists( diff --git a/sycl/test-e2e/Basic/fill_accessor_ur.cpp b/sycl/test-e2e/Basic/fill_accessor_ur.cpp index 29a991ae40ad3..69395f8b4053b 100644 --- a/sycl/test-e2e/Basic/fill_accessor_ur.cpp +++ b/sycl/test-e2e/Basic/fill_accessor_ur.cpp @@ -63,7 +63,7 @@ void testFill_Buffer2D() { auto acc2D = buffer_2D.get_access(cgh, {8, 12}, {2, 2}); // "ranged accessor" will have to be handled by custom kernel: - // urEnqueueKernelLaunch + // urEnqueueKernelLaunchWithArgsExp cgh.fill(acc2D, float{4}); }); q.wait(); @@ -94,7 +94,7 @@ void testFill_Buffer3D() { auto acc3D = buffer_3D.get_access( cgh, {4, 8, 12}, {3, 3, 3}); // "ranged accessor" will have to be handled by custom kernel: - // urEnqueueKernelLaunch + // urEnqueueKernelLaunchWithArgsExp cgh.fill(acc3D, float{6}); }); q.wait(); @@ -139,12 +139,12 @@ int main() { // CHECK: start testFill_Buffer2D // CHECK: <--- urEnqueueMemBufferFill // CHECK: start testFill_Buffer2D -- OFFSET -// CHECK: <--- urEnqueueKernelLaunch +// CHECK: <--- urEnqueueKernelLaunchWithArgsExp // CHECK: start testFill_Buffer3D // CHECK: <--- urEnqueueMemBufferFill // CHECK: start testFill_Buffer3D -- OFFSET -// CHECK: <--- urEnqueueKernelLaunch +// CHECK: <--- urEnqueueKernelLaunchWithArgsExp // CHECK: start testFill_ZeroDim // CHECK: <--- urEnqueueMemBufferFill diff --git a/sycl/test-e2e/Basic/host-task-dependency.cpp b/sycl/test-e2e/Basic/host-task-dependency.cpp index 3b015051377c9..2c29034de4af8 100644 --- a/sycl/test-e2e/Basic/host-task-dependency.cpp +++ b/sycl/test-e2e/Basic/host-task-dependency.cpp @@ -179,13 +179,13 @@ int main() { // launch of Gen kernel // CHECK: <--- urKernelCreate // CHECK: NameGen -// CHECK: <--- urEnqueueKernelLaunch +// CHECK: <--- urEnqueueKernelLaunchWithArgsExp // prepare for host task // CHECK: <--- urEnqueueMemBuffer{{Map|Read}} // launch of Copier kernel // CHECK: <--- urKernelCreate // CHECK: Copier -// CHECK: <--- urEnqueueKernelLaunch +// CHECK: <--- urEnqueueKernelLaunchWithArgsExp // CHECK:Third buffer [ 0] = 0 // CHECK:Third buffer [ 1] = 1 diff --git a/sycl/test-e2e/Basic/kernel_bundle/kernel_bundle_api.cpp b/sycl/test-e2e/Basic/kernel_bundle/kernel_bundle_api.cpp index 70f29387c866c..7d87d2ed13e55 100644 --- a/sycl/test-e2e/Basic/kernel_bundle/kernel_bundle_api.cpp +++ b/sycl/test-e2e/Basic/kernel_bundle/kernel_bundle_api.cpp @@ -230,7 +230,7 @@ int main() { // CHECK-SAME: .hKernel = [[KERNEL_HANDLE]] // CHECK-SAME:-> UR_RESULT_SUCCESS; // - // CHECK:<--- urEnqueueKernelLaunch( + // CHECK:<--- urEnqueueKernelLaunchWithArgsExp( // CHECK-SAME: .hKernel = [[KERNEL_HANDLE]] // // CHECK:<--- urKernelRelease( diff --git a/sycl/test-e2e/Basic/queue/release.cpp b/sycl/test-e2e/Basic/queue/release.cpp index 13ee5d6ee22bf..5f4dc58c92575 100644 --- a/sycl/test-e2e/Basic/queue/release.cpp +++ b/sycl/test-e2e/Basic/queue/release.cpp @@ -11,7 +11,7 @@ int main() { return 0; } -// CHECK: <--- urEnqueueKernelLaunch( +// CHECK: <--- urEnqueueKernelLaunchWithArgsExp( // FIXME the order of these 2 varies between adapters due to a Level Zero // specific queue workaround. // CHECK-DAG: <--- urEventRelease( diff --git a/sycl/test-e2e/Basic/subdevice_pi.cpp b/sycl/test-e2e/Basic/subdevice_pi.cpp index 4843c86a899b3..43595063abc1e 100644 --- a/sycl/test-e2e/Basic/subdevice_pi.cpp +++ b/sycl/test-e2e/Basic/subdevice_pi.cpp @@ -67,7 +67,7 @@ static bool check_separate(device dev, buffer buf, // CHECK-SEPARATE: <--- urContextCreate // CHECK-SEPARATE: <--- urQueueCreate // CHECK-SEPARATE: <--- urMemBufferCreate - // CHECK-SEPARATE: <--- urEnqueueKernelLaunch + // CHECK-SEPARATE: <--- urEnqueueKernelLaunchWithArgsExp // CHECK-SEPARATE: <--- urQueueFinish log_pi("Test sub device 1"); @@ -84,7 +84,7 @@ static bool check_separate(device dev, buffer buf, // CHECK-SEPARATE: <--- urEnqueueMemBuffer{{Map|Read}} // CHECK-SEPARATE: <--- urEnqueueMemBufferWrite // - // CHECK-SEPARATE: <--- urEnqueueKernelLaunch + // CHECK-SEPARATE: <--- urEnqueueKernelLaunchWithArgsExp // CHECK-SEPARATE: <--- urQueueFinish return true; @@ -119,7 +119,7 @@ static bool check_shared_context(device dev, buffer buf, // Make sure that a single buffer is created (and shared between subdevices): // see --implicit-check-not above. // - // CHECK-SHARED: <--- urEnqueueKernelLaunch + // CHECK-SHARED: <--- urEnqueueKernelLaunchWithArgsExp // CHECK-SHARED: <--- urQueueFinish log_pi("Test sub device 1"); @@ -129,7 +129,7 @@ static bool check_shared_context(device dev, buffer buf, } // CHECK-SHARED: Test sub device 1 // CHECK-SHARED: <--- urQueueCreate - // CHECK-SHARED: <--- urEnqueueKernelLaunch + // CHECK-SHARED: <--- urEnqueueKernelLaunchWithArgsExp // CHECK-SHARED: <--- urQueueFinish // CHECK-SHARED: <--- urEnqueueMemBufferRead @@ -168,7 +168,7 @@ static bool check_fused_context(device dev, buffer buf, // Make sure that a single buffer is created (and shared between subdevices // *and* the root device): see --implicit-check-not above. // - // CHECK-FUSED: <--- urEnqueueKernelLaunch + // CHECK-FUSED: <--- urEnqueueKernelLaunchWithArgsExp // CHECK-FUSED: <--- urQueueFinish log_pi("Test sub device 0"); @@ -178,7 +178,7 @@ static bool check_fused_context(device dev, buffer buf, } // CHECK-FUSED: Test sub device 0 // CHECK-FUSED: <--- urQueueCreate - // CHECK-FUSED: <--- urEnqueueKernelLaunch + // CHECK-FUSED: <--- urEnqueueKernelLaunchWithArgsExp // CHECK-FUSED: <--- urQueueFinish log_pi("Test sub device 1"); @@ -188,7 +188,7 @@ static bool check_fused_context(device dev, buffer buf, } // CHECK-FUSED: Test sub device 1 // CHECK-FUSED: <--- urQueueCreate - // CHECK-FUSED: <--- urEnqueueKernelLaunch + // CHECK-FUSED: <--- urEnqueueKernelLaunchWithArgsExp // CHECK-FUSED: <--- urQueueFinish // CHECK-FUSED: <--- urEnqueueMemBufferRead diff --git a/sycl/test-e2e/ClusterLaunch/enqueueLaunchCustom_check_event_deps.cpp b/sycl/test-e2e/ClusterLaunch/enqueueLaunchCustom_check_event_deps.cpp index 7928e5da66bac..50a488d861874 100644 --- a/sycl/test-e2e/ClusterLaunch/enqueueLaunchCustom_check_event_deps.cpp +++ b/sycl/test-e2e/ClusterLaunch/enqueueLaunchCustom_check_event_deps.cpp @@ -1,5 +1,5 @@ // Checks whether or not event Dependencies are honored by -// urEnqueueKernelLaunch with cluster dimensions +// urEnqueueKernelLaunchWithArgsExp with cluster dimensions // REQUIRES: target-nvidia, aspect-ext_oneapi_cuda_cluster_group // RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_90 -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/DeprecatedFeatures/DiscardEvents/discard_events_accessors.cpp b/sycl/test-e2e/DeprecatedFeatures/DiscardEvents/discard_events_accessors.cpp index 5cdbd0f73e683..fffd5d22625f7 100644 --- a/sycl/test-e2e/DeprecatedFeatures/DiscardEvents/discard_events_accessors.cpp +++ b/sycl/test-e2e/DeprecatedFeatures/DiscardEvents/discard_events_accessors.cpp @@ -3,12 +3,12 @@ // RUN: env SYCL_UR_TRACE=2 %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt // // The test checks that the last parameter is not `nullptr` for -// urEnqueueKernelLaunch for a kernel using buffer accessor. +// urEnqueueKernelLaunchWithArgsExp for a kernel using buffer accessor. // -// CHECK: <--- urEnqueueKernelLaunch +// CHECK: <--- urEnqueueKernelLaunchWithArgsExp // -// CHECK-NOT: <--- urEnqueueKernelLaunch({{.*}}.phEvent = nullptr -// CHECK: <--- urEnqueueKernelLaunch +// CHECK-NOT: <--- urEnqueueKernelLaunchWithArgsExp({{.*}}.phEvent = nullptr +// CHECK: <--- urEnqueueKernelLaunchWithArgsExp // CHECK: -> UR_RESULT_SUCCESS // // CHECK: The test passed. diff --git a/sycl/test-e2e/DeprecatedFeatures/DiscardEvents/discard_events_using_assert.cpp b/sycl/test-e2e/DeprecatedFeatures/DiscardEvents/discard_events_using_assert.cpp index 5d965b1c27ff1..df9bb56b6d280 100644 --- a/sycl/test-e2e/DeprecatedFeatures/DiscardEvents/discard_events_using_assert.cpp +++ b/sycl/test-e2e/DeprecatedFeatures/DiscardEvents/discard_events_using_assert.cpp @@ -6,10 +6,10 @@ // RUN: env SYCL_UR_TRACE=2 %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt // // The test checks that the last parameter is not `nullptr` for -// urEnqueueKernelLaunch. +// urEnqueueKernelLaunchWithArgsExp. // -// CHECK-NOT: <--- urEnqueueKernelLaunch({{.*}}.phEvent = nullptr -// CHECK: <--- urEnqueueKernelLaunch +// CHECK-NOT: <--- urEnqueueKernelLaunchWithArgsExp({{.*}}.phEvent = nullptr +// CHECK: <--- urEnqueueKernelLaunchWithArgsExp // CHECK: -> UR_RESULT_SUCCESS // // CHECK: The test passed. diff --git a/sycl/test-e2e/DeprecatedFeatures/DiscardEvents/discard_events_usm_ooo_queue.cpp b/sycl/test-e2e/DeprecatedFeatures/DiscardEvents/discard_events_usm_ooo_queue.cpp index 3aa91ed17cd32..79ba3d79f5800 100644 --- a/sycl/test-e2e/DeprecatedFeatures/DiscardEvents/discard_events_usm_ooo_queue.cpp +++ b/sycl/test-e2e/DeprecatedFeatures/DiscardEvents/discard_events_usm_ooo_queue.cpp @@ -1,7 +1,7 @@ // RUN: %{build} -o %t.out // -// On level_zero Q.fill uses urEnqueueKernelLaunch and not urEnqueueUSMFill -// due to https://github.com/intel/llvm/issues/13787 +// On level_zero Q.fill uses urEnqueueKernelLaunchWithArgsExp and not +// urEnqueueUSMFill due to https://github.com/intel/llvm/issues/13787 // // RUN: env SYCL_UR_TRACE=2 %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt --check-prefixes=CHECK%if level_zero %{,CHECK-L0%} %else %{,CHECK-OTHER%} // @@ -25,7 +25,7 @@ // CHECK: -> UR_RESULT_SUCCESS // // Level-zero backend doesn't use urEnqueueUSMFill -// CHECK-L0: <--- urEnqueueKernelLaunch +// CHECK-L0: <--- urEnqueueKernelLaunchWithArgsExp // CHECK-L0: .phEvent = {{[0-9a-f]+}} // CHECK-OTHER: <--- urEnqueueUSMFill({{.*}} .phEvent = {{[0-9a-f]+}} // CHECK: -> UR_RESULT_SUCCESS @@ -47,12 +47,12 @@ // CHECK: <--- urEnqueueEventsWaitWithBarrier // CHECK: -> UR_RESULT_SUCCESS // -// CHECK-NOT: <--- urEnqueueKernelLaunch({{.*}} .phEvent = nullptr -// CHECK: <--- urEnqueueKernelLaunch +// CHECK-NOT: <--- urEnqueueKernelLaunchWithArgsExp({{.*}} .phEvent = nullptr +// CHECK: <--- urEnqueueKernelLaunchWithArgsExp // CHECK: -> UR_RESULT_SUCCESS // -// CHECK-NOT: <--- urEnqueueKernelLaunch({{.*}} .phEvent = nullptr -// CHECK: <--- urEnqueueKernelLaunch +// CHECK-NOT: <--- urEnqueueKernelLaunchWithArgsExp({{.*}} .phEvent = nullptr +// CHECK: <--- urEnqueueKernelLaunchWithArgsExp // CHECK: -> UR_RESULT_SUCCESS // // RegularQueue @@ -74,7 +74,7 @@ // CHECK: -> UR_RESULT_SUCCESS // // Level-zero backend doesn't use urEnqueueUSMFill -// CHECK-L0: <--- urEnqueueKernelLaunch +// CHECK-L0: <--- urEnqueueKernelLaunchWithArgsExp // CHECK-L0: .phEvent = {{[0-9a-f]+}} // CHECK-OTHER: <--- urEnqueueUSMFill({{.*}} .phEvent = {{[0-9a-f]+}} // CHECK: -> UR_RESULT_SUCCESS @@ -96,12 +96,12 @@ // CHECK: <--- urEnqueueEventsWaitWithBarrier // CHECK: -> UR_RESULT_SUCCESS // -// CHECK-NOT: <--- urEnqueueKernelLaunch({{.*}} .phEvent = nullptr -// CHECK: <--- urEnqueueKernelLaunch +// CHECK-NOT: <--- urEnqueueKernelLaunchWithArgsExp({{.*}} .phEvent = nullptr +// CHECK: <--- urEnqueueKernelLaunchWithArgsExp // CHECK: -> UR_RESULT_SUCCESS // -// CHECK-NOT: <--- urEnqueueKernelLaunch({{.*}} .phEvent = nullptr -// CHECK: <--- urEnqueueKernelLaunch +// CHECK-NOT: <--- urEnqueueKernelLaunchWithArgsExp({{.*}} .phEvent = nullptr +// CHECK: <--- urEnqueueKernelLaunchWithArgsExp // CHECK: -> UR_RESULT_SUCCESS // // RegularQueue diff --git a/sycl/test-e2e/KernelAndProgram/disable-caching.cpp b/sycl/test-e2e/KernelAndProgram/disable-caching.cpp index 3f8482f4bc3df..7d283372e2ae6 100644 --- a/sycl/test-e2e/KernelAndProgram/disable-caching.cpp +++ b/sycl/test-e2e/KernelAndProgram/disable-caching.cpp @@ -33,7 +33,7 @@ int main() { // CHECK-NOT: <--- urProgramRetain // CHECK: <--- urKernelCreate // CHECK-NOT: <--- urKernelRetain - // CHECK: <--- urEnqueueKernelLaunch + // CHECK: <--- urEnqueueKernelLaunchWithArgsExp // CHECK: <--- urProgramRelease // CHECK: <--- urKernelRelease // CHECK: <--- urEventWait @@ -45,7 +45,7 @@ int main() { // CHECK-CACHE: <--- urKernelCreate // CHECK-CACHE: <--- urKernelRetain // CHECK-CACHE-NOT: <--- urKernelCreate - // CHECK-CACHE: <--- urEnqueueKernelLaunch + // CHECK-CACHE: <--- urEnqueueKernelLaunchWithArgsExp // CHECK-CACHE-NOT: <--- urProgramRelease // CHECK-CACHE: <--- urEventWait // iteration 1: @@ -59,7 +59,7 @@ int main() { // CHECK-NOT: <--- urProgramRetain // CHECK: <--- urKernelCreate // CHECK-NOT: <--- urKernelRetain - // CHECK: <--- urEnqueueKernelLaunch + // CHECK: <--- urEnqueueKernelLaunchWithArgsExp // CHECK: <--- urKernelRelease // CHECK: <--- urProgramRelease // CHECK: <--- urEventWait @@ -70,7 +70,7 @@ int main() { // CHECK-CACHE: <--- urKernelCreate // CHECK-CACHE: <--- urKernelRetain // CHECK-CACHE-NOT: <--- urKernelCreate - // CHECK-CACHE: <--- urEnqueueKernelLaunch + // CHECK-CACHE: <--- urEnqueueKernelLaunchWithArgsExp // CHECK-CACHE: <--- urKernelRelease // CHECK-CACHE: <--- urProgramRelease // CHECK-CACHE: <--- urEventWait @@ -79,7 +79,7 @@ int main() { // CHECK-NOT: <--- urProgramRetain // CHECK: <--- urKernelCreate // CHECK-NOT: <--- urKernelRetain - // CHECK: <--- urEnqueueKernelLaunch + // CHECK: <--- urEnqueueKernelLaunchWithArgsExp // CHECK: <--- urKernelRelease // CHECK: <--- urProgramRelease // CHECK: <--- urEventWait @@ -90,7 +90,7 @@ int main() { // CHECK-CACHE: <--- urKernelCreate // CHECK-CACHE: <--- urKernelRetain // CHECK-CACHE-NOT: <--- urKernelCreate - // CHECK-CACHE: <--- urEnqueueKernelLaunch + // CHECK-CACHE: <--- urEnqueueKernelLaunchWithArgsExp // CHECK-CACHE: <--- urKernelRelease // CHECK-CACHE: <--- urProgramRelease // CHECK-CACHE: <--- urEventWait diff --git a/sycl/test-e2e/Scheduler/HostAccDestruction.cpp b/sycl/test-e2e/Scheduler/HostAccDestruction.cpp index fd9465935dfe1..30e98974b5f1d 100644 --- a/sycl/test-e2e/Scheduler/HostAccDestruction.cpp +++ b/sycl/test-e2e/Scheduler/HostAccDestruction.cpp @@ -32,5 +32,5 @@ int main() { } // CHECK:host acc destructor call -// CHECK: <--- urEnqueueKernelLaunch +// CHECK: <--- urEnqueueKernelLaunchWithArgsExp // CHECK:end of scope diff --git a/sycl/test-e2e/Scheduler/InOrderQueueDeps.cpp b/sycl/test-e2e/Scheduler/InOrderQueueDeps.cpp index 0c0e1750805f2..df8062d165ccb 100644 --- a/sycl/test-e2e/Scheduler/InOrderQueueDeps.cpp +++ b/sycl/test-e2e/Scheduler/InOrderQueueDeps.cpp @@ -36,15 +36,15 @@ int main() { // Sequential submissions to the same in-order queue should not result in any // event dependencies. - // CHECK: <--- urEnqueueKernelLaunch + // CHECK: <--- urEnqueueKernelLaunchWithArgsExp // CHECK-SAME: .numEventsInWaitList = 0 submitKernel(InOrderQueueA, Buf); - // CHECK: <--- urEnqueueKernelLaunch + // CHECK: <--- urEnqueueKernelLaunchWithArgsExp // CHECK-SAME: .numEventsInWaitList = 0 submitKernel(InOrderQueueA, Buf); // Submisssion to a different in-order queue should explicitly depend on the // previous command group. - // CHECK: <--- urEnqueueKernelLaunch + // CHECK: <--- urEnqueueKernelLaunchWithArgsExp // CHECK-SAME: .numEventsInWaitList = 1 submitKernel(InOrderQueueB, Buf); diff --git a/sycl/test-e2e/SpecConstants/2020/image_selection.cpp b/sycl/test-e2e/SpecConstants/2020/image_selection.cpp index d473098defeb1..bc2f5901ec8c4 100644 --- a/sycl/test-e2e/SpecConstants/2020/image_selection.cpp +++ b/sycl/test-e2e/SpecConstants/2020/image_selection.cpp @@ -65,40 +65,40 @@ int main() { // submission depending on whether spec const value was set or not. a. In the // case when we select image where specialization constants are replaced with // default value - specialization constant buffer is not created and we set - // nullptr in urKernelSetArgMemObj (4th parameter) b. In the case when we - // select regular image - specialization constant buffer is created and we set - // a real pointer in urKernelSetArgMemObj. + // nullptr in urEnqueueKernelLaunchWithArgsExp. In the case when we select + // regular image - specialization constant buffer is created and we set a + // real pointer in urEnqueueKernelLaunchWithArgsExp. // CHECK-DEFAULT: Submission 0 - // CHECK-DEFAULT: <--- urKernelSetArgMemObj({{.*}}, .hArgValue = {{(0x)?[0-9,a-f,A-F]+}}) -> UR_RESULT_SUCCESS; + // CHECK-DEFAULT: <--- urEnqueueKernelLaunchWithArgsExp{{.*}}KERNEL_ARG_TYPE_MEM_OBJ,{{.*}}.hMem = {{(0x)?[0-9,a-f,A-F]+}}{{.*}} -> UR_RESULT_SUCCESS; // CHECK-DEFAULT: Default value of specialization constant was used. // CHECK-DEFAULT: Submission 1 - // CHECK-DEFAULT: <--- urKernelSetArgMemObj({{.*}}, .hArgValue = {{(0x)?[0-9,a-f,A-F]+}}) -> UR_RESULT_SUCCESS; + // CHECK-DEFAULT: <--- urEnqueueKernelLaunchWithArgsExp{{.*}}KERNEL_ARG_TYPE_MEM_OBJ,{{.*}}.hMem = {{(0x)?[0-9,a-f,A-F]+}}{{.*}} -> UR_RESULT_SUCCESS; // CHECK-DEFAULT: New specialization constant value was set. // CHECK-DEFAULT: Submission 2 - // CHECK-DEFAULT: <--- urKernelSetArgMemObj({{.*}}, .hArgValue = {{(0x)?[0-9,a-f,A-F]+}}) -> UR_RESULT_SUCCESS; + // CHECK-DEFAULT: <--- urEnqueueKernelLaunchWithArgsExp{{.*}}KERNEL_ARG_TYPE_MEM_OBJ,{{.*}}.hMem = {{(0x)?[0-9,a-f,A-F]+}}{{.*}} -> UR_RESULT_SUCCESS; // CHECK-DEFAULT: Default value of specialization constant was used. // CHECK-DEFAULT: Submission 3 - // CHECK-DEFAULT: <--- urKernelSetArgMemObj({{.*}}, .hArgValue = {{(0x)?[0-9,a-f,A-F]+}}) -> UR_RESULT_SUCCESS; + // CHECK-DEFAULT: <--- urEnqueueKernelLaunchWithArgsExp{{.*}}KERNEL_ARG_TYPE_MEM_OBJ,{{.*}}.hMem = {{(0x)?[0-9,a-f,A-F]+}}{{.*}} -> UR_RESULT_SUCCESS; // CHECK-DEFAULT: New specialization constant value was set. // CHECK-ENABLED: Submission 0 - // CHECK-ENABLED: <--- urKernelSetArgMemObj({{.*}}, .hArgValue = nullptr) -> UR_RESULT_SUCCESS; + // CHECK-ENABLED: <--- urEnqueueKernelLaunchWithArgsExp{{.*}}KERNEL_ARG_TYPE_MEM_OBJ,{{.*}}.hMem = nullptr{{.*}} -> UR_RESULT_SUCCESS; // CHECK-ENABLED: Default value of specialization constant was used. // CHECK-ENABLED: Submission 1 - // CHECK-ENABLED: <--- urKernelSetArgMemObj({{.*}}, .hArgValue = {{(0x)?[0-9,a-f,A-F]+}}) -> UR_RESULT_SUCCESS; + // CHECK-ENABLED: <--- urEnqueueKernelLaunchWithArgsExp{{.*}}KERNEL_ARG_TYPE_MEM_OBJ,{{.*}}.hMem = {{(0x)?[0-9,a-f,A-F]+}}{{.*}}-> UR_RESULT_SUCCESS; // CHECK-ENABLED: New specialization constant value was set. // CHECK-ENABLED: Submission 2 - // CHECK-ENABLED: <--- urKernelSetArgMemObj({{.*}}, .hArgValue = nullptr) -> UR_RESULT_SUCCESS; + // CHECK-ENABLED: <--- urEnqueueKernelLaunchWithArgsExp{{.*}}KERNEL_ARG_TYPE_MEM_OBJ,{{.*}}.hMem = nullptr{{.*}} -> UR_RESULT_SUCCESS; // CHECK-ENABLED: Default value of specialization constant was used. // CHECK-ENABLED: Submission 3 - // CHECK-ENABLED: <--- urKernelSetArgMemObj({{.*}}, .hArgValue = {{(0x)?[0-9,a-f,A-F]+}}) -> UR_RESULT_SUCCESS; + // CHECK-ENABLED: <--- urEnqueueKernelLaunchWithArgsExp{{.*}}KERNEL_ARG_TYPE_MEM_OBJ,{{.*}}.hMem = {{(0x)?[0-9,a-f,A-F]+}}{{.*}} -> UR_RESULT_SUCCESS; // CHECK-ENABLED: New specialization constant value was set. // CHECK-MIX: Submission 0 @@ -138,11 +138,11 @@ int main() { // default, that's why nullptr is set as 4th parameter of // urKernelSetArgMemObj. // CHECK-DEFAULT: Kernel bundle - // CHECK-DEFAULT: <--- urKernelSetArgMemObj({{.*}}, .hArgValue = {{(0x)?[0-9,a-f,A-F]+}}) -> UR_RESULT_SUCCESS; + // CHECK-DEFAULT: <--- urEnqueueKernelLaunchWithArgsExp{{.*}}KERNEL_ARG_TYPE_MEM_OBJ,{{.*}}.hMem = {{(0x)?[0-9,a-f,A-F]+}}{{.*}} -> UR_RESULT_SUCCESS; // CHECK-DEFAULT: Default value of specialization constant was used. // CHECK-ENABLED: Kernel bundle - // CHECK-ENABLED: <--- urKernelSetArgMemObj({{.*}}, .hArgValue = nullptr) -> UR_RESULT_SUCCESS; + // CHECK-ENABLED: <--- urEnqueueKernelLaunchWithArgsExp{{.*}}KERNEL_ARG_TYPE_MEM_OBJ,{{.*}}.hMem = nullptr{{.*}} -> UR_RESULT_SUCCESS; // CHECK-ENABLED: Default value of specialization constant was used. // CHECK-MIX: Kernel bundle @@ -170,7 +170,7 @@ int main() { // constants. We are verifying that by checking the 4th parameter is set to // zero. // CHECK-DEFAULT-EXPLICIT-SET: Default value was explicitly set - // CHECK-DEFAULT-EXPLICIT-SET: <--- urKernelSetArgMemObj({{.*}}, .hArgValue = nullptr) -> UR_RESULT_SUCCESS; + // CHECK-DEFAULT-EXPLICIT-SET: <--- urEnqueueKernelLaunchWithArgsExp{{.*}}KERNEL_ARG_TYPE_MEM_OBJ,{{.*}}.hMem = nullptr{{.*}} -> UR_RESULT_SUCCESS; // CHECK-DEFAULT-EXPLICIT-SET: Default value of specialization constant was used. std::cout << "Default value was explicitly set" << std::endl; Q.submit([&](sycl::handler &cgh) { @@ -193,7 +193,7 @@ int main() { // values of specialization constants. We are verifying that by checking the // 4th parameter is set to zero. // CHECK-DEFAULT-BACK-TO-DEFAULT: Changed to new value and then default value was explicitly set - // CHECK-DEFAULT-BACK-TO-DEFAULT: <--- urKernelSetArgMemObj({{.*}}, .hArgValue = nullptr) -> UR_RESULT_SUCCESS; + // CHECK-DEFAULT-BACK-TO-DEFAULT: <--- urEnqueueKernelLaunchWithArgsExp{{.*}}KERNEL_ARG_TYPE_MEM_OBJ,{{.*}}.hMem = nullptr{{.*}} -> UR_RESULT_SUCCESS; // CHECK-DEFAULT-BACK-TO-DEFAULT: Default value of specialization constant was used. std::cout << "Changed to new value and then default value was explicitly set" << std::endl; diff --git a/sycl/test-e2e/XPTI/basic_event_collection_linux.cpp b/sycl/test-e2e/XPTI/basic_event_collection_linux.cpp index 8faa2f2678356..28f071237ffa0 100644 --- a/sycl/test-e2e/XPTI/basic_event_collection_linux.cpp +++ b/sycl/test-e2e/XPTI/basic_event_collection_linux.cpp @@ -22,6 +22,13 @@ // CHECK-NEXT: UR Call Begin : urPlatformGetInfo // CHECK-NEXT: UR Call Begin : urKernelSetExecInfo // CHECK-NEXT: UR Call Begin : urKernelRetain +// CHECK: UR Call Begin : urKernelGetGroupInfo +// CHECK-NEXT: UR Call Begin : urEnqueueKernelLaunchWithArgsExp +// CHECK: UR Call Begin : urKernelCreate +// CHECK-NEXT: UR Call Begin : urPlatformGetInfo +// CHECK-NEXT: UR Call Begin : urPlatformGetInfo +// CHECK-NEXT: UR Call Begin : urKernelSetExecInfo +// CHECK-NEXT: UR Call Begin : urKernelRetain // CHECK: Node create // CHECK-DAG: queue_id : {{.*}} // CHECK-DAG: sym_line_no : {{.*}} @@ -38,9 +45,8 @@ // CHECK-DAG: from_source : false // CHECK-DAG: kernel_name : typeinfo name for main::{lambda(sycl::_V1::handler&)#1}::operator()(sycl::_V1::handler&) const::{lambda()#1} // CHECK-DAG: sycl_device : {{.*}} -// CHECK: UR Call Begin : urKernelSetArgPointer -// CHECK-NEXT: UR Call Begin : urKernelGetGroupInfo -// CHECK-NEXT: UR Call Begin : urEnqueueKernelLaunch +// CHECK: UR Call Begin : urKernelGetGroupInfo +// CHECK-NEXT: UR Call Begin : urEnqueueKernelLaunchWithArgsExp // CHECK-NEXT: Signal // CHECK-DAG: queue_id : {{.*}} // CHECK-DAG: sym_line_no : {{.*}} diff --git a/sycl/tools/xpti_helpers/usm_analyzer.hpp b/sycl/tools/xpti_helpers/usm_analyzer.hpp index 6df1c522899d9..0dc46427557bb 100644 --- a/sycl/tools/xpti_helpers/usm_analyzer.hpp +++ b/sycl/tools/xpti_helpers/usm_analyzer.hpp @@ -254,6 +254,11 @@ class USMAnalyzer { handleKernelSetArgPointer( static_cast(Data->args_data)); return; + case UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP: + handleEnqueueKernelLaunchWithArgsExp( + static_cast( + Data->args_data)); + return; default: return; } @@ -421,4 +426,18 @@ class USMAnalyzer { "kernel parameter with index = " + std::to_string(*Params->pargIndex), Ptr, 0 /*no data how it will be used in kernel*/, "kernel"); } + + static void handleEnqueueKernelLaunchWithArgsExp( + const ur_enqueue_kernel_launch_with_args_exp_params_t *Params) { + // Search for pointer args and validate the pointers + for (uint32_t i = 0; i < *Params->pnumArgs; i++) { + if ((*Params->ppArgs)[i].type == UR_EXP_KERNEL_ARG_TYPE_POINTER) { + void *Ptr = (const_cast((*Params->ppArgs)[i].value.pointer)); + CheckPointerValidness("kernel parameter with index = " + + std::to_string((*Params->ppArgs)[i].index), + Ptr, 0 /*no data how it will be used in kernel*/, + "kernel"); + } + } + } }; diff --git a/sycl/unittests/Extensions/DeviceGlobal.cpp b/sycl/unittests/Extensions/DeviceGlobal.cpp index b6fbc9bd8ab3e..6aa8c32405830 100644 --- a/sycl/unittests/Extensions/DeviceGlobal.cpp +++ b/sycl/unittests/Extensions/DeviceGlobal.cpp @@ -191,8 +191,9 @@ ur_result_t after_urEventGetInfo(void *pParams) { return UR_RESULT_SUCCESS; } -ur_result_t after_urEnqueueKernelLaunch(void *pParams) { - auto params = *static_cast(pParams); +ur_result_t after_urEnqueueKernelLaunchWithArgsExp(void *pParams) { + auto params = + *static_cast(pParams); ++KernelCallCounter; EXPECT_TRUE(DeviceGlobalInitEvent.has_value()) << "DeviceGlobalInitEvent has not been set. Kernel call " @@ -275,7 +276,7 @@ TEST_F(DeviceGlobalTest, DeviceGlobalInitBeforeUse) { REDEFINE_AFTER(urEnqueueUSMMemcpy); REDEFINE_AFTER_TEMPLATED(urEnqueueDeviceGlobalVariableWrite, true); REDEFINE_AFTER(urEventGetInfo); - REDEFINE_AFTER(urEnqueueKernelLaunch); + REDEFINE_AFTER(urEnqueueKernelLaunchWithArgsExp); // Kernel call 1. // First launch should create both init events. diff --git a/sycl/unittests/Extensions/EnqueueFunctionsEvents.cpp b/sycl/unittests/Extensions/EnqueueFunctionsEvents.cpp index a1014adbff686..9e6366ce16abf 100644 --- a/sycl/unittests/Extensions/EnqueueFunctionsEvents.cpp +++ b/sycl/unittests/Extensions/EnqueueFunctionsEvents.cpp @@ -13,6 +13,7 @@ #include using namespace sycl; +using namespace FreeFunctionEventsHelpers; namespace oneapiext = ext::oneapi::experimental; @@ -26,7 +27,7 @@ class EnqueueFunctionsEventsTests : public ::testing::Test { protected: void SetUp() override { - counter_urEnqueueKernelLaunch = 0; + counter_urEnqueueKernelLaunchWithArgsExp = 0; counter_urUSMEnqueueMemcpy = 0; counter_urUSMEnqueueFill = 0; counter_urUSMEnqueuePrefetch = 0; @@ -39,28 +40,31 @@ class EnqueueFunctionsEventsTests : public ::testing::Test { }; TEST_F(EnqueueFunctionsEventsTests, SubmitSingleTaskNoEvent) { - mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", - &redefined_urEnqueueKernelLaunch); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithArgsExp); oneapiext::submit(Q, [&](handler &CGH) { oneapiext::single_task(CGH, []() {}); }); - ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1}); } TEST_F(EnqueueFunctionsEventsTests, SingleTaskShortcutNoEvent) { - mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", - &redefined_urEnqueueKernelLaunch); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithArgsExp); oneapiext::single_task(Q, []() {}); - ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1}); } TEST_F(EnqueueFunctionsEventsTests, SubmitSingleTaskKernelNoEvent) { - mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", - &redefined_urEnqueueKernelLaunch); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithArgsExp); mock::getCallbacks().set_after_callback("urKernelGetInfo", &after_urKernelGetInfo); @@ -74,12 +78,13 @@ TEST_F(EnqueueFunctionsEventsTests, SubmitSingleTaskKernelNoEvent) { oneapiext::submit(Q, [&](handler &CGH) { oneapiext::single_task(CGH, Kernel); }); - ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1}); } TEST_F(EnqueueFunctionsEventsTests, SingleTaskShortcutKernelNoEvent) { - mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", - &redefined_urEnqueueKernelLaunch); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithArgsExp); mock::getCallbacks().set_after_callback("urKernelGetInfo", &after_urKernelGetInfo); @@ -93,32 +98,35 @@ TEST_F(EnqueueFunctionsEventsTests, SingleTaskShortcutKernelNoEvent) { oneapiext::single_task(Q, Kernel); - ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1}); } TEST_F(EnqueueFunctionsEventsTests, SubmitRangeParallelForNoEvent) { - mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", - &redefined_urEnqueueKernelLaunch); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithArgsExp); oneapiext::submit(Q, [&](handler &CGH) { oneapiext::parallel_for(CGH, range<1>{32}, [](item<1>) {}); }); - ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1}); } TEST_F(EnqueueFunctionsEventsTests, RangeParallelForShortcutNoEvent) { - mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", - &redefined_urEnqueueKernelLaunch); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithArgsExp); oneapiext::parallel_for(Q, range<1>{32}, [](item<1>) {}); - ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1}); } TEST_F(EnqueueFunctionsEventsTests, SubmitRangeParallelForKernelNoEvent) { - mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", - &redefined_urEnqueueKernelLaunch); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithArgsExp); mock::getCallbacks().set_after_callback("urKernelGetInfo", &after_urKernelGetInfo); @@ -133,12 +141,13 @@ TEST_F(EnqueueFunctionsEventsTests, SubmitRangeParallelForKernelNoEvent) { oneapiext::parallel_for(CGH, range<1>{32}, Kernel); }); - ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1}); } TEST_F(EnqueueFunctionsEventsTests, RangeParallelForShortcutKernelNoEvent) { - mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", - &redefined_urEnqueueKernelLaunch); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithArgsExp); mock::getCallbacks().set_after_callback("urKernelGetInfo", &after_urKernelGetInfo); @@ -152,34 +161,37 @@ TEST_F(EnqueueFunctionsEventsTests, RangeParallelForShortcutKernelNoEvent) { oneapiext::parallel_for(Q, range<1>{32}, Kernel); - ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1}); } TEST_F(EnqueueFunctionsEventsTests, SubmitNDLaunchNoEvent) { - mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", - &redefined_urEnqueueKernelLaunch); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithArgsExp); oneapiext::submit(Q, [&](handler &CGH) { oneapiext::nd_launch( CGH, nd_range<1>{range<1>{32}, range<1>{32}}, [](nd_item<1>) {}); }); - ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1}); } TEST_F(EnqueueFunctionsEventsTests, NDLaunchShortcutNoEvent) { - mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", - &redefined_urEnqueueKernelLaunch); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithArgsExp); oneapiext::nd_launch(Q, nd_range<1>{range<1>{32}, range<1>{32}}, [](nd_item<1>) {}); - ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1}); } TEST_F(EnqueueFunctionsEventsTests, SubmitNDLaunchKernelNoEvent) { - mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", - &redefined_urEnqueueKernelLaunch); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithArgsExp); mock::getCallbacks().set_after_callback("urKernelGetInfo", &after_urKernelGetInfo); @@ -194,12 +206,13 @@ TEST_F(EnqueueFunctionsEventsTests, SubmitNDLaunchKernelNoEvent) { oneapiext::nd_launch(CGH, nd_range<1>{range<1>{32}, range<1>{32}}, Kernel); }); - ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1}); } TEST_F(EnqueueFunctionsEventsTests, NDLaunchShortcutKernelNoEvent) { - mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", - &redefined_urEnqueueKernelLaunch); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithArgsExp); mock::getCallbacks().set_after_callback("urKernelGetInfo", &after_urKernelGetInfo); @@ -213,7 +226,7 @@ TEST_F(EnqueueFunctionsEventsTests, NDLaunchShortcutKernelNoEvent) { oneapiext::nd_launch(Q, nd_range<1>{range<1>{32}, range<1>{32}}, Kernel); - ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1}); } TEST_F(EnqueueFunctionsEventsTests, SubmitMemcpyNoEvent) { @@ -375,8 +388,9 @@ TEST_F(EnqueueFunctionsEventsTests, MemAdviseShortcutNoEvent) { TEST_F(EnqueueFunctionsEventsTests, BarrierBeforeHostTask) { // Special test for case where host_task need an event after, so a barrier is // enqueued to create a usable event. - mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", - &redefined_urEnqueueKernelLaunch); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithArgsExp); mock::getCallbacks().set_after_callback( "urEnqueueEventsWaitWithBarrier", &after_urEnqueueEventsWaitWithBarrier); @@ -388,7 +402,7 @@ TEST_F(EnqueueFunctionsEventsTests, BarrierBeforeHostTask) { [&]() { HostTaskTimestamp = std::chrono::steady_clock::now(); }); }).wait(); - ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1}); ASSERT_EQ(counter_urEnqueueEventsWaitWithBarrier, size_t{1}); ASSERT_TRUE(HostTaskTimestamp > timestamp_urEnqueueEventsWaitWithBarrier); } diff --git a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp index be5302a1c12a4..d451e51f99652 100644 --- a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp +++ b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp @@ -15,6 +15,8 @@ #define __DPCPP_ENABLE_UNFINISHED_KHR_EXTENSIONS #include +using namespace FreeFunctionEventsHelpers; + class TestFunctor { public: void operator()() const {} @@ -77,7 +79,7 @@ class FreeFunctionCommandsEventsTests : public ::testing::Test { protected: void SetUp() override { - counter_urEnqueueKernelLaunch = 0; + counter_urEnqueueKernelLaunchWithArgsExp = 0; counter_urUSMEnqueueMemcpy = 0; counter_urUSMEnqueueFill = 0; counter_urUSMEnqueuePrefetch = 0; @@ -90,26 +92,29 @@ class FreeFunctionCommandsEventsTests : public ::testing::Test { }; TEST_F(FreeFunctionCommandsEventsTests, SubmitLaunchTaskNoEvent) { - mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", - &redefined_urEnqueueKernelLaunch); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithArgsExp); sycl::khr::submit(Queue, [&](sycl::handler &Handler) { sycl::khr::launch_task(Handler, TestFunctor()); }); - ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1}); } TEST_F(FreeFunctionCommandsEventsTests, LaunchTaskShortcutNoEvent) { - mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", - &redefined_urEnqueueKernelLaunch); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithArgsExp); sycl::khr::launch_task(Queue, TestFunctor()); - ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1}); } TEST_F(FreeFunctionCommandsEventsTests, SubmitLaunchTaskKernelNoEvent) { - mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", - &redefined_urEnqueueKernelLaunch); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithArgsExp); mock::getCallbacks().set_after_callback("urKernelGetInfo", &after_urKernelGetInfo); auto KID = sycl::get_kernel_id(); @@ -123,12 +128,13 @@ TEST_F(FreeFunctionCommandsEventsTests, SubmitLaunchTaskKernelNoEvent) { sycl::khr::launch_task(Handler, Kernel); }); - ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1}); } TEST_F(FreeFunctionCommandsEventsTests, LaunchTaskShortcutKernelNoEvent) { - mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", - &redefined_urEnqueueKernelLaunch); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithArgsExp); mock::getCallbacks().set_after_callback("urKernelGetInfo", &after_urKernelGetInfo); @@ -142,33 +148,36 @@ TEST_F(FreeFunctionCommandsEventsTests, LaunchTaskShortcutKernelNoEvent) { sycl::khr::launch_task(Queue, Kernel); - ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1}); } TEST_F(FreeFunctionCommandsEventsTests, SubmitLaunchForNoEvent) { - mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", - &redefined_urEnqueueKernelLaunch); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithArgsExp); sycl::khr::submit(Queue, [&](sycl::handler &Handler) { sycl::khr::launch(Handler, sycl::range<1>{32}, TestFunctor()); }); - ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1}); } TEST_F(FreeFunctionCommandsEventsTests, LaunchForShortcutNoEvent) { - mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", - &redefined_urEnqueueKernelLaunch); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithArgsExp); sycl::khr::launch(Queue, sycl::range<1>{32}, TestFunctor()); - ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1}); } TEST_F(FreeFunctionCommandsEventsTests, SubmitLaunchForKernelNoEvent) { - mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", - &redefined_urEnqueueKernelLaunch); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithArgsExp); mock::getCallbacks().set_after_callback("urKernelGetInfo", &after_urKernelGetInfo); @@ -183,12 +192,13 @@ TEST_F(FreeFunctionCommandsEventsTests, SubmitLaunchForKernelNoEvent) { sycl::khr::launch(Handler, sycl::range<1>{32}, Kernel); }); - ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1}); } TEST_F(FreeFunctionCommandsEventsTests, LaunchForShortcutKernelNoEvent) { - mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", - &redefined_urEnqueueKernelLaunch); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithArgsExp); mock::getCallbacks().set_after_callback("urKernelGetInfo", &after_urKernelGetInfo); @@ -202,35 +212,38 @@ TEST_F(FreeFunctionCommandsEventsTests, LaunchForShortcutKernelNoEvent) { sycl::khr::launch(Queue, sycl::range<1>{32}, Kernel); - ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1}); } TEST_F(FreeFunctionCommandsEventsTests, SubmitLaunchGroupedNoEvent) { - mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", - &redefined_urEnqueueKernelLaunch); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithArgsExp); sycl::khr::submit(Queue, [&](sycl::handler &Handler) { sycl::khr::launch_grouped(Handler, sycl::range<1>{32}, sycl::range<1>{32}, TestFunctor()); }); - ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1}); } TEST_F(FreeFunctionCommandsEventsTests, LaunchGroupedShortcutNoEvent) { - mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", - &redefined_urEnqueueKernelLaunch); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithArgsExp); sycl::khr::launch_grouped(Queue, sycl::range<1>{32}, sycl::range<1>{32}, TestFunctor()); - ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1}); } TEST_F(FreeFunctionCommandsEventsTests, LaunchGroupedShortcutMoveKernelNoEvent) { - mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", - &redefined_urEnqueueKernelLaunch); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithArgsExp); TestMoveFunctor::MoveCtorCalls = 0; TestMoveFunctor MoveOnly; @@ -245,7 +258,7 @@ TEST_F(FreeFunctionCommandsEventsTests, std::move(MoveOnly)); ASSERT_EQ(TestMoveFunctor::MoveCtorCalls, 0); - ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1}); // Another kernel submission is queued behind a host task, // to force the scheduler-based submission. In this case, the HostKernel @@ -254,7 +267,8 @@ TEST_F(FreeFunctionCommandsEventsTests, // Replace the callback with an event based one, since the scheduler // needs to create an event internally mock::getCallbacks().set_replace_callback( - "urEnqueueKernelLaunch", &redefined_urEnqueueKernelLaunchWithEvent); + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithEvent); Queue.submit([&](sycl::handler &CGH) { CGH.host_task([&] { @@ -282,8 +296,9 @@ TEST_F(FreeFunctionCommandsEventsTests, } TEST_F(FreeFunctionCommandsEventsTests, SubmitLaunchGroupedKernelNoEvent) { - mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", - &redefined_urEnqueueKernelLaunch); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithArgsExp); mock::getCallbacks().set_after_callback("urKernelGetInfo", &after_urKernelGetInfo); @@ -299,12 +314,13 @@ TEST_F(FreeFunctionCommandsEventsTests, SubmitLaunchGroupedKernelNoEvent) { Kernel); }); - ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1}); } TEST_F(FreeFunctionCommandsEventsTests, LaunchGroupedShortcutKernelNoEvent) { - mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", - &redefined_urEnqueueKernelLaunch); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithArgsExp); mock::getCallbacks().set_after_callback("urKernelGetInfo", &after_urKernelGetInfo); @@ -319,7 +335,7 @@ TEST_F(FreeFunctionCommandsEventsTests, LaunchGroupedShortcutKernelNoEvent) { sycl::khr::launch_grouped(Queue, sycl::range<1>{32}, sycl::range<1>{32}, Kernel); - ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1}); } TEST_F(FreeFunctionCommandsEventsTests, SubmitMemcpyNoEvent) { @@ -482,8 +498,9 @@ TEST_F(FreeFunctionCommandsEventsTests, MemAdviseShortcutNoEvent) { TEST_F(FreeFunctionCommandsEventsTests, BarrierBeforeHostTask) { // Special test for case where host_task need an event after, so a barrier is // enqueued to create a usable event. - mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", - &redefined_urEnqueueKernelLaunch); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithArgsExp); mock::getCallbacks().set_after_callback( "urEnqueueEventsWaitWithBarrier", &after_urEnqueueEventsWaitWithBarrier); @@ -497,7 +514,7 @@ TEST_F(FreeFunctionCommandsEventsTests, BarrierBeforeHostTask) { }) .wait(); - ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_urEnqueueKernelLaunchWithArgsExp, size_t{1}); ASSERT_EQ(counter_urEnqueueEventsWaitWithBarrier, size_t{1}); ASSERT_TRUE(HostTaskTimestamp > timestamp_urEnqueueEventsWaitWithBarrier); } diff --git a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp index 4e57f82002654..a87dda42021d1 100644 --- a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp +++ b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp @@ -9,6 +9,8 @@ #include #include +namespace FreeFunctionEventsHelpers { + inline ur_result_t after_urKernelGetInfo(void *pParams) { auto params = *static_cast(pParams); constexpr char MockKernel[] = "TestKernel"; @@ -23,10 +25,11 @@ inline ur_result_t after_urKernelGetInfo(void *pParams) { return UR_RESULT_SUCCESS; } -static thread_local size_t counter_urEnqueueKernelLaunch = 0; -inline ur_result_t redefined_urEnqueueKernelLaunch(void *pParams) { - ++counter_urEnqueueKernelLaunch; - auto params = *static_cast(pParams); +static thread_local size_t counter_urEnqueueKernelLaunchWithArgsExp = 0; +inline ur_result_t redefined_urEnqueueKernelLaunchWithArgsExp(void *pParams) { + ++counter_urEnqueueKernelLaunchWithArgsExp; + auto params = + *static_cast(pParams); EXPECT_EQ(*params.pphEvent, nullptr); return UR_RESULT_SUCCESS; } @@ -34,7 +37,8 @@ inline ur_result_t redefined_urEnqueueKernelLaunch(void *pParams) { static thread_local size_t counter_urEnqueueKernelLaunchWithEvent = 0; inline ur_result_t redefined_urEnqueueKernelLaunchWithEvent(void *pParams) { ++counter_urEnqueueKernelLaunchWithEvent; - auto params = *static_cast(pParams); + auto params = + *static_cast(pParams); EXPECT_NE(*params.pphEvent, nullptr); return UR_RESULT_SUCCESS; } @@ -79,3 +83,5 @@ inline ur_result_t after_urEnqueueEventsWaitWithBarrier(void *pParams) { timestamp_urEnqueueEventsWaitWithBarrier = std::chrono::steady_clock::now(); return UR_RESULT_SUCCESS; } + +} // namespace FreeFunctionEventsHelpers diff --git a/sycl/unittests/Extensions/USMMemcpy2D.cpp b/sycl/unittests/Extensions/USMMemcpy2D.cpp index e05164d2ac66d..bf8b5df0c4793 100644 --- a/sycl/unittests/Extensions/USMMemcpy2D.cpp +++ b/sycl/unittests/Extensions/USMMemcpy2D.cpp @@ -264,8 +264,9 @@ ur_result_t after_urKernelCreate(void *pParams) { std::string LastEnqueuedKernel; -ur_result_t after_urEnqueueKernelLaunch(void *pParams) { - auto params = *static_cast(pParams); +ur_result_t after_urEnqueueKernelLaunchWithArgsExp(void *pParams) { + auto params = + *static_cast(pParams); auto KernelIt = KernelToNameMap.find(*params.phKernel); EXPECT_TRUE(KernelIt != KernelToNameMap.end()); LastEnqueuedKernel = KernelIt->second; @@ -347,8 +348,9 @@ TEST(USMMemcpy2DTest, USMMemops2DUnsupported) { &after_urDeviceGetInfo); mock::getCallbacks().set_after_callback("urKernelCreate", &after_urKernelCreate); - mock::getCallbacks().set_after_callback("urEnqueueKernelLaunch", - &after_urEnqueueKernelLaunch); + mock::getCallbacks().set_after_callback( + "urEnqueueKernelLaunchWithArgsExp", + &after_urEnqueueKernelLaunchWithArgsExp); mock::getCallbacks().set_after_callback( "urUSMGetMemAllocInfo", &after_urUSMGetMemAllocInfo); @@ -388,8 +390,9 @@ TEST(USMMemcpy2DTest, USMFillSupportedOnly) { &after_urDeviceGetInfo); mock::getCallbacks().set_after_callback("urKernelCreate", &after_urKernelCreate); - mock::getCallbacks().set_after_callback("urEnqueueKernelLaunch", - &after_urEnqueueKernelLaunch); + mock::getCallbacks().set_after_callback( + "urEnqueueKernelLaunchWithArgsExp", + &after_urEnqueueKernelLaunchWithArgsExp); mock::getCallbacks().set_replace_callback("urEnqueueUSMFill2D", &redefine_urEnqueueUSMFill2D); mock::getCallbacks().set_after_callback( @@ -435,8 +438,9 @@ TEST(USMMemcpy2DTest, USMMemsetSupportedOnly) { &after_urDeviceGetInfo); mock::getCallbacks().set_after_callback("urKernelCreate", &after_urKernelCreate); - mock::getCallbacks().set_after_callback("urEnqueueKernelLaunch", - &after_urEnqueueKernelLaunch); + mock::getCallbacks().set_after_callback( + "urEnqueueKernelLaunchWithArgsExp", + &after_urEnqueueKernelLaunchWithArgsExp); mock::getCallbacks().set_after_callback( "urUSMGetMemAllocInfo", &after_urUSMGetMemAllocInfo); mock::getCallbacks().set_replace_callback("urEnqueueUSMFill2D", @@ -480,8 +484,9 @@ TEST(USMMemcpy2DTest, USMMemcpySupportedOnly) { &after_urDeviceGetInfo); mock::getCallbacks().set_after_callback("urKernelCreate", &after_urKernelCreate); - mock::getCallbacks().set_after_callback("urEnqueueKernelLaunch", - &after_urEnqueueKernelLaunch); + mock::getCallbacks().set_after_callback( + "urEnqueueKernelLaunchWithArgsExp", + &after_urEnqueueKernelLaunchWithArgsExp); mock::getCallbacks().set_replace_callback("urEnqueueUSMMemcpy2D", &redefine_urEnqueueUSMMemcpy2D); mock::getCallbacks().set_after_callback( diff --git a/sycl/unittests/Extensions/WorkGroupMemoryBackendArgument.cpp b/sycl/unittests/Extensions/WorkGroupMemoryBackendArgument.cpp index 97159ba3a278d..3abf2f7eca47b 100644 --- a/sycl/unittests/Extensions/WorkGroupMemoryBackendArgument.cpp +++ b/sycl/unittests/Extensions/WorkGroupMemoryBackendArgument.cpp @@ -46,16 +46,24 @@ static sycl::unittest::MockDeviceImage Img = sycl::unittest::generateDefaultImage({"WorkGroupMemoryKernel"}); static sycl::unittest::MockDeviceImageArray<1> ImgArray{&Img}; -static int urKernelSetArgLocalCalls = 0; -inline ur_result_t redefined_urKernelSetArgLocal(void *) { - ++urKernelSetArgLocalCalls; +static int LocalMemArgs = 0; +inline ur_result_t redefined_urEnqueueKernelLaunchWithArgsExp(void *pParams) { + auto params = + *static_cast(pParams); + auto Args = *params.ppArgs; + for (uint32_t i = 0; i < *params.pnumArgs; i++) { + if (Args[i].type == UR_EXP_KERNEL_ARG_TYPE_LOCAL) { + ++LocalMemArgs; + } + } return UR_RESULT_SUCCESS; } TEST(URArgumentTest, URArgumentTest) { sycl::unittest::UrMock<> Mock; - mock::getCallbacks().set_replace_callback("urKernelSetArgLocal", - &redefined_urKernelSetArgLocal); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithArgsExp); sycl::platform Platform = sycl::platform(); const sycl::device dev = Platform.get_devices()[0]; sycl::queue q{dev}; @@ -66,5 +74,5 @@ TEST(URArgumentTest, URArgumentTest) { kernel); }); q.wait(); - ASSERT_EQ(urKernelSetArgLocalCalls, 1); + ASSERT_EQ(LocalMemArgs, 1); } diff --git a/sycl/unittests/buffer/KernelArgMemObj.cpp b/sycl/unittests/buffer/KernelArgMemObj.cpp index b826e89a128b8..136f5b73843c4 100644 --- a/sycl/unittests/buffer/KernelArgMemObj.cpp +++ b/sycl/unittests/buffer/KernelArgMemObj.cpp @@ -42,13 +42,21 @@ static sycl::unittest::MockDeviceImageArray<1> ImgArray{&Img}; using namespace sycl; bool PropertyPresent = false; -ur_kernel_arg_mem_obj_properties_t PropsCopy{}; - -ur_result_t redefinedKernelSetArgMemObj(void *pParams) { - auto params = *static_cast(pParams); - PropertyPresent = *params.ppProperties != nullptr; - if (PropertyPresent) - PropsCopy = **params.ppProperties; +ur_mem_flags_t MemFlags{}; + +ur_result_t redefinedEnqueueKernelLaunchWithArgsExp(void *pParams) { + auto params = + *static_cast(pParams); + auto Args = *params.ppArgs; + for (uint32_t i = 0; i < *params.pnumArgs; i++) { + if (Args[i].type != UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ) { + continue; + } + PropertyPresent = Args[i].value.memObjTuple.flags != 0; + if (PropertyPresent) { + MemFlags = Args[i].value.memObjTuple.flags; + } + } return UR_RESULT_SUCCESS; } @@ -59,9 +67,10 @@ class BuferTestUrArgs : public ::testing::Test { protected: void SetUp() override { PropertyPresent = false; - PropsCopy = {}; - mock::getCallbacks().set_before_callback("urKernelSetArgMemObj", - &redefinedKernelSetArgMemObj); + MemFlags = 0; + mock::getCallbacks().set_before_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefinedEnqueueKernelLaunchWithArgsExp); } template @@ -80,7 +89,7 @@ class BuferTestUrArgs : public ::testing::Test { }) .wait(); ASSERT_TRUE(PropertyPresent); - EXPECT_EQ(PropsCopy.memoryAccess, ExpectedAccessMode); + EXPECT_EQ(MemFlags, ExpectedAccessMode); } protected: diff --git a/sycl/unittests/handler/SetArgForLocalAccessor.cpp b/sycl/unittests/handler/SetArgForLocalAccessor.cpp index 7a9079872ce36..aef26577bc50b 100644 --- a/sycl/unittests/handler/SetArgForLocalAccessor.cpp +++ b/sycl/unittests/handler/SetArgForLocalAccessor.cpp @@ -21,9 +21,15 @@ namespace { size_t LocalBufferArgSize = 0; -ur_result_t redefined_urKernelSetArgLocal(void *pParams) { - auto params = *static_cast(pParams); - LocalBufferArgSize = *params.pargSize; +ur_result_t redefined_urEnqueueKernelLaunchWithArgsExp(void *pParams) { + auto params = + *static_cast(pParams); + auto Args = *params.ppArgs; + for (uint32_t i = 0; i < *params.pnumArgs; i++) { + if (Args[i].type == UR_EXP_KERNEL_ARG_TYPE_LOCAL) { + LocalBufferArgSize = Args[i].size; + } + } return UR_RESULT_SUCCESS; } @@ -31,8 +37,9 @@ ur_result_t redefined_urKernelSetArgLocal(void *pParams) { TEST(HandlerSetArg, LocalAccessor) { sycl::unittest::UrMock<> Mock; redefineMockForKernelInterop(Mock); - mock::getCallbacks().set_replace_callback("urKernelSetArgLocal", - &redefined_urKernelSetArgLocal); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithArgsExp); constexpr size_t Size = 128; sycl::queue Q; diff --git a/sycl/unittests/scheduler/FailedCommands.cpp b/sycl/unittests/scheduler/FailedCommands.cpp index 8207ade72f0d0..9104c1d9e4a10 100644 --- a/sycl/unittests/scheduler/FailedCommands.cpp +++ b/sycl/unittests/scheduler/FailedCommands.cpp @@ -76,7 +76,7 @@ ur_result_t failingUrCall(void *) { return UR_RESULT_ERROR_UNKNOWN; } TEST_F(SchedulerTest, FailedKernelException) { unittest::UrMock<> Mock; - mock::getCallbacks().set_before_callback("urEnqueueKernelLaunch", + mock::getCallbacks().set_before_callback("urEnqueueKernelLaunchWithArgsExp", &failingUrCall); RunWithFailedCommandsAndCheck(true, 0); } @@ -94,7 +94,8 @@ ur_event_handle_t DummyEvent = mock::createDummyHandle(); inline ur_result_t failedEnqueueKernelLaunchWithDummy(void *pParams) { DummyEventReturned = true; - auto params = *static_cast(pParams); + auto params = + *static_cast(pParams); **params.pphEvent = DummyEvent; return UR_RESULT_ERROR_UNKNOWN; } @@ -120,7 +121,7 @@ TEST(FailedCommandsTest, CheckUREventReleaseWithKernel) { DummyEventReleaseAttempt = false; DummyEventReturned = false; sycl::unittest::UrMock<> Mock; - mock::getCallbacks().set_before_callback("urEnqueueKernelLaunch", + mock::getCallbacks().set_before_callback("urEnqueueKernelLaunchWithArgsExp", &failedEnqueueKernelLaunchWithDummy); mock::getCallbacks().set_before_callback("urEventRelease", &checkDummyInEventRelease); diff --git a/sycl/unittests/scheduler/InOrderQueueDeps.cpp b/sycl/unittests/scheduler/InOrderQueueDeps.cpp index 6467ff7a2809d..fa33c3a06d9bc 100644 --- a/sycl/unittests/scheduler/InOrderQueueDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueDeps.cpp @@ -129,8 +129,9 @@ TEST_P(SchedulerTest, InOrderQueueIsolatedDeps) { std::vector KernelEventListSize; -inline ur_result_t customEnqueueKernelLaunch(void *pParams) { - auto params = *static_cast(pParams); +inline ur_result_t customEnqueueKernelLaunchWithArgsExp(void *pParams) { + auto params = + *static_cast(pParams); KernelEventListSize.push_back(*params.pnumEventsInWaitList); return UR_RESULT_SUCCESS; } @@ -139,8 +140,9 @@ TEST_P(SchedulerTest, TwoInOrderQueuesOnSameContext) { KernelEventListSize.clear(); sycl::unittest::UrMock<> Mock; bool UseShortcutFunction = GetParam(); - mock::getCallbacks().set_before_callback("urEnqueueKernelLaunch", - &customEnqueueKernelLaunch); + mock::getCallbacks().set_before_callback( + "urEnqueueKernelLaunchWithArgsExp", + &customEnqueueKernelLaunchWithArgsExp); sycl::platform Plt = sycl::platform(); @@ -166,8 +168,9 @@ TEST_P(SchedulerTest, InOrderQueueNoSchedulerPath) { KernelEventListSize.clear(); sycl::unittest::UrMock<> Mock; bool UseShortcutFunction = GetParam(); - mock::getCallbacks().set_before_callback("urEnqueueKernelLaunch", - &customEnqueueKernelLaunch); + mock::getCallbacks().set_before_callback( + "urEnqueueKernelLaunchWithArgsExp", + &customEnqueueKernelLaunchWithArgsExp); sycl::platform Plt = sycl::platform(); diff --git a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp index 7dae968013948..c79c6ecdbba51 100644 --- a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp @@ -51,8 +51,9 @@ TEST_F(SchedulerTest, InOrderQueueHostTaskDeps) { enum class CommandType { KERNEL = 1, MEMSET = 2, HOST_TASK = 3 }; std::vector> ExecutedCommands; -inline ur_result_t customEnqueueKernelLaunch(void *pParams) { - auto params = *static_cast(pParams); +inline ur_result_t customEnqueueKernelLaunchWithArgsExp(void *pParams) { + auto params = + *static_cast(pParams); ExecutedCommands.push_back( {CommandType::KERNEL, *params.pnumEventsInWaitList}); return UR_RESULT_SUCCESS; @@ -68,8 +69,9 @@ inline ur_result_t customEnqueueUSMFill(void *pParams) { TEST_F(SchedulerTest, InOrderQueueCrossDeps) { ExecutedCommands.clear(); sycl::unittest::UrMock<> Mock; - mock::getCallbacks().set_before_callback("urEnqueueKernelLaunch", - &customEnqueueKernelLaunch); + mock::getCallbacks().set_before_callback( + "urEnqueueKernelLaunchWithArgsExp", + &customEnqueueKernelLaunchWithArgsExp); mock::getCallbacks().set_before_callback("urEnqueueUSMFill", &customEnqueueUSMFill); @@ -121,8 +123,9 @@ TEST_F(SchedulerTest, InOrderQueueCrossDeps) { TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) { ExecutedCommands.clear(); sycl::unittest::UrMock<> Mock; - mock::getCallbacks().set_before_callback("urEnqueueKernelLaunch", - &customEnqueueKernelLaunch); + mock::getCallbacks().set_before_callback( + "urEnqueueKernelLaunchWithArgsExp", + &customEnqueueKernelLaunchWithArgsExp); mock::getCallbacks().set_before_callback("urEnqueueUSMFill", &customEnqueueUSMFill); @@ -166,8 +169,9 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) { TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncsParallelFor) { ExecutedCommands.clear(); sycl::unittest::UrMock<> Mock; - mock::getCallbacks().set_before_callback("urEnqueueKernelLaunch", - &customEnqueueKernelLaunch); + mock::getCallbacks().set_before_callback( + "urEnqueueKernelLaunchWithArgsExp", + &customEnqueueKernelLaunchWithArgsExp); sycl::platform Plt = sycl::platform(); diff --git a/sycl/unittests/scheduler/RequiredWGSize.cpp b/sycl/unittests/scheduler/RequiredWGSize.cpp index 675036ec30081..851ee3ee32d38 100644 --- a/sycl/unittests/scheduler/RequiredWGSize.cpp +++ b/sycl/unittests/scheduler/RequiredWGSize.cpp @@ -37,8 +37,9 @@ static ur_result_t redefinedKernelGetGroupInfo(void *pParams) { return UR_RESULT_SUCCESS; } -static ur_result_t redefinedEnqueueKernelLaunch(void *pParams) { - auto params = *static_cast(pParams); +static ur_result_t redefinedEnqueueKernelLaunchWithArgsExp(void *pParams) { + auto params = + *static_cast(pParams); if (*params.ppLocalWorkSize) { IncomingLocalSize[0] = (*params.ppLocalWorkSize)[0]; IncomingLocalSize[1] = (*params.ppLocalWorkSize)[1]; @@ -56,8 +57,9 @@ static void reset() { static void performChecks() { sycl::unittest::UrMock<> Mock; sycl::platform Plt = sycl::platform(); - mock::getCallbacks().set_before_callback("urEnqueueKernelLaunch", - &redefinedEnqueueKernelLaunch); + mock::getCallbacks().set_before_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefinedEnqueueKernelLaunchWithArgsExp); mock::getCallbacks().set_before_callback("urKernelGetGroupInfo", &redefinedKernelGetGroupInfo); diff --git a/sycl/unittests/thread_safety/InteropKernelEnqueue.cpp b/sycl/unittests/thread_safety/InteropKernelEnqueue.cpp index ca54cf0d908d6..935ca1eba18ac 100644 --- a/sycl/unittests/thread_safety/InteropKernelEnqueue.cpp +++ b/sycl/unittests/thread_safety/InteropKernelEnqueue.cpp @@ -24,23 +24,33 @@ constexpr std::size_t LaunchCount = 8; uint32_t LastArgSet = -1; std::size_t LastThread = -1; -ur_result_t redefined_urKernelSetArgValue(void *pParams) { - auto params = *static_cast(pParams); - EXPECT_EQ((LastArgSet + 1) % NArgs, *params.pargIndex); - LastArgSet = *params.pargIndex; - std::size_t ArgValue = *static_cast(*params.ppArgValue); - if (*params.pargIndex == 0) - LastThread = ArgValue; - else - EXPECT_EQ(LastThread, ArgValue); +ur_result_t redefined_urEnqueueKernelLaunchWithArgsExp(void *pParams) { + auto params = + *static_cast(pParams); + auto Args = *params.ppArgs; + for (uint32_t i = 0; i < *params.pnumArgs; i++) { + if (Args[i].type != UR_EXP_KERNEL_ARG_TYPE_VALUE) { + continue; + } + auto ArgIndex = Args[i].index; + EXPECT_EQ((LastArgSet + 1) % NArgs, ArgIndex); + LastArgSet = ArgIndex; + std::size_t ArgValue = + *static_cast(Args[i].value.pointer); + if (ArgIndex == 0) + LastThread = ArgValue; + else + EXPECT_EQ(LastThread, ArgValue); + } return UR_RESULT_SUCCESS; } TEST(KernelEnqueue, InteropKernel) { unittest::UrMock<> Mock; redefineMockForKernelInterop(Mock); - mock::getCallbacks().set_replace_callback("urKernelSetArgValue", - &redefined_urKernelSetArgValue); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefined_urEnqueueKernelLaunchWithArgsExp); platform Plt = sycl::platform(); queue Q; diff --git a/sycl/unittests/xpti_trace/QueueApiFailures.cpp b/sycl/unittests/xpti_trace/QueueApiFailures.cpp index 1e88143774e21..303567ddcf953 100644 --- a/sycl/unittests/xpti_trace/QueueApiFailures.cpp +++ b/sycl/unittests/xpti_trace/QueueApiFailures.cpp @@ -30,7 +30,7 @@ inline ur_result_t redefinedAdapterGetLastError(void *) { return UR_RESULT_ERROR_INVALID_VALUE; } -ur_result_t redefinedEnqueueKernelLaunch(void *) { +ur_result_t redefinedEnqueueKernelLaunchWithArgsExp(void *) { return UR_RESULT_ERROR_ADAPTER_SPECIFIC; } @@ -92,8 +92,9 @@ class QueueApiFailures : public ::testing::Test { }; TEST_F(QueueApiFailures, QueueSubmit) { - mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", - &redefinedEnqueueKernelLaunch); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefinedEnqueueKernelLaunchWithArgsExp); mock::getCallbacks().set_replace_callback("urAdapterGetLastError", &redefinedAdapterGetLastError); sycl::queue Q; @@ -116,8 +117,9 @@ TEST_F(QueueApiFailures, QueueSubmit) { } TEST_F(QueueApiFailures, QueueSingleTask) { - mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", - &redefinedEnqueueKernelLaunch); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefinedEnqueueKernelLaunchWithArgsExp); mock::getCallbacks().set_replace_callback("urAdapterGetLastError", &redefinedAdapterGetLastError); sycl::queue Q; @@ -319,8 +321,9 @@ TEST_F(QueueApiFailures, QueueMemAdvise) { } TEST_F(QueueApiFailures, QueueParallelFor) { - mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", - &redefinedEnqueueKernelLaunch); + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunchWithArgsExp", + &redefinedEnqueueKernelLaunchWithArgsExp); mock::getCallbacks().set_replace_callback("urAdapterGetLastError", &redefinedAdapterGetLastError); sycl::queue Q; @@ -451,7 +454,8 @@ ur_result_t redefinedEnqueueKernelLaunchWithStatus(void *) { TEST_F(QueueApiFailures, QueueKernelAsync) { mock::getCallbacks().set_replace_callback( - "urEnqueueKernelLaunch", &redefinedEnqueueKernelLaunchWithStatus); + "urEnqueueKernelLaunchWithArgsExp", + &redefinedEnqueueKernelLaunchWithStatus); mock::getCallbacks().set_replace_callback("urAdapterGetLastError", &redefinedAdapterGetLastError); diff --git a/unified-runtime/include/ur_api.h b/unified-runtime/include/ur_api.h index 906c32b837bbe..3d1136cfe2d1f 100644 --- a/unified-runtime/include/ur_api.h +++ b/unified-runtime/include/ur_api.h @@ -475,6 +475,8 @@ typedef enum ur_function_t { UR_FUNCTION_MEMORY_EXPORT_EXPORT_MEMORY_HANDLE_EXP = 287, /// Enumerator for ::urBindlessImagesSupportsImportingHandleTypeExp UR_FUNCTION_BINDLESS_IMAGES_SUPPORTS_IMPORTING_HANDLE_TYPE_EXP = 288, + /// Enumerator for ::urEnqueueKernelLaunchWithArgsExp + UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP = 289, /// @cond UR_FUNCTION_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -588,6 +590,8 @@ typedef enum ur_structure_type_t { UR_STRUCTURE_TYPE_EXP_ENQUEUE_NATIVE_COMMAND_PROPERTIES = 0x3000, /// ::ur_exp_enqueue_ext_properties_t UR_STRUCTURE_TYPE_EXP_ENQUEUE_EXT_PROPERTIES = 0x4000, + /// ::ur_exp_kernel_arg_properties_t + UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES = 0x5000, /// @cond UR_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -12895,6 +12899,166 @@ UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( /// propName. size_t *pPropSizeRet); +#if !defined(__GNUC__) +#pragma endregion +#endif +// Intel 'oneAPI' Unified Runtime Experimental API for setting args at kernel +// launch +#if !defined(__GNUC__) +#pragma region enqueue_kernel_launch_with_args_(experimental) +#endif +/////////////////////////////////////////////////////////////////////////////// +/// @brief What kind of kernel arg is this +typedef enum ur_exp_kernel_arg_type_t { + /// Kernel arg is a value. + UR_EXP_KERNEL_ARG_TYPE_VALUE = 0, + /// Kernel arg is a pointer. + UR_EXP_KERNEL_ARG_TYPE_POINTER = 1, + /// Kernel arg is a memory object. + UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ = 2, + /// Kernel arg is a local allocation. + UR_EXP_KERNEL_ARG_TYPE_LOCAL = 3, + /// Kernel arg is a sampler. + UR_EXP_KERNEL_ARG_TYPE_SAMPLER = 4, + /// @cond + UR_EXP_KERNEL_ARG_TYPE_FORCE_UINT32 = 0x7fffffff + /// @endcond + +} ur_exp_kernel_arg_type_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Mem obj/properties tuple +typedef struct ur_exp_kernel_arg_mem_obj_tuple_t { + /// [in] Handle of a memory object + ur_mem_handle_t hMem; + /// [in] Memory flags to associate with `hMem`. Allowed values are: + /// ::UR_MEM_FLAG_READ_WRITE, ::UR_MEM_FLAG_WRITE_ONLY, + /// ::UR_MEM_FLAG_READ_ONLY. + ur_mem_flags_t flags; + +} ur_exp_kernel_arg_mem_obj_tuple_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Typesafe container for a kernel argument value +typedef union ur_exp_kernel_arg_value_t { + /// [in] argument value represented as matching arg type. + /// The data pointed to will be copied and therefore can be reused on return. + const void *value; + /// [in] Allocation obtained by USM allocation or virtual memory mapping + /// operation, or pointer to a literal value. + const void *pointer; + /// [in] Struct containing a memory object and associated flags. + ur_exp_kernel_arg_mem_obj_tuple_t memObjTuple; + /// [in] Handle of a sampler object. + ur_sampler_handle_t sampler; + +} ur_exp_kernel_arg_value_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Kernel arg properties +typedef struct ur_exp_kernel_arg_properties_t { + /// [in] type of this structure, must be + /// ::UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES + ur_structure_type_t stype; + /// [in,out][optional] pointer to extension-specific structure + void *pNext; + /// [in] type of the kernel arg + ur_exp_kernel_arg_type_t type; + /// [in] index of the kernel arg + uint32_t index; + /// [in] size of the kernel arg + size_t size; + /// [in][tagged_by(type)] Union containing the argument value. + ur_exp_kernel_arg_value_t value; + +} ur_exp_kernel_arg_properties_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Enqueue a command to execute a kernel +/// +/// @remarks +/// _Analogues_ +/// - **clEnqueueNDRangeKernel** +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hQueue` +/// + `NULL == hKernel` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pGlobalWorkSize` +/// + `launchPropList == NULL && numPropsInLaunchPropList > 0` +/// + `pArgs == NULL && numArgs > 0` +/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION +/// + `NULL != pArgs && ::UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type` +/// - ::UR_RESULT_ERROR_INVALID_QUEUE +/// - ::UR_RESULT_ERROR_INVALID_KERNEL +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS +/// + An event in `phEventWaitList` has ::UR_EVENT_STATUS_ERROR. +/// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION +/// + `pGlobalWorkSize[0] == 0 || pGlobalWorkSize[1] == 0 || +/// pGlobalWorkSize[2] == 0` +/// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE +/// + `pLocalWorkSize && (pLocalWorkSize[0] == 0 || pLocalWorkSize[1] == +/// 0 || pLocalWorkSize[2] == 0)` +/// - ::UR_RESULT_ERROR_INVALID_VALUE +/// - ::UR_RESULT_ERROR_INVALID_KERNEL_ARGS - "The kernel argument values +/// have not been specified." +/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +/// - ::UR_RESULT_ERROR_INVALID_OPERATION +/// + If any property in `launchPropList` isn't supported by the device. +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] handle of the kernel object + ur_kernel_handle_t hKernel, + /// [in] number of dimensions, from 1 to 3, to specify the global and + /// work-group work-items + uint32_t workDim, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of workDim unsigned values that specify the + /// number of global work-items in workDim that will execute the kernel + /// function + const size_t *pGlobalWorkSize, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the number of local work-items forming a work-group that will + /// execute the kernel function. + /// If nullptr, the runtime implementation will choose the work-group size. + const size_t *pLocalWorkSize, + /// [in] Number of entries in pArgs + uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. + const ur_exp_kernel_arg_properties_t *pArgs, + /// [in] size of the launch prop list + uint32_t numPropsInLaunchPropList, + /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list + /// of launch properties + const ur_kernel_launch_property_t *launchPropList, + /// [in] size of the event wait list + uint32_t numEventsInWaitList, + /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of + /// events that must be complete before the kernel execution. + /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait + /// event. + const ur_event_handle_t *phEventWaitList, + /// [out][optional][alloc] return an event object that identifies this + /// particular kernel execution instance. If phEventWaitList and phEvent + /// are not NULL, phEvent must not refer to an element of the + /// phEventWaitList array. + ur_event_handle_t *phEvent); + #if !defined(__GNUC__) #pragma endregion #endif @@ -14497,6 +14661,26 @@ typedef struct ur_enqueue_write_host_pipe_params_t { ur_event_handle_t **pphEvent; } ur_enqueue_write_host_pipe_params_t; +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function parameters for urEnqueueKernelLaunchWithArgsExp +/// @details Each entry is a pointer to the parameter passed to the function; +/// allowing the callback the ability to modify the parameter's value +typedef struct ur_enqueue_kernel_launch_with_args_exp_params_t { + ur_queue_handle_t *phQueue; + ur_kernel_handle_t *phKernel; + uint32_t *pworkDim; + const size_t **ppGlobalWorkOffset; + const size_t **ppGlobalWorkSize; + const size_t **ppLocalWorkSize; + uint32_t *pnumArgs; + const ur_exp_kernel_arg_properties_t **ppArgs; + uint32_t *pnumPropsInLaunchPropList; + const ur_kernel_launch_property_t **plaunchPropList; + uint32_t *pnumEventsInWaitList; + const ur_event_handle_t **pphEventWaitList; + ur_event_handle_t **pphEvent; +} ur_enqueue_kernel_launch_with_args_exp_params_t; + /////////////////////////////////////////////////////////////////////////////// /// @brief Function parameters for urEnqueueEventsWaitWithBarrierExt /// @details Each entry is a pointer to the parameter passed to the function; diff --git a/unified-runtime/include/ur_api_funcs.def b/unified-runtime/include/ur_api_funcs.def index f0c92445b9238..97092258a5a46 100644 --- a/unified-runtime/include/ur_api_funcs.def +++ b/unified-runtime/include/ur_api_funcs.def @@ -133,6 +133,7 @@ _UR_API(urEnqueueDeviceGlobalVariableRead) _UR_API(urEnqueueReadHostPipe) _UR_API(urEnqueueWriteHostPipe) _UR_API(urEnqueueEventsWaitWithBarrierExt) +_UR_API(urEnqueueKernelLaunchWithArgsExp) _UR_API(urEnqueueUSMDeviceAllocExp) _UR_API(urEnqueueUSMSharedAllocExp) _UR_API(urEnqueueUSMHostAllocExp) diff --git a/unified-runtime/include/ur_ddi.h b/unified-runtime/include/ur_ddi.h index f59e15a9eb3cd..b0c95cd7509fc 100644 --- a/unified-runtime/include/ur_ddi.h +++ b/unified-runtime/include/ur_ddi.h @@ -1097,6 +1097,15 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( typedef ur_result_t(UR_APICALL *ur_pfnGetEnqueueProcAddrTable_t)( ur_api_version_t, ur_enqueue_dditable_t *); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urEnqueueKernelLaunchWithArgsExp +typedef ur_result_t(UR_APICALL *ur_pfnEnqueueKernelLaunchWithArgsExp_t)( + ur_queue_handle_t, ur_kernel_handle_t, uint32_t, const size_t *, + const size_t *, const size_t *, uint32_t, + const ur_exp_kernel_arg_properties_t *, uint32_t, + const ur_kernel_launch_property_t *, uint32_t, const ur_event_handle_t *, + ur_event_handle_t *); + /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urEnqueueUSMDeviceAllocExp typedef ur_result_t(UR_APICALL *ur_pfnEnqueueUSMDeviceAllocExp_t)( @@ -1147,6 +1156,7 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueNativeCommandExp_t)( /////////////////////////////////////////////////////////////////////////////// /// @brief Table of EnqueueExp functions pointers typedef struct ur_enqueue_exp_dditable_t { + ur_pfnEnqueueKernelLaunchWithArgsExp_t pfnKernelLaunchWithArgsExp; ur_pfnEnqueueUSMDeviceAllocExp_t pfnUSMDeviceAllocExp; ur_pfnEnqueueUSMSharedAllocExp_t pfnUSMSharedAllocExp; ur_pfnEnqueueUSMHostAllocExp_t pfnUSMHostAllocExp; diff --git a/unified-runtime/include/ur_print.h b/unified-runtime/include/ur_print.h index 7c4528ec9ea81..567960c90cfff 100644 --- a/unified-runtime/include/ur_print.h +++ b/unified-runtime/include/ur_print.h @@ -1425,6 +1425,36 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintExpPeerInfo(enum ur_exp_peer_info_t value, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_exp_kernel_arg_type_t enum +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL +urPrintExpKernelArgType(enum ur_exp_kernel_arg_type_t value, char *buffer, + const size_t buff_size, size_t *out_size); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_exp_kernel_arg_mem_obj_tuple_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintExpKernelArgMemObjTuple( + const struct ur_exp_kernel_arg_mem_obj_tuple_t params, char *buffer, + const size_t buff_size, size_t *out_size); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_exp_kernel_arg_properties_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintExpKernelArgProperties( + const struct ur_exp_kernel_arg_properties_t params, char *buffer, + const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_exp_enqueue_ext_flag_t enum /// @returns @@ -2694,6 +2724,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueWriteHostPipeParams( const struct ur_enqueue_write_host_pipe_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_enqueue_kernel_launch_with_args_exp_params_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueKernelLaunchWithArgsExpParams( + const struct ur_enqueue_kernel_launch_with_args_exp_params_t *params, + char *buffer, const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_enqueue_events_wait_with_barrier_ext_params_t struct /// @returns diff --git a/unified-runtime/include/ur_print.hpp b/unified-runtime/include/ur_print.hpp index b3de6c166ca31..d8c98a667088b 100644 --- a/unified-runtime/include/ur_print.hpp +++ b/unified-runtime/include/ur_print.hpp @@ -50,6 +50,8 @@ inline ur_result_t printFlag(std::ostream &os, uint32_t flag); template inline ur_result_t printTagged(std::ostream &os, const void *ptr, T value, size_t size); +template +inline ur_result_t printArray(std::ostream &os, const T *ptr); inline ur_result_t printStruct(std::ostream &os, const void *ptr); @@ -263,6 +265,10 @@ template <> inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_exp_peer_info_t value, size_t size); +inline ur_result_t printUnion(std::ostream &os, + const union ur_exp_kernel_arg_value_t params, + const enum ur_exp_kernel_arg_type_t tag); + template <> inline ur_result_t printFlag(std::ostream &os, uint32_t flag); @@ -594,6 +600,14 @@ operator<<(std::ostream &os, [[maybe_unused]] const struct ur_exp_command_buffer_update_kernel_launch_desc_t params); inline std::ostream &operator<<(std::ostream &os, enum ur_exp_peer_info_t value); +inline std::ostream &operator<<(std::ostream &os, + enum ur_exp_kernel_arg_type_t value); +inline std::ostream &operator<<( + std::ostream &os, + [[maybe_unused]] const struct ur_exp_kernel_arg_mem_obj_tuple_t params); +inline std::ostream & +operator<<(std::ostream &os, + [[maybe_unused]] const struct ur_exp_kernel_arg_properties_t params); inline std::ostream &operator<<(std::ostream &os, enum ur_exp_enqueue_ext_flag_t value); inline std::ostream &operator<<( @@ -1276,6 +1290,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_BINDLESS_IMAGES_SUPPORTS_IMPORTING_HANDLE_TYPE_EXP: os << "UR_FUNCTION_BINDLESS_IMAGES_SUPPORTS_IMPORTING_HANDLE_TYPE_EXP"; break; + case UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP: + os << "UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP"; + break; default: os << "unknown enumerator"; break; @@ -1445,6 +1462,9 @@ inline std::ostream &operator<<(std::ostream &os, case UR_STRUCTURE_TYPE_EXP_ENQUEUE_EXT_PROPERTIES: os << "UR_STRUCTURE_TYPE_EXP_ENQUEUE_EXT_PROPERTIES"; break; + case UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES: + os << "UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES"; + break; default: os << "unknown enumerator"; break; @@ -1761,6 +1781,12 @@ inline ur_result_t printStruct(std::ostream &os, const void *ptr) { (const ur_exp_enqueue_ext_properties_t *)ptr; printPtr(os, pstruct); } break; + + case UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES: { + const ur_exp_kernel_arg_properties_t *pstruct = + (const ur_exp_kernel_arg_properties_t *)ptr; + printPtr(os, pstruct); + } break; default: os << "unknown enumerator"; return UR_RESULT_ERROR_INVALID_ENUMERATION; @@ -10963,13 +10989,7 @@ printUnion(std::ostream &os, case UR_KERNEL_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: os << ".clusterDim = {"; - for (auto i = 0; i < 3; i++) { - if (i != 0) { - os << ", "; - } - - os << (params.clusterDim[i]); - } + ur::details::printArray<3>(os, params.clusterDim); os << "}"; break; @@ -11653,13 +11673,7 @@ operator<<(std::ostream &os, const struct ur_exp_sampler_addr_modes_t params) { os << ", "; os << ".addrModes = {"; - for (auto i = 0; i < 3; i++) { - if (i != 0) { - os << ", "; - } - - os << (params.addrModes[i]); - } + ur::details::printArray<3>(os, params.addrModes); os << "}"; os << "}"; @@ -12378,6 +12392,141 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, } } // namespace ur::details +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_exp_kernel_arg_type_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, + enum ur_exp_kernel_arg_type_t value) { + switch (value) { + case UR_EXP_KERNEL_ARG_TYPE_VALUE: + os << "UR_EXP_KERNEL_ARG_TYPE_VALUE"; + break; + case UR_EXP_KERNEL_ARG_TYPE_POINTER: + os << "UR_EXP_KERNEL_ARG_TYPE_POINTER"; + break; + case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: + os << "UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ"; + break; + case UR_EXP_KERNEL_ARG_TYPE_LOCAL: + os << "UR_EXP_KERNEL_ARG_TYPE_LOCAL"; + break; + case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: + os << "UR_EXP_KERNEL_ARG_TYPE_SAMPLER"; + break; + default: + os << "unknown enumerator"; + break; + } + return os; +} +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_exp_kernel_arg_mem_obj_tuple_t type +/// @returns +/// std::ostream & +inline std::ostream & +operator<<(std::ostream &os, + const struct ur_exp_kernel_arg_mem_obj_tuple_t params) { + os << "(struct ur_exp_kernel_arg_mem_obj_tuple_t){"; + + os << ".hMem = "; + + ur::details::printPtr(os, (params.hMem)); + + os << ", "; + os << ".flags = "; + + ur::details::printFlag(os, (params.flags)); + + os << "}"; + return os; +} +namespace ur::details { + +/////////////////////////////////////////////////////////////////////////////// +// @brief Print ur_exp_kernel_arg_value_t union +inline ur_result_t printUnion(std::ostream &os, + const union ur_exp_kernel_arg_value_t params, + const enum ur_exp_kernel_arg_type_t tag) { + os << "(union ur_exp_kernel_arg_value_t){"; + + switch (tag) { + case UR_EXP_KERNEL_ARG_TYPE_VALUE: + + os << ".value = "; + + ur::details::printPtr(os, (params.value)); + + break; + case UR_EXP_KERNEL_ARG_TYPE_POINTER: + + os << ".pointer = "; + + ur::details::printPtr(os, (params.pointer)); + + break; + case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: + + os << ".memObjTuple = "; + + os << (params.memObjTuple); + + break; + case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: + + os << ".sampler = "; + + ur::details::printPtr(os, (params.sampler)); + + break; + default: + os << ""; + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + os << "}"; + return UR_RESULT_SUCCESS; +} +} // namespace ur::details +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_exp_kernel_arg_properties_t type +/// @returns +/// std::ostream & +inline std::ostream & +operator<<(std::ostream &os, + const struct ur_exp_kernel_arg_properties_t params) { + os << "(struct ur_exp_kernel_arg_properties_t){"; + + os << ".stype = "; + + os << (params.stype); + + os << ", "; + os << ".pNext = "; + + ur::details::printStruct(os, (params.pNext)); + + os << ", "; + os << ".type = "; + + os << (params.type); + + os << ", "; + os << ".index = "; + + os << (params.index); + + os << ", "; + os << ".size = "; + + os << (params.size); + + os << ", "; + os << ".value = "; + ur::details::printUnion(os, (params.value), params.type); + + os << "}"; + return os; +} /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_exp_enqueue_ext_flag_t type /// @returns @@ -17027,6 +17176,114 @@ inline std::ostream &operator<<( return os; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the +/// ur_enqueue_kernel_launch_with_args_exp_params_t type +/// @returns +/// std::ostream & +inline std::ostream & +operator<<(std::ostream &os, [[maybe_unused]] const struct + ur_enqueue_kernel_launch_with_args_exp_params_t *params) { + + os << ".hQueue = "; + + ur::details::printPtr(os, *(params->phQueue)); + + os << ", "; + os << ".hKernel = "; + + ur::details::printPtr(os, *(params->phKernel)); + + os << ", "; + os << ".workDim = "; + + os << *(params->pworkDim); + + os << ", "; + os << ".pGlobalWorkOffset = "; + + ur::details::printPtr(os, *(params->ppGlobalWorkOffset)); + + os << ", "; + os << ".pGlobalWorkSize = "; + + ur::details::printPtr(os, *(params->ppGlobalWorkSize)); + + os << ", "; + os << ".pLocalWorkSize = "; + + ur::details::printPtr(os, *(params->ppLocalWorkSize)); + + os << ", "; + os << ".numArgs = "; + + os << *(params->pnumArgs); + + os << ", "; + os << ".pArgs = "; + ur::details::printPtr(os, reinterpret_cast(*(params->ppArgs))); + if (*(params->ppArgs) != NULL) { + os << " {"; + for (size_t i = 0; i < *params->pnumArgs; ++i) { + if (i != 0) { + os << ", "; + } + + os << (*(params->ppArgs))[i]; + } + os << "}"; + } + + os << ", "; + os << ".numPropsInLaunchPropList = "; + + os << *(params->pnumPropsInLaunchPropList); + + os << ", "; + os << ".launchPropList = "; + ur::details::printPtr( + os, reinterpret_cast(*(params->plaunchPropList))); + if (*(params->plaunchPropList) != NULL) { + os << " {"; + for (size_t i = 0; i < *params->pnumPropsInLaunchPropList; ++i) { + if (i != 0) { + os << ", "; + } + + os << (*(params->plaunchPropList))[i]; + } + os << "}"; + } + + os << ", "; + os << ".numEventsInWaitList = "; + + os << *(params->pnumEventsInWaitList); + + os << ", "; + os << ".phEventWaitList = "; + ur::details::printPtr( + os, reinterpret_cast(*(params->pphEventWaitList))); + if (*(params->pphEventWaitList) != NULL) { + os << " {"; + for (size_t i = 0; i < *params->pnumEventsInWaitList; ++i) { + if (i != 0) { + os << ", "; + } + + ur::details::printPtr(os, (*(params->pphEventWaitList))[i]); + } + os << "}"; + } + + os << ", "; + os << ".phEvent = "; + + ur::details::printPtr(os, *(params->pphEvent)); + + return os; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the /// ur_enqueue_events_wait_with_barrier_ext_params_t type @@ -21160,6 +21417,25 @@ inline ur_result_t printPtr(std::ostream &os, const T *ptr) { return UR_RESULT_SUCCESS; } + +/////////////////////////////////////////////////////////////////////////////// +// @brief Print array of literals +template +inline ur_result_t printArray(std::ostream &os, const T *ptr) { + if (ptr == NULL) { + return printPtr(os, ptr); + } + + for (size_t i = 0; i < size; i++) { + if (i != 0) { + os << ", "; + } + + os << ptr[i]; + } + + return UR_RESULT_SUCCESS; +} } // namespace ur::details namespace ur::extras { @@ -21554,6 +21830,10 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, case UR_FUNCTION_ENQUEUE_WRITE_HOST_PIPE: { os << (const struct ur_enqueue_write_host_pipe_params_t *)params; } break; + case UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP: { + os << (const struct ur_enqueue_kernel_launch_with_args_exp_params_t *) + params; + } break; case UR_FUNCTION_ENQUEUE_EVENTS_WAIT_WITH_BARRIER_EXT: { os << (const struct ur_enqueue_events_wait_with_barrier_ext_params_t *) params; diff --git a/unified-runtime/scripts/core/EXP-ENQUEUE-KERNEL-LAUNCH-WITH-ARGS.rst b/unified-runtime/scripts/core/EXP-ENQUEUE-KERNEL-LAUNCH-WITH-ARGS.rst new file mode 100644 index 0000000000000..703cd1e935592 --- /dev/null +++ b/unified-runtime/scripts/core/EXP-ENQUEUE-KERNEL-LAUNCH-WITH-ARGS.rst @@ -0,0 +1,77 @@ +<% + OneApi=tags['$OneApi'] + x=tags['$x'] + X=x.upper() +%> + +.. _experimental-enqueue-kernel-launch-with-args: + +================================================================================ +Enqueue Kernel Launch With Args +================================================================================ + +.. warning:: + + Experimental features: + + * May be replaced, updated, or removed at any time. + * Do not require maintaining API/ABI stability of their own additions over + time. + * Do not require conformance testing of their own additions. + + + +Motivation +-------------------------------------------------------------------------------- + +If an application is setting a kernel's args and launching that kernel in the +same place, we can eliminate some overhead by allowing this to be accomplished +with one API call, rather than requiring one call for each argument and one to +launch. This also aligns with developments in the Level Zero backend, as well +as how CUDA and HIP handle kernel args. + +API +-------------------------------------------------------------------------------- + +Enums +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +* ${x}_structure_type_t + ${X}_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES + +* ${x}_exp_kernel_arg_type_t + ${X}_EXP_KERNEL_ARG_TYPE_VALUE + ${X}_EXP_KERNEL_ARG_TYPE_POINTER + ${X}_EXP_KERNEL_ARG_TYPE_MEM_OBJ + ${X}_EXP_KERNEL_ARG_TYPE_LOCAL + ${X}_EXP_KERNEL_ARG_TYPE_SAMPLER + +Types +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +* ${x}_exp_kernel_arg_mem_obj_tuple_t +* ${x}_exp_kernel_arg_value_t +* ${x}_exp_kernel_arg_properties_t + +Functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +* ${x}EnqueueKernelLaunchWithArgsExp + +Changelog +-------------------------------------------------------------------------------- + ++-----------+---------------------------------------------+ +| Revision | Changes | ++===========+=============================================+ +| 1.0 | Initial Draft | ++-----------+---------------------------------------------+ + +Support +-------------------------------------------------------------------------------- + +Adapters must support this feature. A naive implementation can easily be +constructed as a wrapper around the existing APIs for setting kernel args and +launching. + +Contributors +-------------------------------------------------------------------------------- + +* Aaron Greig `aaron.greig@codeplay.com `_ diff --git a/unified-runtime/scripts/core/exp-enqueue-kernel-launch-with-args.yml b/unified-runtime/scripts/core/exp-enqueue-kernel-launch-with-args.yml new file mode 100644 index 0000000000000..6656b6a6d0299 --- /dev/null +++ b/unified-runtime/scripts/core/exp-enqueue-kernel-launch-with-args.yml @@ -0,0 +1,170 @@ +# +# Copyright (C) 2025 Intel Corporation +# +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# See YaML.md for syntax definition +# +--- #-------------------------------------------------------------------------- +type: header +desc: "Intel $OneApi Unified Runtime Experimental API for setting args at kernel launch" +ordinal: "100" +--- #-------------------------------------------------------------------------- +type: enum +desc: "What kind of kernel arg is this" +class: $xKernel +name: $x_exp_kernel_arg_type_t +etors: + - name: VALUE + desc: "Kernel arg is a value." + - name: POINTER + desc: "Kernel arg is a pointer." + - name: MEM_OBJ + desc: "Kernel arg is a memory object." + - name: LOCAL + desc: "Kernel arg is a local allocation." + - name: SAMPLER + desc: "Kernel arg is a sampler." +--- #-------------------------------------------------------------------------- +type: struct +desc: "Mem obj/properties tuple" +name: $x_exp_kernel_arg_mem_obj_tuple_t +members: + - type: $x_mem_handle_t + name: hMem + desc: "[in] Handle of a memory object" + - type: $x_mem_flags_t + name: flags + desc: "[in] Memory flags to associate with `hMem`. Allowed values are: $X_MEM_FLAG_READ_WRITE, $X_MEM_FLAG_WRITE_ONLY, $X_MEM_FLAG_READ_ONLY." +--- #-------------------------------------------------------------------------- +# We have redundant entries in the union (value + pointer) to make printing work +# as it relies on the tags and we can currently only have one tag per member. +type: union +desc: "Typesafe container for a kernel argument value" +name: $x_exp_kernel_arg_value_t +tag: $x_exp_kernel_arg_type_t +members: + - type: "const void*" + name: value + desc: | + [in] argument value represented as matching arg type. + The data pointed to will be copied and therefore can be reused on return. + tag: $X_EXP_KERNEL_ARG_TYPE_VALUE + - type: "const void*" + name: pointer + desc: "[in] Allocation obtained by USM allocation or virtual memory mapping operation, or pointer to a literal value." + tag: $X_EXP_KERNEL_ARG_TYPE_POINTER + - type: $x_exp_kernel_arg_mem_obj_tuple_t + name: memObjTuple + desc: "[in] Struct containing a memory object and associated flags." + tag: $X_EXP_KERNEL_ARG_TYPE_MEM_OBJ + - type: $x_sampler_handle_t + name: sampler + desc: "[in] Handle of a sampler object." + tag: $X_EXP_KERNEL_ARG_TYPE_SAMPLER +--- #-------------------------------------------------------------------------- +type: struct +desc: "Kernel arg properties" +name: $x_exp_kernel_arg_properties_t +base: $x_base_properties_t +members: + - type: $x_exp_kernel_arg_type_t + name: type + desc: "[in] type of the kernel arg" + - type: uint32_t + name: index + desc: "[in] index of the kernel arg" + - type: size_t + name: size + desc: "[in] size of the kernel arg" + - type: $x_exp_kernel_arg_value_t + name: value + desc: "[in][tagged_by(type)] Union containing the argument value." +--- #-------------------------------------------------------------------------- +type: enum +extend: true +desc: "Structure Type experimental enumerations." +name: $x_structure_type_t +etors: + - name: EXP_KERNEL_ARG_PROPERTIES + desc: $x_exp_kernel_arg_properties_t + value: "0x5000" +--- #-------------------------------------------------------------------------- +type: function +desc: "Enqueue a command to execute a kernel" +class: $xEnqueue +name: KernelLaunchWithArgsExp +ordinal: "0" +analogue: + - "**clEnqueueNDRangeKernel**" +params: + - type: $x_queue_handle_t + name: hQueue + desc: "[in] handle of the queue object" + - type: $x_kernel_handle_t + name: hKernel + desc: "[in] handle of the kernel object" + - type: uint32_t + name: workDim + desc: "[in] number of dimensions, from 1 to 3, to specify the global and work-group work-items" + - type: "const size_t*" + name: pGlobalWorkOffset + desc: "[in][optional] pointer to an array of workDim unsigned values that specify the offset used to calculate the global ID of a work-item" + - type: "const size_t*" + name: pGlobalWorkSize + desc: "[in] pointer to an array of workDim unsigned values that specify the number of global work-items in workDim that will execute the kernel function" + - type: "const size_t*" + name: pLocalWorkSize + desc: | + [in][optional] pointer to an array of workDim unsigned values that specify the number of local work-items forming a work-group that will execute the kernel function. + If nullptr, the runtime implementation will choose the work-group size. + - type: uint32_t + name: numArgs + desc: "[in] Number of entries in pArgs" + - type: "const $x_exp_kernel_arg_properties_t*" + name: pArgs + desc: "[in][optional][range(0, numArgs)] pointer to a list of kernel arg properties." + - type: uint32_t + name: numPropsInLaunchPropList + desc: "[in] size of the launch prop list" + - type: const $x_kernel_launch_property_t* + name: launchPropList + desc: "[in][optional][range(0, numPropsInLaunchPropList)] pointer to a list of launch properties" + - type: uint32_t + name: numEventsInWaitList + desc: "[in] size of the event wait list" + - type: "const $x_event_handle_t*" + name: phEventWaitList + desc: | + [in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. + If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. + - type: $x_event_handle_t* + name: phEvent + desc: | + [out][optional][alloc] return an event object that identifies this particular kernel execution instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. +returns: + - $X_RESULT_ERROR_INVALID_QUEUE + - $X_RESULT_ERROR_INVALID_KERNEL + - $X_RESULT_ERROR_INVALID_EVENT + - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: + - "`phEventWaitList == NULL && numEventsInWaitList > 0`" + - "`phEventWaitList != NULL && numEventsInWaitList == 0`" + - "If event objects in phEventWaitList are not valid events." + - $X_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS: + - "An event in `phEventWaitList` has $X_EVENT_STATUS_ERROR." + - $X_RESULT_ERROR_INVALID_WORK_DIMENSION: + - "`pGlobalWorkSize[0] == 0 || pGlobalWorkSize[1] == 0 || pGlobalWorkSize[2] == 0`" + - $X_RESULT_ERROR_INVALID_WORK_GROUP_SIZE: + - "`pLocalWorkSize && (pLocalWorkSize[0] == 0 || pLocalWorkSize[1] == 0 || pLocalWorkSize[2] == 0)`" + - $X_RESULT_ERROR_INVALID_VALUE + - $X_RESULT_ERROR_INVALID_KERNEL_ARGS + - "The kernel argument values have not been specified." + - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY + - $X_RESULT_ERROR_OUT_OF_RESOURCES + - $X_RESULT_ERROR_INVALID_OPERATION: + - "If any property in `launchPropList` isn't supported by the device." + - $X_RESULT_ERROR_INVALID_NULL_POINTER: + - "`launchPropList == NULL && numPropsInLaunchPropList > 0`" + - "`pArgs == NULL && numArgs > 0`" diff --git a/unified-runtime/scripts/core/registry.yml b/unified-runtime/scripts/core/registry.yml index baefa0ad0903d..580a9a1dae741 100644 --- a/unified-runtime/scripts/core/registry.yml +++ b/unified-runtime/scripts/core/registry.yml @@ -670,7 +670,10 @@ etors: - name: BINDLESS_IMAGES_SUPPORTS_IMPORTING_HANDLE_TYPE_EXP desc: Enumerator for $xBindlessImagesSupportsImportingHandleTypeExp value: '288' -max_id: '288' +- name: ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP + desc: Enumerator for $xEnqueueKernelLaunchWithArgsExp + value: '289' +max_id: '289' --- type: enum desc: Defines structure types diff --git a/unified-runtime/scripts/parse_specs.py b/unified-runtime/scripts/parse_specs.py index 9fdb69eedc473..8adbc6de19f4a 100644 --- a/unified-runtime/scripts/parse_specs.py +++ b/unified-runtime/scripts/parse_specs.py @@ -909,7 +909,9 @@ def _append(lst, key, val): rets[idx][key].append(val) def append_nullchecks(param, accessor: str): - if type_traits.is_pointer(param["type"]): + if type_traits.is_pointer(param["type"]) or type_traits.is_array( + param["type"] + ): _append( rets, "$X_RESULT_ERROR_INVALID_NULL_POINTER", diff --git a/unified-runtime/scripts/templates/helper.py b/unified-runtime/scripts/templates/helper.py index 00de01e34753a..5b4fc0c18c2c5 100644 --- a/unified-runtime/scripts/templates/helper.py +++ b/unified-runtime/scripts/templates/helper.py @@ -987,12 +987,17 @@ def make_param_lines( words = [] if "type*" in format: - words.append(tname + "*") + ptname = tname + "*" + if type_traits.is_array(item["type"]): + ptname += "*" + words.append(ptname) name = "p" + name elif "type" in format: words.append(tname) if "name" in format: words.append(name) + if type_traits.is_array(item["type"]) and "type" in format: + words.append(f"[{type_traits.get_array_length(item['type'])}]") prologue = " ".join(words) if "delim" in format: diff --git a/unified-runtime/scripts/templates/print.hpp.mako b/unified-runtime/scripts/templates/print.hpp.mako index 4481847130cbe..ed94cd9227996 100644 --- a/unified-runtime/scripts/templates/print.hpp.mako +++ b/unified-runtime/scripts/templates/print.hpp.mako @@ -99,14 +99,7 @@ def findMemberType(_item): ${x}::details::printUnion(os, ${deref}(params${access}${item['name']}), params${access}${th.param_traits.tagged_member(item)}); %elif th.type_traits.is_array(item['type']): os << ".${iname} = {"; - for(auto i = 0; i < ${th.type_traits.get_array_length(item['type'])}; i++){ - if(i != 0){ - os << ", "; - } - <%call expr="member(iname, itype, True)"> - ${deref}(params${access}${item['name']}[i]) - - } + ${x}::details::printArray<${th.type_traits.get_array_length(item['type'])}>(os, ${deref}params${access}${pname}); os << "}"; %elif typename is not None: os << ".${iname} = "; @@ -140,6 +133,7 @@ inline constexpr bool is_handle_v = is_handle::value; template inline ${x}_result_t printPtr(std::ostream &os, const T *ptr); template inline ${x}_result_t printFlag(std::ostream &os, uint32_t flag); template inline ${x}_result_t printTagged(std::ostream &os, const void *ptr, T value, size_t size); +template inline ur_result_t printArray(std::ostream &os, const T *ptr); %for spec in specs: %for obj in spec['objects']: @@ -564,6 +558,25 @@ template inline ${x}_result_t printPtr(std::ostream &os, const T *p return ${X}_RESULT_SUCCESS; } + +/////////////////////////////////////////////////////////////////////////////// +// @brief Print array of literals +template +inline ur_result_t printArray(std::ostream &os, const T *ptr) { + if(ptr == NULL) { + return printPtr(os, ptr); + } + + for (size_t i = 0; i < size; i++) { + if (i != 0) { + os << ", "; + } + + os << ptr[i]; + } + + return ${X}_RESULT_SUCCESS; +} } // namespace ${x}::details namespace ${x}::extras { diff --git a/unified-runtime/source/adapters/cuda/enqueue.cpp b/unified-runtime/source/adapters/cuda/enqueue.cpp index 36f7535be74f5..e4ccb670e0291 100644 --- a/unified-runtime/source/adapters/cuda/enqueue.cpp +++ b/unified-runtime/source/adapters/cuda/enqueue.cpp @@ -15,6 +15,7 @@ #include "kernel.hpp" #include "memory.hpp" #include "queue.hpp" +#include "sampler.hpp" #include #include @@ -632,6 +633,60 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( #endif // CUDA_VERSION >= 11080 } +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numArgs, + const ur_exp_kernel_arg_properties_t *pArgs, + uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + try { + for (uint32_t i = 0; i < numArgs; i++) { + switch (pArgs[i].type) { + case UR_EXP_KERNEL_ARG_TYPE_LOCAL: { + hKernel->setKernelLocalArg(pArgs[i].index, pArgs[i].size); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_VALUE: { + hKernel->setKernelArg(pArgs[i].index, pArgs[i].size, + pArgs[i].value.value); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_POINTER: { + // setKernelArg is expecting a pointer to our argument + hKernel->setKernelArg(pArgs[i].index, pArgs[i].size, + &pArgs[i].value.pointer); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: { + ur_kernel_arg_mem_obj_properties_t Props = { + UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES, nullptr, + pArgs[i].value.memObjTuple.flags}; + UR_CALL(urKernelSetArgMemObj(hKernel, pArgs[i].index, &Props, + pArgs[i].value.memObjTuple.hMem)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: { + uint32_t SamplerProps = pArgs[i].value.sampler->Props; + hKernel->setKernelArg(pArgs[i].index, sizeof(uint32_t), + (void *)&SamplerProps); + break; + } + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + } catch (ur_result_t Err) { + return Err; + } + return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, + pGlobalWorkSize, pLocalWorkSize, + numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent); +} + /// Set parameters for general 3D memory copy. /// If the source and/or destination is on the device, SrcPtr and/or DstPtr /// must be a pointer to a CUdeviceptr diff --git a/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp b/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp index 8430df0ab0678..a9b072472b7c9 100644 --- a/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp @@ -454,6 +454,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnUSMHostAllocExp = urEnqueueUSMHostAllocExp; pDdiTable->pfnUSMFreeExp = urEnqueueUSMFreeExp; pDdiTable->pfnCommandBufferExp = urEnqueueCommandBufferExp; + pDdiTable->pfnKernelLaunchWithArgsExp = urEnqueueKernelLaunchWithArgsExp; return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/hip/enqueue.cpp b/unified-runtime/source/adapters/hip/enqueue.cpp index 2206fbbf3eb3a..89b45d9d29b2a 100644 --- a/unified-runtime/source/adapters/hip/enqueue.cpp +++ b/unified-runtime/source/adapters/hip/enqueue.cpp @@ -16,6 +16,7 @@ #include "logger/ur_logger.hpp" #include "memory.hpp" #include "queue.hpp" +#include "sampler.hpp" #include "ur_api.h" #include @@ -340,6 +341,60 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numArgs, + const ur_exp_kernel_arg_properties_t *pArgs, + uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + try { + for (uint32_t i = 0; i < numArgs; i++) { + switch (pArgs[i].type) { + case UR_EXP_KERNEL_ARG_TYPE_LOCAL: { + hKernel->setKernelLocalArg(pArgs[i].index, pArgs[i].size); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_VALUE: { + hKernel->setKernelArg(pArgs[i].index, pArgs[i].size, + pArgs[i].value.value); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_POINTER: { + // setKernelArg is expecting a pointer to our argument + hKernel->setKernelArg(pArgs[i].index, pArgs[i].size, + &pArgs[i].value.pointer); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: { + ur_kernel_arg_mem_obj_properties_t Props = { + UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES, nullptr, + pArgs[i].value.memObjTuple.flags}; + UR_CALL(urKernelSetArgMemObj(hKernel, pArgs[i].index, &Props, + pArgs[i].value.memObjTuple.hMem)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: { + uint32_t SamplerProps = pArgs[i].value.sampler->Props; + hKernel->setKernelArg(pArgs[i].index, sizeof(uint32_t), + (void *)&SamplerProps); + break; + } + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + } catch (ur_result_t Err) { + return Err; + } + return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, + pGlobalWorkSize, pLocalWorkSize, + numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent); +} + /// Enqueues a wait on the given queue for all events. /// See \ref enqueueEventWait /// diff --git a/unified-runtime/source/adapters/hip/ur_interface_loader.cpp b/unified-runtime/source/adapters/hip/ur_interface_loader.cpp index dfb4382cad828..d8ec6bb3b50c9 100644 --- a/unified-runtime/source/adapters/hip/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/hip/ur_interface_loader.cpp @@ -447,6 +447,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; pDdiTable->pfnNativeCommandExp = urEnqueueNativeCommandExp; pDdiTable->pfnCommandBufferExp = urEnqueueCommandBufferExp; + pDdiTable->pfnKernelLaunchWithArgsExp = urEnqueueKernelLaunchWithArgsExp; return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/level_zero/kernel.cpp b/unified-runtime/source/adapters/level_zero/kernel.cpp index bcac9cb04c320..1a5d02da15d37 100644 --- a/unified-runtime/source/adapters/level_zero/kernel.cpp +++ b/unified-runtime/source/adapters/level_zero/kernel.cpp @@ -56,6 +56,173 @@ ur_result_t urKernelGetSuggestedLocalWorkSize( return UR_RESULT_SUCCESS; } +inline ur_result_t KernelSetArgValueHelper( + ur_kernel_handle_t Kernel, + /// [in] argument index in range [0, num args - 1] + uint32_t ArgIndex, + /// [in] size of argument type + size_t ArgSize, + /// [in] argument value represented as matching arg type. + const void *PArgValue) { + // OpenCL: "the arg_value pointer can be NULL or point to a NULL value + // in which case a NULL value will be used as the value for the argument + // declared as a pointer to global or constant memory in the kernel" + // + // We don't know the type of the argument but it seems that the only time + // SYCL RT would send a pointer to NULL in 'arg_value' is when the argument + // is a NULL pointer. Treat a pointer to NULL in 'arg_value' as a NULL. + if (ArgSize == sizeof(void *) && PArgValue && + *(void **)(const_cast(PArgValue)) == nullptr) { + PArgValue = nullptr; + } + + if (ArgIndex > Kernel->ZeKernelProperties->numKernelArgs - 1) { + return UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX; + } + + ze_result_t ZeResult = ZE_RESULT_SUCCESS; + if (Kernel->ZeKernelMap.empty()) { + auto ZeKernel = Kernel->ZeKernel; + ZeResult = ZE_CALL_NOCHECK(zeKernelSetArgumentValue, + (ZeKernel, ArgIndex, ArgSize, PArgValue)); + } else { + for (auto It : Kernel->ZeKernelMap) { + auto ZeKernel = It.second; + ZeResult = ZE_CALL_NOCHECK(zeKernelSetArgumentValue, + (ZeKernel, ArgIndex, ArgSize, PArgValue)); + } + } + + if (ZeResult == ZE_RESULT_ERROR_INVALID_ARGUMENT) { + return UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE; + } + + return ze2urResult(ZeResult); +} + +inline ur_result_t KernelSetArgMemObjHelper( + /// [in] handle of the kernel object + ur_kernel_handle_t Kernel, + /// [in] argument index in range [0, num args - 1] + uint32_t ArgIndex, + /// [in][optional] pointer to Memory object properties. + const ur_kernel_arg_mem_obj_properties_t *Properties, + /// [in][optional] handle of Memory object. + ur_mem_handle_t ArgValue) { + // The ArgValue may be a NULL pointer in which case a NULL value is used for + // the kernel argument declared as a pointer to global or constant memory. + + if (ArgIndex > Kernel->ZeKernelProperties->numKernelArgs - 1) { + return UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX; + } + + ur_mem_handle_t_ *UrMem = ur_cast(ArgValue); + + ur_mem_handle_t_::access_mode_t UrAccessMode = ur_mem_handle_t_::read_write; + if (Properties) { + switch (Properties->memoryAccess) { + case UR_MEM_FLAG_READ_WRITE: + UrAccessMode = ur_mem_handle_t_::read_write; + break; + case UR_MEM_FLAG_WRITE_ONLY: + UrAccessMode = ur_mem_handle_t_::write_only; + break; + case UR_MEM_FLAG_READ_ONLY: + UrAccessMode = ur_mem_handle_t_::read_only; + break; + case 0: + break; + default: + return UR_RESULT_ERROR_INVALID_ARGUMENT; + } + } + auto Arg = UrMem ? UrMem : nullptr; + Kernel->PendingArguments.push_back( + {ArgIndex, sizeof(void *), Arg, UrAccessMode}); + + return UR_RESULT_SUCCESS; +} + +ur_result_t urEnqueueKernelLaunchWithArgsExp( + /// [in] handle of the queue object + ur_queue_handle_t Queue, + /// [in] handle of the kernel object + ur_kernel_handle_t Kernel, + /// [in] number of dimensions, from 1 to 3, to specify the global and + /// work-group work-items + uint32_t workDim, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the offset used to calculate the global ID of a work-item + const size_t *GlobalWorkOffset, + /// [in] pointer to an array of workDim unsigned values that specify the + /// number of global work-items in workDim that will execute the kernel + /// function + const size_t *GlobalWorkSize, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the number of local work-items forming a work-group that will + /// execute the kernel function. + /// If nullptr, the runtime implementation will choose the work-group size. + const size_t *LocalWorkSize, + /// [in] size of the event wait list + uint32_t NumArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. + const ur_exp_kernel_arg_properties_t *Args, + /// [in] size of the launch prop list + uint32_t NumPropsInLaunchPropList, + /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch + /// properties + const ur_kernel_launch_property_t *LaunchPropList, + uint32_t NumEventsInWaitList, + /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of + /// events that must be complete before the kernel execution. If + /// nullptr, the numEventsInWaitList must be 0, indicating that no wait + /// event. + const ur_event_handle_t *EventWaitList, + /// [in,out][optional] return an event object that identifies this + /// particular kernel execution instance. + ur_event_handle_t *OutEvent) { + { + std::scoped_lock Guard(Kernel->Mutex); + for (uint32_t i = 0; i < NumArgs; i++) { + switch (Args[i].type) { + case UR_EXP_KERNEL_ARG_TYPE_LOCAL: + UR_CALL(KernelSetArgValueHelper(Kernel, Args[i].index, Args[i].size, + nullptr)); + break; + case UR_EXP_KERNEL_ARG_TYPE_VALUE: + UR_CALL(KernelSetArgValueHelper(Kernel, Args[i].index, Args[i].size, + Args[i].value.value)); + break; + case UR_EXP_KERNEL_ARG_TYPE_POINTER: + UR_CALL(KernelSetArgValueHelper(Kernel, Args[i].index, Args[i].size, + &Args[i].value.pointer)); + break; + case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: { + ur_kernel_arg_mem_obj_properties_t Properties = { + UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES, nullptr, + Args[i].value.memObjTuple.flags}; + UR_CALL(KernelSetArgMemObjHelper(Kernel, Args[i].index, &Properties, + Args[i].value.memObjTuple.hMem)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: { + UR_CALL(KernelSetArgValueHelper(Kernel, Args[i].index, Args[i].size, + &Args[i].value.sampler->ZeSampler)); + break; + } + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + } + // Normalize so each dimension has at least one work item + return level_zero::urEnqueueKernelLaunch( + Queue, Kernel, workDim, GlobalWorkOffset, GlobalWorkSize, LocalWorkSize, + NumPropsInLaunchPropList, LaunchPropList, NumEventsInWaitList, + EventWaitList, OutEvent); +} + ur_result_t urEnqueueKernelLaunch( /// [in] handle of the queue object ur_queue_handle_t Queue, diff --git a/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp b/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp index 13d7274e7aebf..4276f97f5dd29 100644 --- a/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp @@ -225,6 +225,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( return result; } + pDdiTable->pfnKernelLaunchWithArgsExp = + ur::level_zero::urEnqueueKernelLaunchWithArgsExp; pDdiTable->pfnUSMDeviceAllocExp = ur::level_zero::urEnqueueUSMDeviceAllocExp; pDdiTable->pfnUSMSharedAllocExp = ur::level_zero::urEnqueueUSMSharedAllocExp; pDdiTable->pfnUSMHostAllocExp = ur::level_zero::urEnqueueUSMHostAllocExp; diff --git a/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp b/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp index 77bc0b7d5b737..bf1cb230dfc6c 100644 --- a/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp +++ b/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp @@ -806,6 +806,15 @@ ur_result_t urUsmP2PPeerAccessGetInfoExp(ur_device_handle_t commandDevice, ur_exp_peer_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet); +ur_result_t urEnqueueKernelLaunchWithArgsExp( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numArgs, + const ur_exp_kernel_arg_properties_t *pArgs, + uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); ur_result_t urEnqueueEventsWaitWithBarrierExt( ur_queue_handle_t hQueue, const ur_exp_enqueue_ext_properties_t *pProperties, diff --git a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp index 142caaecc1a71..3b4bbc01e2b18 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp @@ -11,6 +11,7 @@ #include "command_list_manager.hpp" #include "../helpers/kernel_helpers.hpp" #include "../helpers/memory_helpers.hpp" +#include "../sampler.hpp" #include "../ur_interface_loader.hpp" #include "command_buffer.hpp" #include "context.hpp" @@ -1037,3 +1038,60 @@ ur_result_t ur_command_list_manager::releaseSubmittedKernels() { submittedKernels.clear(); return UR_RESULT_SUCCESS; } + +ur_result_t ur_command_list_manager::appendKernelLaunchWithArgsExp( + ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numArgs, + const ur_exp_kernel_arg_properties_t *pArgs, + uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t phEvent) { + TRACK_SCOPE_LATENCY( + "ur_queue_immediate_in_order_t::enqueueKernelLaunchWithArgsExp"); + { + std::scoped_lock guard(hKernel->Mutex); + for (uint32_t argIndex = 0; argIndex < numArgs; argIndex++) { + switch (pArgs[argIndex].type) { + case UR_EXP_KERNEL_ARG_TYPE_LOCAL: + UR_CALL(hKernel->setArgValue(pArgs[argIndex].index, + pArgs[argIndex].size, nullptr, nullptr)); + break; + case UR_EXP_KERNEL_ARG_TYPE_VALUE: + UR_CALL(hKernel->setArgValue(pArgs[argIndex].index, + pArgs[argIndex].size, nullptr, + pArgs[argIndex].value.value)); + break; + case UR_EXP_KERNEL_ARG_TYPE_POINTER: + UR_CALL(hKernel->setArgPointer(pArgs[argIndex].index, nullptr, + pArgs[argIndex].value.pointer)); + break; + case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: + // TODO: import helper for converting ur flags to internal equivalent + UR_CALL(hKernel->addPendingMemoryAllocation( + {pArgs[argIndex].value.memObjTuple.hMem, + ur_mem_buffer_t::device_access_mode_t::read_write, + pArgs[argIndex].index})); + break; + case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: { + UR_CALL( + hKernel->setArgValue(argIndex, sizeof(void *), nullptr, + &pArgs[argIndex].value.sampler->ZeSampler)); + break; + } + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + } + + UR_CALL(appendKernelLaunch(hKernel, workDim, pGlobalWorkOffset, + pGlobalWorkSize, pLocalWorkSize, + numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent)); + + recordSubmittedKernel(hKernel); + + return UR_RESULT_SUCCESS; +} diff --git a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp index 3c1bbd710ed47..8ace46c88837f 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp @@ -233,6 +233,16 @@ struct ur_command_list_manager { const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent); + ur_result_t appendKernelLaunchWithArgsExp( + ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numArgs, + const ur_exp_kernel_arg_properties_t *pArgs, + uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t phEvent); + private: ur_result_t appendGenericCommandListsExp( uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, diff --git a/unified-runtime/source/adapters/level_zero/v2/kernel.cpp b/unified-runtime/source/adapters/level_zero/v2/kernel.cpp index 8809abe194362..0344323ab4f89 100644 --- a/unified-runtime/source/adapters/level_zero/v2/kernel.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/kernel.cpp @@ -436,19 +436,17 @@ ur_result_t urKernelSetArgPointer( return exceptionToResult(std::current_exception()); } -static ur_mem_buffer_t::device_access_mode_t memAccessFromKernelProperties( - const ur_kernel_arg_mem_obj_properties_t *pProperties) { - if (pProperties) { - switch (pProperties->memoryAccess) { - case UR_MEM_FLAG_READ_WRITE: - return ur_mem_buffer_t::device_access_mode_t::read_write; - case UR_MEM_FLAG_WRITE_ONLY: - return ur_mem_buffer_t::device_access_mode_t::write_only; - case UR_MEM_FLAG_READ_ONLY: - return ur_mem_buffer_t::device_access_mode_t::read_only; - default: - return ur_mem_buffer_t::device_access_mode_t::read_write; - } +static ur_mem_buffer_t::device_access_mode_t +memAccessFromKernelProperties(const ur_mem_flags_t &Flags) { + switch (Flags) { + case UR_MEM_FLAG_READ_WRITE: + return ur_mem_buffer_t::device_access_mode_t::read_write; + case UR_MEM_FLAG_WRITE_ONLY: + return ur_mem_buffer_t::device_access_mode_t::write_only; + case UR_MEM_FLAG_READ_ONLY: + return ur_mem_buffer_t::device_access_mode_t::read_only; + default: + return ur_mem_buffer_t::device_access_mode_t::read_write; } return ur_mem_buffer_t::device_access_mode_t::read_write; } @@ -462,7 +460,10 @@ urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, std::scoped_lock guard(hKernel->Mutex); UR_CALL(hKernel->addPendingMemoryAllocation( - {hArgValue, memAccessFromKernelProperties(pProperties), argIndex})); + {hArgValue, + memAccessFromKernelProperties(pProperties ? pProperties->memoryAccess + : 0), + argIndex})); return UR_RESULT_SUCCESS; } catch (...) { diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp index 582885ea67c46..29a8bf7114862 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp @@ -442,6 +442,22 @@ ur_result_t urEnqueueTimestampRecordingExp( } catch (...) { return exceptionToResult(std::current_exception()); } +ur_result_t urEnqueueKernelLaunchWithArgsExp( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numArgs, + const ur_exp_kernel_arg_properties_t *pArgs, + uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) try { + return hQueue->get().enqueueKernelLaunchWithArgsExp( + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + numArgs, pArgs, numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent); +} catch (...) { + return exceptionToResult(std::current_exception()); +} ur_result_t urEnqueueEventsWaitWithBarrierExt( ur_queue_handle_t hQueue, const ur_exp_enqueue_ext_properties_t *pProperties, diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp index 4bd9d8fd2141e..530897288f0fa 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp @@ -163,6 +163,11 @@ struct ur_queue_t_ { virtual ur_result_t enqueueTimestampRecordingExp(bool, uint32_t, const ur_event_handle_t *, ur_event_handle_t *) = 0; + virtual ur_result_t enqueueKernelLaunchWithArgsExp( + ur_kernel_handle_t, uint32_t, const size_t *, const size_t *, + const size_t *, uint32_t, const ur_exp_kernel_arg_properties_t *, + uint32_t, const ur_kernel_launch_property_t *, uint32_t, + const ur_event_handle_t *, ur_event_handle_t *) = 0; virtual ur_result_t enqueueEventsWaitWithBarrierExt(const ur_exp_enqueue_ext_properties_t *, uint32_t, const ur_event_handle_t *, diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp index 3f230861ad563..7f40d9139d63a 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp @@ -454,6 +454,22 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { createEventIfRequested(eventPool.get(), phEvent, this)); } + ur_result_t enqueueKernelLaunchWithArgsExp( + ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numArgs, + const ur_exp_kernel_arg_properties_t *pArgs, + uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->appendKernelLaunchWithArgsExp( + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + numArgs, pArgs, numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + ur::RefCount RefCount; }; diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp index f1ad68a62a1a8..564c0472edb81 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp @@ -506,6 +506,24 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { createEventIfRequested(eventPool.get(), phEvent, this)); } + ur_result_t enqueueKernelLaunchWithArgsExp( + ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numArgs, + const ur_exp_kernel_arg_properties_t *pArgs, + uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId] + .appendKernelLaunchWithArgsExp( + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, numArgs, pArgs, numPropsInLaunchPropList, + launchPropList, numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + ur::RefCount RefCount; }; diff --git a/unified-runtime/source/adapters/mock/ur_mockddi.cpp b/unified-runtime/source/adapters/mock/ur_mockddi.cpp index bc1ef8ae6dc86..71778bec2dacc 100644 --- a/unified-runtime/source/adapters/mock/ur_mockddi.cpp +++ b/unified-runtime/source/adapters/mock/ur_mockddi.cpp @@ -11919,6 +11919,107 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueKernelLaunchWithArgsExp +__urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] handle of the kernel object + ur_kernel_handle_t hKernel, + /// [in] number of dimensions, from 1 to 3, to specify the global and + /// work-group work-items + uint32_t workDim, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of workDim unsigned values that specify the + /// number of global work-items in workDim that will execute the kernel + /// function + const size_t *pGlobalWorkSize, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the number of local work-items forming a work-group that will + /// execute the kernel function. + /// If nullptr, the runtime implementation will choose the work-group size. + const size_t *pLocalWorkSize, + /// [in] Number of entries in pArgs + uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. + const ur_exp_kernel_arg_properties_t *pArgs, + /// [in] size of the launch prop list + uint32_t numPropsInLaunchPropList, + /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list + /// of launch properties + const ur_kernel_launch_property_t *launchPropList, + /// [in] size of the event wait list + uint32_t numEventsInWaitList, + /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of + /// events that must be complete before the kernel execution. + /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait + /// event. + const ur_event_handle_t *phEventWaitList, + /// [out][optional][alloc] return an event object that identifies this + /// particular kernel execution instance. If phEventWaitList and phEvent + /// are not NULL, phEvent must not refer to an element of the + /// phEventWaitList array. + ur_event_handle_t *phEvent) try { + ur_result_t result = UR_RESULT_SUCCESS; + + ur_enqueue_kernel_launch_with_args_exp_params_t params = { + &hQueue, + &hKernel, + &workDim, + &pGlobalWorkOffset, + &pGlobalWorkSize, + &pLocalWorkSize, + &numArgs, + &pArgs, + &numPropsInLaunchPropList, + &launchPropList, + &numEventsInWaitList, + &phEventWaitList, + &phEvent}; + + auto beforeCallback = reinterpret_cast( + mock::getCallbacks().get_before_callback( + "urEnqueueKernelLaunchWithArgsExp")); + if (beforeCallback) { + result = beforeCallback(¶ms); + if (result != UR_RESULT_SUCCESS) { + return result; + } + } + + auto replaceCallback = reinterpret_cast( + mock::getCallbacks().get_replace_callback( + "urEnqueueKernelLaunchWithArgsExp")); + if (replaceCallback) { + result = replaceCallback(¶ms); + } else { + + // optional output handle + if (phEvent) { + *phEvent = mock::createDummyHandle(); + } + result = UR_RESULT_SUCCESS; + } + + if (result != UR_RESULT_SUCCESS) { + return result; + } + + auto afterCallback = reinterpret_cast( + mock::getCallbacks().get_after_callback( + "urEnqueueKernelLaunchWithArgsExp")); + if (afterCallback) { + return afterCallback(¶ms); + } + + return result; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urEnqueueEventsWaitWithBarrierExt __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrierExt( @@ -12424,6 +12525,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( ur_result_t result = UR_RESULT_SUCCESS; + pDdiTable->pfnKernelLaunchWithArgsExp = + driver::urEnqueueKernelLaunchWithArgsExp; + pDdiTable->pfnUSMDeviceAllocExp = driver::urEnqueueUSMDeviceAllocExp; pDdiTable->pfnUSMSharedAllocExp = driver::urEnqueueUSMSharedAllocExp; diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 6818a915b9334..f1e7ea3c31f4d 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -705,3 +705,45 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueNativeCommandExp( const ur_event_handle_t *, ur_event_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numArgs, + const ur_exp_kernel_arg_properties_t *pArgs, + uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + for (uint32_t argIndex = 0; argIndex < numArgs; argIndex++) { + switch (pArgs[argIndex].type) { + case UR_EXP_KERNEL_ARG_TYPE_VALUE: + UR_CALL(hKernel->addArg(pArgs[argIndex].value.value, + pArgs[argIndex].index, pArgs[argIndex].size)); + break; + case UR_EXP_KERNEL_ARG_TYPE_POINTER: + UR_CALL( + hKernel->addPtrArg(const_cast(pArgs[argIndex].value.pointer), + pArgs[argIndex].index)); + break; + case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: { + auto MemObj = pArgs[argIndex].value.memObjTuple.hMem; + UR_CALL(hKernel->addMemObjArg(MemObj, pArgs[argIndex].index)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_LOCAL: + UR_CALL( + hKernel->addLocalArg(pArgs[argIndex].index, pArgs[argIndex].size)); + break; + case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + break; + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, + pGlobalWorkSize, pLocalWorkSize, + numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent); +} diff --git a/unified-runtime/source/adapters/native_cpu/kernel.cpp b/unified-runtime/source/adapters/native_cpu/kernel.cpp index ac11331357f39..f67c7653d0981 100644 --- a/unified-runtime/source/adapters/native_cpu/kernel.cpp +++ b/unified-runtime/source/adapters/native_cpu/kernel.cpp @@ -61,21 +61,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue( // TODO: error checking UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(argSize, UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE); - hKernel->addArg(pArgValue, argIndex, argSize); - - return UR_RESULT_SUCCESS; + return hKernel->addArg(pArgValue, argIndex, argSize); } UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgLocal( ur_kernel_handle_t hKernel, uint32_t argIndex, size_t argSize, const ur_kernel_arg_local_properties_t * /*pProperties*/) { - // emplace a placeholder kernel arg, gets replaced with a pointer to the - // memory pool before enqueueing the kernel. - hKernel->addPtrArg(nullptr, argIndex); - hKernel->_localArgInfo.emplace_back(argIndex, argSize); - return UR_RESULT_SUCCESS; + return hKernel->addLocalArg(argIndex, argSize); } UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel, @@ -211,11 +204,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer( const void *pArgValue) { UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(pArgValue, UR_RESULT_ERROR_INVALID_NULL_POINTER); - - hKernel->addPtrArg(const_cast(pArgValue), argIndex); - return UR_RESULT_SUCCESS; + return hKernel->addPtrArg(const_cast(pArgValue), argIndex); } UR_APIEXPORT ur_result_t UR_APICALL @@ -242,16 +232,7 @@ urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - // Taken from ur/adapters/cuda/kernel.cpp - // zero-sized buffers are expected to be null. - if (hArgValue == nullptr) { - hKernel->addPtrArg(nullptr, argIndex); - return UR_RESULT_SUCCESS; - } - - hKernel->addArgReference(hArgValue); - hKernel->addPtrArg(hArgValue->_mem, argIndex); - return UR_RESULT_SUCCESS; + return hKernel->addMemObjArg(hArgValue, argIndex); } UR_APIEXPORT ur_result_t UR_APICALL urKernelSetSpecializationConstants( diff --git a/unified-runtime/source/adapters/native_cpu/kernel.hpp b/unified-runtime/source/adapters/native_cpu/kernel.hpp index 8daf23feb65f5..285b1c00d0e63 100644 --- a/unified-runtime/source/adapters/native_cpu/kernel.hpp +++ b/unified-runtime/source/adapters/native_cpu/kernel.hpp @@ -181,17 +181,44 @@ struct ur_kernel_handle_t_ : RefCounted { return Result; } - void addArg(const void *Ptr, size_t Index, size_t Size) { + inline ur_result_t addArg(const void *Ptr, size_t Index, size_t Size) { + UR_ASSERT(Size, UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE); Args.addArg(Index, Size, Ptr); + return UR_RESULT_SUCCESS; } - void addPtrArg(void *Ptr, size_t Index) { Args.addPtrArg(Index, Ptr); } + inline ur_result_t addPtrArg(void *Ptr, size_t Index) { + UR_ASSERT(Ptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); + Args.addPtrArg(Index, Ptr); + return UR_RESULT_SUCCESS; + } void addArgReference(ur_mem_handle_t Arg) { Arg->incrementReferenceCount(); ReferencedArgs.push_back(Arg); } + inline ur_result_t addMemObjArg(ur_mem_handle_t ArgValue, size_t Index) { + // Taken from ur/adapters/cuda/kernel.cpp + // zero-sized buffers are expected to be null. + if (ArgValue == nullptr) { + addPtrArg(nullptr, Index); + return UR_RESULT_SUCCESS; + } + + addArgReference(ArgValue); + addPtrArg(ArgValue->_mem, Index); + return UR_RESULT_SUCCESS; + } + + inline ur_result_t addLocalArg(size_t Index, size_t Size) { + // emplace a placeholder kernel arg, gets replaced with a pointer to the + // memory pool before enqueueing the kernel. + Args.addPtrArg(Index, nullptr); + _localArgInfo.emplace_back(Index, Size); + return UR_RESULT_SUCCESS; + } + private: void removeArgReferences() { for (auto arg : ReferencedArgs) diff --git a/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp b/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp index 3f6fe061b4917..7bc2f999a00cd 100644 --- a/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp @@ -431,6 +431,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; pDdiTable->pfnNativeCommandExp = urEnqueueNativeCommandExp; pDdiTable->pfnCommandBufferExp = urEnqueueCommandBufferExp; + pDdiTable->pfnKernelLaunchWithArgsExp = urEnqueueKernelLaunchWithArgsExp; return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/offload/enqueue.cpp b/unified-runtime/source/adapters/offload/enqueue.cpp index 3d419a30dc500..39f1696e9ea22 100644 --- a/unified-runtime/source/adapters/offload/enqueue.cpp +++ b/unified-runtime/source/adapters/offload/enqueue.cpp @@ -481,3 +481,40 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( return UR_RESULT_SUCCESS; } + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numArgs, + const ur_exp_kernel_arg_properties_t *pArgs, + uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + for (uint32_t i = 0; i < numArgs; i++) { + switch (pArgs[i].type) { + case UR_EXP_KERNEL_ARG_TYPE_POINTER: + hKernel->Args.addArg(pArgs[i].index, sizeof(pArgs[i].value.pointer), + &pArgs[i].value.pointer); + break; + case UR_EXP_KERNEL_ARG_TYPE_VALUE: + hKernel->Args.addArg(pArgs[i].index, pArgs[i].size, pArgs[i].value.value); + break; + case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: + hKernel->Args.addMemObjArg(pArgs[i].index, + pArgs[i].value.memObjTuple.hMem, + pArgs[i].value.memObjTuple.flags); + break; + case UR_EXP_KERNEL_ARG_TYPE_LOCAL: + case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + + return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, + pGlobalWorkSize, pLocalWorkSize, + numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent); +} diff --git a/unified-runtime/source/adapters/offload/kernel.cpp b/unified-runtime/source/adapters/offload/kernel.cpp index 1b81636729af7..91d547f8f3112 100644 --- a/unified-runtime/source/adapters/offload/kernel.cpp +++ b/unified-runtime/source/adapters/offload/kernel.cpp @@ -9,7 +9,6 @@ //===----------------------------------------------------------------------===// #include "kernel.hpp" -#include "memory.hpp" #include "program.hpp" #include "ur2offload.hpp" #include @@ -102,19 +101,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, const ur_kernel_arg_mem_obj_properties_t *Properties, ur_mem_handle_t hArgValue) { - // Handle zero-sized buffers - if (hArgValue == nullptr) { - hKernel->Args.addArg(argIndex, 0, nullptr); - return UR_RESULT_SUCCESS; - } - ur_mem_flags_t MemAccess = Properties ? Properties->memoryAccess : static_cast(UR_MEM_FLAG_READ_WRITE); hKernel->Args.addMemObjArg(argIndex, hArgValue, MemAccess); - auto Ptr = std::get(hArgValue->Mem).Ptr; - hKernel->Args.addArg(argIndex, sizeof(void *), &Ptr); return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/offload/kernel.hpp b/unified-runtime/source/adapters/offload/kernel.hpp index 2794fc8f32213..88fc6d6591430 100644 --- a/unified-runtime/source/adapters/offload/kernel.hpp +++ b/unified-runtime/source/adapters/offload/kernel.hpp @@ -18,6 +18,7 @@ #include #include "common.hpp" +#include "memory.hpp" struct ur_kernel_handle_t_ : RefCounted { @@ -56,7 +57,12 @@ struct ur_kernel_handle_t_ : RefCounted { } void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) { - assert(hMem && "Invalid mem handle"); + // Handle zero-sized buffers + if (hMem == nullptr) { + addArg(Index, 0, nullptr); + return; + } + // If a memobj is already set at this index, update the entry rather // than adding a duplicate one for (auto &Arg : MemObjArgs) { @@ -66,6 +72,9 @@ struct ur_kernel_handle_t_ : RefCounted { } } MemObjArgs.push_back(MemObjArg{hMem, Index, Flags}); + + auto Ptr = std::get(hMem->Mem).Ptr; + addArg(Index, sizeof(void *), &Ptr); } const args_ptr_t &getPointers() const noexcept { return Pointers; } diff --git a/unified-runtime/source/adapters/offload/ur_interface_loader.cpp b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp index fa3adfdaf92bd..0aa0bf4644a13 100644 --- a/unified-runtime/source/adapters/offload/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp @@ -384,6 +384,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnTimestampRecordingExp = nullptr; pDdiTable->pfnNativeCommandExp = nullptr; + pDdiTable->pfnKernelLaunchWithArgsExp = urEnqueueKernelLaunchWithArgsExp; return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/opencl/enqueue.cpp b/unified-runtime/source/adapters/opencl/enqueue.cpp index 63b7b45426632..1bd75b6b56aaf 100644 --- a/unified-runtime/source/adapters/opencl/enqueue.cpp +++ b/unified-runtime/source/adapters/opencl/enqueue.cpp @@ -16,6 +16,10 @@ #include "memory.hpp" #include "program.hpp" #include "queue.hpp" +#include "sampler.hpp" + +#include +#include cl_map_flags convertURMapFlagsToCL(ur_map_flags_t URFlags) { cl_map_flags CLFlags = 0; @@ -501,3 +505,102 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueWriteHostPipe( return UR_RESULT_SUCCESS; } + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numArgs, + const ur_exp_kernel_arg_properties_t *pArgs, + uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + for (uint32_t propIndex = 0; propIndex < numPropsInLaunchPropList; + propIndex++) { + // Adapters that don't support cooperative kernels are currently expected + // to ignore COOPERATIVE launch properties. Ideally we should avoid passing + // these at the SYCL RT level instead, see + // https://github.com/intel/llvm/issues/18421 + if (launchPropList[propIndex].id == UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE || + launchPropList[propIndex].id == + UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE) { + continue; + } + return UR_RESULT_ERROR_INVALID_OPERATION; + } + + clSetKernelArgMemPointerINTEL_fn SetKernelArgMemPointerPtr = nullptr; + UR_RETURN_ON_FAILURE( + cl_ext::getExtFuncFromContext( + hQueue->Context->CLContext, + ur::cl::getAdapter()->fnCache.clSetKernelArgMemPointerINTELCache, + cl_ext::SetKernelArgMemPointerName, &SetKernelArgMemPointerPtr)); + + for (uint32_t i = 0; i < numArgs; i++) { + switch (pArgs[i].type) { + case UR_EXP_KERNEL_ARG_TYPE_LOCAL: + CL_RETURN_ON_FAILURE(clSetKernelArg(hKernel->CLKernel, + static_cast(pArgs[i].index), + pArgs[i].size, nullptr)); + break; + case UR_EXP_KERNEL_ARG_TYPE_VALUE: + CL_RETURN_ON_FAILURE(clSetKernelArg(hKernel->CLKernel, + static_cast(pArgs[i].index), + pArgs[i].size, pArgs[i].value.value)); + break; + case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: { + cl_mem mem = pArgs[i].value.memObjTuple.hMem + ? pArgs[i].value.memObjTuple.hMem->CLMemory + : nullptr; + CL_RETURN_ON_FAILURE(clSetKernelArg(hKernel->CLKernel, + static_cast(pArgs[i].index), + pArgs[i].size, &mem)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_POINTER: + CL_RETURN_ON_FAILURE(SetKernelArgMemPointerPtr( + hKernel->CLKernel, static_cast(pArgs[i].index), + pArgs[i].value.pointer)); + break; + case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: { + CL_RETURN_ON_FAILURE(clSetKernelArg( + hKernel->CLKernel, static_cast(pArgs[i].index), + pArgs[i].size, &pArgs[i].value.sampler->CLSampler)); + break; + } + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + + std::vector compiledLocalWorksize; + if (!pLocalWorkSize) { + cl_device_id device = nullptr; + CL_RETURN_ON_FAILURE(clGetCommandQueueInfo( + hQueue->CLQueue, CL_QUEUE_DEVICE, sizeof(device), &device, nullptr)); + // This query always returns size_t[3], if nothing was specified it + // returns all zeroes. + size_t queriedLocalWorkSize[3] = {0, 0, 0}; + CL_RETURN_ON_FAILURE(clGetKernelWorkGroupInfo( + hKernel->CLKernel, device, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, + sizeof(size_t[3]), queriedLocalWorkSize, nullptr)); + if (queriedLocalWorkSize[0] != 0) { + for (uint32_t i = 0; i < 3; i++) { + compiledLocalWorksize.push_back(queriedLocalWorkSize[i]); + } + } + } + + cl_event Event; + std::vector CLWaitEvents(numEventsInWaitList); + MapUREventsToCL(numEventsInWaitList, phEventWaitList, CLWaitEvents); + CL_RETURN_ON_FAILURE(clEnqueueNDRangeKernel( + hQueue->CLQueue, hKernel->CLKernel, workDim, pGlobalWorkOffset, + pGlobalWorkSize, + compiledLocalWorksize.empty() ? pLocalWorkSize + : compiledLocalWorksize.data(), + numEventsInWaitList, CLWaitEvents.data(), ifUrEvent(phEvent, Event))); + + UR_RETURN_ON_FAILURE(createUREvent(Event, hQueue->Context, hQueue, phEvent)); + return UR_RESULT_SUCCESS; +} diff --git a/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp b/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp index c619fa36b1ab0..18cc6a79651be 100644 --- a/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp @@ -434,6 +434,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; pDdiTable->pfnNativeCommandExp = urEnqueueNativeCommandExp; pDdiTable->pfnCommandBufferExp = urEnqueueCommandBufferExp; + pDdiTable->pfnKernelLaunchWithArgsExp = urEnqueueKernelLaunchWithArgsExp; return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/common/stype_map_helpers.def b/unified-runtime/source/common/stype_map_helpers.def index 79705826395b9..efd69e6ae4cb3 100644 --- a/unified-runtime/source/common/stype_map_helpers.def +++ b/unified-runtime/source/common/stype_map_helpers.def @@ -168,3 +168,6 @@ struct stype_map template <> struct stype_map : stype_map_impl {}; +template <> +struct stype_map + : stype_map_impl {}; diff --git a/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp b/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp index 012aa1422cfa1..7b8ebbd7ff36d 100644 --- a/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp @@ -1646,6 +1646,119 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetInfo( return pfnGetInfo(hDevice, propName, propSize, pPropValue, pPropSizeRet); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueKernelLaunch +ur_result_t urEnqueueKernelLaunchWithArgsExp( + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] handle of the kernel object + ur_kernel_handle_t hKernel, + /// [in] number of dimensions, from 1 to 3, to specify the global and + /// work-group work-items + uint32_t workDim, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of workDim unsigned values that specify the + /// number of global work-items in workDim that will execute the kernel + /// function + const size_t *pGlobalWorkSize, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the number of local work-items forming a work-group that will + /// execute the kernel function. + /// If nullptr, the runtime implementation will choose the work-group size. + const size_t *pLocalWorkSize, + /// [in] Number of entries in pArgs + uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. + const ur_exp_kernel_arg_properties_t *pArgs, + /// [in] size of the launch prop list + uint32_t numPropsInLaunchPropList, + /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch + /// properties + const ur_kernel_launch_property_t *launchPropList, + /// [in] size of the event wait list + uint32_t numEventsInWaitList, + /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of + /// events that must be complete before the kernel execution. If + /// nullptr, the numEventsInWaitList must be 0, indicating that no wait + /// event. + const ur_event_handle_t *phEventWaitList, + /// [out][optional] return an event object that identifies this + /// particular kernel execution instance. + ur_event_handle_t *phEvent) { + // This mutex is to prevent concurrent kernel launches across different queues + // as the DeviceASAN local/private shadow memory does not support concurrent + // kernel launches now. + std::scoped_lock Guard( + getAsanInterceptor()->KernelLaunchMutex); + + UR_LOG_L(getContext()->logger, DEBUG, + "==== urEnqueueKernelLaunchWithArgsExp"); + + // We need to set all the args now rather than letting LaunchWithArgs handle + // them. This is because some implementations of + // urKernelGetSuggestedLocalWorkSize, which is used in preLaunchKernel, rely + // on all the args being set. + for (uint32_t ArgPropIndex = 0; ArgPropIndex < numArgs; ArgPropIndex++) { + switch (pArgs[ArgPropIndex].type) { + case UR_EXP_KERNEL_ARG_TYPE_LOCAL: { + UR_CALL(ur_sanitizer_layer::asan::urKernelSetArgLocal( + hKernel, pArgs[ArgPropIndex].index, pArgs[ArgPropIndex].size, + nullptr)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_POINTER: { + UR_CALL(ur_sanitizer_layer::asan::urKernelSetArgPointer( + hKernel, pArgs[ArgPropIndex].index, nullptr, + pArgs[ArgPropIndex].value.pointer)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_VALUE: { + UR_CALL(ur_sanitizer_layer::asan::urKernelSetArgValue( + hKernel, pArgs[ArgPropIndex].index, pArgs[ArgPropIndex].size, nullptr, + pArgs[ArgPropIndex].value.value)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: { + ur_kernel_arg_mem_obj_properties_t Properties = { + UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES, nullptr, + pArgs[ArgPropIndex].value.memObjTuple.flags}; + UR_CALL(ur_sanitizer_layer::asan::urKernelSetArgMemObj( + hKernel, pArgs[ArgPropIndex].index, &Properties, + pArgs[ArgPropIndex].value.memObjTuple.hMem)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: { + auto pfnKernelSetArgSampler = + getContext()->urDdiTable.Kernel.pfnSetArgSampler; + UR_CALL(pfnKernelSetArgSampler(hKernel, pArgs[ArgPropIndex].index, + nullptr, + pArgs[ArgPropIndex].value.sampler)); + break; + } + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + + LaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), pGlobalWorkSize, + pLocalWorkSize, pGlobalWorkOffset, 3); + UR_CALL(LaunchInfo.Data.syncToDevice(hQueue)); + + UR_CALL(getAsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo)); + + UR_CALL(getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp( + hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, + LaunchInfo.LocalWorkSize.data(), 0, nullptr, numPropsInLaunchPropList, + launchPropList, numEventsInWaitList, phEventWaitList, phEvent)); + + UR_CALL(getAsanInterceptor()->postLaunchKernel(hKernel, hQueue, LaunchInfo)); + + return UR_RESULT_SUCCESS; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Exported function for filling application's Adapter table /// with current process' addresses @@ -1961,6 +2074,22 @@ __urdlllocal ur_result_t UR_APICALL urGetDeviceProcAddrTable( return result; } +/// @brief Exported function for filling application's ProgramExp table +/// with current process' addresses +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +ur_result_t urGetEnqueueExpProcAddrTable( + /// [in,out] pointer to table of DDI function pointers + ur_enqueue_exp_dditable_t *pDdiTable) { + ur_result_t result = UR_RESULT_SUCCESS; + + pDdiTable->pfnKernelLaunchWithArgsExp = + ur_sanitizer_layer::asan::urEnqueueKernelLaunchWithArgsExp; + + return result; +} template struct NotSupportedApi; @@ -2157,6 +2286,11 @@ ur_result_t initAsanDDITable(ur_dditable_t *dditable) { UR_API_VERSION_CURRENT, &dditable->VirtualMem); } + if (UR_RESULT_SUCCESS == result) { + result = ur_sanitizer_layer::asan::urGetEnqueueExpProcAddrTable( + &dditable->EnqueueExp); + } + if (result != UR_RESULT_SUCCESS) { UR_LOG_L(getContext()->logger, ERR, "Initialize ASAN DDI table failed: {}", result); diff --git a/unified-runtime/source/loader/layers/sanitizer/msan/msan_ddi.cpp b/unified-runtime/source/loader/layers/sanitizer/msan/msan_ddi.cpp index 810fd76120b2d..f9625b3b1d9e7 100644 --- a/unified-runtime/source/loader/layers/sanitizer/msan/msan_ddi.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/msan/msan_ddi.cpp @@ -1811,6 +1811,122 @@ ur_result_t urEnqueueUSMMemcpy2D( return UR_RESULT_SUCCESS; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueKernelLaunch +ur_result_t urEnqueueKernelLaunchWithArgsExp( + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] handle of the kernel object + ur_kernel_handle_t hKernel, + /// [in] number of dimensions, from 1 to 3, to specify the global and + /// work-group work-items + uint32_t workDim, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of workDim unsigned values that specify the + /// number of global work-items in workDim that will execute the kernel + /// function + const size_t *pGlobalWorkSize, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the number of local work-items forming a work-group that will + /// execute the kernel function. + /// If nullptr, the runtime implementation will choose the work-group size. + const size_t *pLocalWorkSize, + /// [in] Number of entries in pArgs + uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. + const ur_exp_kernel_arg_properties_t *pArgs, + /// [in] size of the launch prop list + uint32_t numPropsInLaunchPropList, + /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch + /// properties + const ur_kernel_launch_property_t *launchPropList, + /// [in] size of the event wait list + uint32_t numEventsInWaitList, + /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of + /// events that must be complete before the kernel execution. If + /// nullptr, the numEventsInWaitList must be 0, indicating that no wait + /// event. + const ur_event_handle_t *phEventWaitList, + /// [out][optional] return an event object that identifies this + /// particular kernel execution instance. + ur_event_handle_t *phEvent) { + // This mutex is to prevent concurrent kernel launches across different queues + // as the DeviceMSAN local/private shadow memory does not support concurrent + // kernel launches now. + std::scoped_lock Guard( + getMsanInterceptor()->KernelLaunchMutex); + + UR_LOG_L(getContext()->logger, DEBUG, + "==== urEnqueueKernelLaunchWithArgsExp"); + + // We need to set all the args now rather than letting LaunchWithArgs handle + // them. This is because some implementations of + // urKernelGetSuggestedLocalWorkSize, which is used in preLaunchKernel, rely + // on all the args being set. + for (uint32_t ArgPropIndex = 0; ArgPropIndex < numArgs; ArgPropIndex++) { + switch (pArgs[ArgPropIndex].type) { + case UR_EXP_KERNEL_ARG_TYPE_LOCAL: { + UR_CALL(ur_sanitizer_layer::msan::urKernelSetArgLocal( + hKernel, pArgs[ArgPropIndex].index, pArgs[ArgPropIndex].size, + nullptr)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_POINTER: { + auto pfnKernelSetArgPointer = + getContext()->urDdiTable.Kernel.pfnSetArgPointer; + UR_CALL(pfnKernelSetArgPointer(hKernel, pArgs[ArgPropIndex].index, + nullptr, + pArgs[ArgPropIndex].value.pointer)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_VALUE: { + UR_CALL(ur_sanitizer_layer::msan::urKernelSetArgValue( + hKernel, pArgs[ArgPropIndex].index, pArgs[ArgPropIndex].size, nullptr, + pArgs[ArgPropIndex].value.value)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: { + ur_kernel_arg_mem_obj_properties_t Properties = { + UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES, nullptr, + pArgs[ArgPropIndex].value.memObjTuple.flags}; + UR_CALL(ur_sanitizer_layer::msan::urKernelSetArgMemObj( + hKernel, pArgs[ArgPropIndex].index, &Properties, + pArgs[ArgPropIndex].value.memObjTuple.hMem)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: { + auto pfnKernelSetArgSampler = + getContext()->urDdiTable.Kernel.pfnSetArgSampler; + UR_CALL(pfnKernelSetArgSampler(hKernel, pArgs[ArgPropIndex].index, + nullptr, + pArgs[ArgPropIndex].value.sampler)); + break; + } + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + + USMLaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), + pGlobalWorkSize, pLocalWorkSize, pGlobalWorkOffset, + 3); + UR_CALL(LaunchInfo.initialize()); + + UR_CALL(getMsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo)); + + UR_CALL(getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp( + hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, + LaunchInfo.LocalWorkSize.data(), 0, nullptr, numPropsInLaunchPropList, + launchPropList, numEventsInWaitList, phEventWaitList, phEvent)); + + UR_CALL(getMsanInterceptor()->postLaunchKernel(hKernel, hQueue, LaunchInfo)); + + return UR_RESULT_SUCCESS; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Exported function for filling application's Adapter table /// with current process' addresses @@ -1989,6 +2105,22 @@ ur_result_t urGetUSMProcAddrTable( return result; } +/// @brief Exported function for filling application's ProgramExp table +/// with current process' addresses +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +ur_result_t urGetEnqueueExpProcAddrTable( + /// [in,out] pointer to table of DDI function pointers + ur_enqueue_exp_dditable_t *pDdiTable) { + ur_result_t result = UR_RESULT_SUCCESS; + + pDdiTable->pfnKernelLaunchWithArgsExp = + ur_sanitizer_layer::msan::urEnqueueKernelLaunchWithArgsExp; + + return result; +} ur_result_t urCheckVersion(ur_api_version_t version) { if (UR_MAJOR_VERSION(ur_sanitizer_layer::getContext()->version) != @@ -2054,6 +2186,11 @@ ur_result_t initMsanDDITable(ur_dditable_t *dditable) { result = ur_sanitizer_layer::msan::urGetUSMProcAddrTable(&dditable->USM); } + if (UR_RESULT_SUCCESS == result) { + result = ur_sanitizer_layer::msan::urGetEnqueueExpProcAddrTable( + &dditable->EnqueueExp); + } + if (result != UR_RESULT_SUCCESS) { UR_LOG_L(getContext()->logger, ERR, "Initialize MSAN DDI table failed: {}", result); diff --git a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp index 61849ac0b363a..7934dbf596e40 100644 --- a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp @@ -1337,6 +1337,120 @@ ur_result_t urEnqueueKernelLaunch( return UR_RESULT_SUCCESS; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueKernelLaunch +ur_result_t urEnqueueKernelLaunchWithArgsExp( + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] handle of the kernel object + ur_kernel_handle_t hKernel, + /// [in] number of dimensions, from 1 to 3, to specify the global and + /// work-group work-items + uint32_t workDim, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of workDim unsigned values that specify the + /// number of global work-items in workDim that will execute the kernel + /// function + const size_t *pGlobalWorkSize, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the number of local work-items forming a work-group that will + /// execute the kernel function. + /// If nullptr, the runtime implementation will choose the work-group size. + const size_t *pLocalWorkSize, + /// [in] Number of entries in pArgs + uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. + const ur_exp_kernel_arg_properties_t *pArgs, + /// [in] size of the launch prop list + uint32_t numPropsInLaunchPropList, + /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch + /// properties + const ur_kernel_launch_property_t *launchPropList, + /// [in] size of the event wait list + uint32_t numEventsInWaitList, + /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of + /// events that must be complete before the kernel execution. If + /// nullptr, the numEventsInWaitList must be 0, indicating that no wait + /// event. + const ur_event_handle_t *phEventWaitList, + /// [out][optional] return an event object that identifies this + /// particular kernel execution instance. + ur_event_handle_t *phEvent) { + // This mutex is to prevent concurrent kernel launches across different queues + // as the DeviceTSAN local shadow memory does not support concurrent + // kernel launches now. + std::scoped_lock Guard( + getTsanInterceptor()->KernelLaunchMutex); + + UR_LOG_L(getContext()->logger, DEBUG, + "==== urEnqueueKernelLaunchWithArgsExp"); + + // We need to set all the args now rather than letting LaunchWithArgs handle + // them. This is because some implementations of + // urKernelGetSuggestedLocalWorkSize, which is used in preLaunchKernel, rely + // on all the args being set. + for (uint32_t ArgPropIndex = 0; ArgPropIndex < numArgs; ArgPropIndex++) { + switch (pArgs[ArgPropIndex].type) { + case UR_EXP_KERNEL_ARG_TYPE_LOCAL: { + UR_CALL(ur_sanitizer_layer::tsan::urKernelSetArgLocal( + hKernel, pArgs[ArgPropIndex].index, pArgs[ArgPropIndex].size, + nullptr)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_POINTER: { + auto pfnKernelSetArgPointer = + getContext()->urDdiTable.Kernel.pfnSetArgPointer; + UR_CALL(pfnKernelSetArgPointer(hKernel, pArgs[ArgPropIndex].index, + nullptr, + pArgs[ArgPropIndex].value.pointer)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_VALUE: { + UR_CALL(ur_sanitizer_layer::tsan::urKernelSetArgValue( + hKernel, pArgs[ArgPropIndex].index, pArgs[ArgPropIndex].size, nullptr, + pArgs[ArgPropIndex].value.value)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: { + ur_kernel_arg_mem_obj_properties_t Properties = { + UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES, nullptr, + pArgs[ArgPropIndex].value.memObjTuple.flags}; + UR_CALL(ur_sanitizer_layer::tsan::urKernelSetArgMemObj( + hKernel, pArgs[ArgPropIndex].index, &Properties, + pArgs[ArgPropIndex].value.memObjTuple.hMem)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: { + auto pfnKernelSetArgSampler = + getContext()->urDdiTable.Kernel.pfnSetArgSampler; + UR_CALL(pfnKernelSetArgSampler(hKernel, pArgs[ArgPropIndex].index, + nullptr, + pArgs[ArgPropIndex].value.sampler)); + break; + } + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + + LaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), pGlobalWorkSize, + pLocalWorkSize, pGlobalWorkOffset, 3); + + UR_CALL(getTsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo)); + + UR_CALL(getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp( + hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, 0, nullptr, numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent)); + + UR_CALL(getTsanInterceptor()->postLaunchKernel(hKernel, hQueue, LaunchInfo)); + + return UR_RESULT_SUCCESS; +} + ur_result_t urCheckVersion(ur_api_version_t version) { if (UR_MAJOR_VERSION(ur_sanitizer_layer::getContext()->version) != UR_MAJOR_VERSION(version) || @@ -1547,6 +1661,22 @@ __urdlllocal ur_result_t UR_APICALL urGetEnqueueProcAddrTable( return UR_RESULT_SUCCESS; } +/// @brief Exported function for filling application's ProgramExp table +/// with current process' addresses +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +ur_result_t urGetEnqueueExpProcAddrTable( + /// [in,out] pointer to table of DDI function pointers + ur_enqueue_exp_dditable_t *pDdiTable) { + ur_result_t result = UR_RESULT_SUCCESS; + + pDdiTable->pfnKernelLaunchWithArgsExp = + ur_sanitizer_layer::tsan::urEnqueueKernelLaunchWithArgsExp; + + return result; +} } // namespace tsan ur_result_t initTsanDDITable(ur_dditable_t *dditable) { @@ -1596,6 +1726,11 @@ ur_result_t initTsanDDITable(ur_dditable_t *dditable) { ur_sanitizer_layer::tsan::urGetEnqueueProcAddrTable(&dditable->Enqueue); } + if (UR_RESULT_SUCCESS == result) { + result = ur_sanitizer_layer::tsan::urGetEnqueueExpProcAddrTable( + &dditable->EnqueueExp); + } + if (result != UR_RESULT_SUCCESS) { UR_LOG_L(getContext()->logger, ERR, "Initialize TSAN DDI table failed: {}", result); diff --git a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp index 80993d811a397..31e034e258239 100644 --- a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp +++ b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp @@ -10097,6 +10097,98 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueKernelLaunchWithArgsExp +__urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] handle of the kernel object + ur_kernel_handle_t hKernel, + /// [in] number of dimensions, from 1 to 3, to specify the global and + /// work-group work-items + uint32_t workDim, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of workDim unsigned values that specify the + /// number of global work-items in workDim that will execute the kernel + /// function + const size_t *pGlobalWorkSize, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the number of local work-items forming a work-group that will + /// execute the kernel function. + /// If nullptr, the runtime implementation will choose the work-group size. + const size_t *pLocalWorkSize, + /// [in] Number of entries in pArgs + uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. + const ur_exp_kernel_arg_properties_t *pArgs, + /// [in] size of the launch prop list + uint32_t numPropsInLaunchPropList, + /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list + /// of launch properties + const ur_kernel_launch_property_t *launchPropList, + /// [in] size of the event wait list + uint32_t numEventsInWaitList, + /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of + /// events that must be complete before the kernel execution. + /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait + /// event. + const ur_event_handle_t *phEventWaitList, + /// [out][optional][alloc] return an event object that identifies this + /// particular kernel execution instance. If phEventWaitList and phEvent + /// are not NULL, phEvent must not refer to an element of the + /// phEventWaitList array. + ur_event_handle_t *phEvent) { + auto pfnKernelLaunchWithArgsExp = + getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp; + + if (nullptr == pfnKernelLaunchWithArgsExp) + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + + ur_enqueue_kernel_launch_with_args_exp_params_t params = { + &hQueue, + &hKernel, + &workDim, + &pGlobalWorkOffset, + &pGlobalWorkSize, + &pLocalWorkSize, + &numArgs, + &pArgs, + &numPropsInLaunchPropList, + &launchPropList, + &numEventsInWaitList, + &phEventWaitList, + &phEvent}; + uint64_t instance = getContext()->notify_begin( + UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP, + "urEnqueueKernelLaunchWithArgsExp", ¶ms); + + auto &logger = getContext()->logger; + UR_LOG_L(logger, INFO, " ---> urEnqueueKernelLaunchWithArgsExp\n"); + + ur_result_t result = pfnKernelLaunchWithArgsExp( + hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, numArgs, pArgs, numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent); + + getContext()->notify_end(UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP, + "urEnqueueKernelLaunchWithArgsExp", ¶ms, &result, + instance); + + if (logger.getLevel() <= UR_LOGGER_LEVEL_INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP, ¶ms); + UR_LOG_L(logger, INFO, + " <--- urEnqueueKernelLaunchWithArgsExp({}) -> {};\n", + args_str.str(), result); + } + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urEnqueueEventsWaitWithBarrierExt __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrierExt( @@ -10707,6 +10799,10 @@ __urdlllocal ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( ur_result_t result = UR_RESULT_SUCCESS; + dditable.pfnKernelLaunchWithArgsExp = pDdiTable->pfnKernelLaunchWithArgsExp; + pDdiTable->pfnKernelLaunchWithArgsExp = + ur_tracing_layer::urEnqueueKernelLaunchWithArgsExp; + dditable.pfnUSMDeviceAllocExp = pDdiTable->pfnUSMDeviceAllocExp; pDdiTable->pfnUSMDeviceAllocExp = ur_tracing_layer::urEnqueueUSMDeviceAllocExp; diff --git a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp index 6b6afc878eead..8b1dc6c64e246 100644 --- a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp +++ b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp @@ -10870,6 +10870,122 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueKernelLaunchWithArgsExp +__urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] handle of the kernel object + ur_kernel_handle_t hKernel, + /// [in] number of dimensions, from 1 to 3, to specify the global and + /// work-group work-items + uint32_t workDim, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of workDim unsigned values that specify the + /// number of global work-items in workDim that will execute the kernel + /// function + const size_t *pGlobalWorkSize, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the number of local work-items forming a work-group that will + /// execute the kernel function. + /// If nullptr, the runtime implementation will choose the work-group size. + const size_t *pLocalWorkSize, + /// [in] Number of entries in pArgs + uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. + const ur_exp_kernel_arg_properties_t *pArgs, + /// [in] size of the launch prop list + uint32_t numPropsInLaunchPropList, + /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list + /// of launch properties + const ur_kernel_launch_property_t *launchPropList, + /// [in] size of the event wait list + uint32_t numEventsInWaitList, + /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of + /// events that must be complete before the kernel execution. + /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait + /// event. + const ur_event_handle_t *phEventWaitList, + /// [out][optional][alloc] return an event object that identifies this + /// particular kernel execution instance. If phEventWaitList and phEvent + /// are not NULL, phEvent must not refer to an element of the + /// phEventWaitList array. + ur_event_handle_t *phEvent) { + auto pfnKernelLaunchWithArgsExp = + getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp; + + if (nullptr == pfnKernelLaunchWithArgsExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + if (getContext()->enableParameterValidation) { + if (NULL == pGlobalWorkSize) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + + if (launchPropList == NULL && numPropsInLaunchPropList > 0) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + + if (pArgs == NULL && numArgs > 0) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + + if (NULL == hQueue) + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + + if (NULL == hKernel) + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + + if (NULL != pArgs && UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type) + return UR_RESULT_ERROR_INVALID_ENUMERATION; + + if (phEventWaitList == NULL && numEventsInWaitList > 0) + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + + if (phEventWaitList != NULL && numEventsInWaitList == 0) + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + + if (pGlobalWorkSize[0] == 0 || pGlobalWorkSize[1] == 0 || + pGlobalWorkSize[2] == 0) + return UR_RESULT_ERROR_INVALID_WORK_DIMENSION; + + if (pLocalWorkSize && (pLocalWorkSize[0] == 0 || pLocalWorkSize[1] == 0 || + pLocalWorkSize[2] == 0)) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + + if (phEventWaitList != NULL && numEventsInWaitList > 0) { + for (uint32_t i = 0; i < numEventsInWaitList; ++i) { + if (phEventWaitList[i] == NULL) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + } + } + } + + if (getContext()->enableLifetimeValidation && + !getContext()->refCountContext->isReferenceValid(hQueue)) { + URLOG_CTX_INVALID_REFERENCE(hQueue); + } + + if (getContext()->enableLifetimeValidation && + !getContext()->refCountContext->isReferenceValid(hKernel)) { + URLOG_CTX_INVALID_REFERENCE(hKernel); + } + + ur_result_t result = pfnKernelLaunchWithArgsExp( + hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, numArgs, pArgs, numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent); + + if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS && + phEvent) { + getContext()->refCountContext->createRefCount(*phEvent); + } + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urEnqueueEventsWaitWithBarrierExt __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrierExt( @@ -11505,6 +11621,10 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( ur_result_t result = UR_RESULT_SUCCESS; + dditable.pfnKernelLaunchWithArgsExp = pDdiTable->pfnKernelLaunchWithArgsExp; + pDdiTable->pfnKernelLaunchWithArgsExp = + ur_validation_layer::urEnqueueKernelLaunchWithArgsExp; + dditable.pfnUSMDeviceAllocExp = pDdiTable->pfnUSMDeviceAllocExp; pDdiTable->pfnUSMDeviceAllocExp = ur_validation_layer::urEnqueueUSMDeviceAllocExp; diff --git a/unified-runtime/source/loader/loader.def.in b/unified-runtime/source/loader/loader.def.in index e86a6c65a7957..d337292275873 100644 --- a/unified-runtime/source/loader/loader.def.in +++ b/unified-runtime/source/loader/loader.def.in @@ -76,6 +76,7 @@ EXPORTS urEnqueueEventsWaitWithBarrier urEnqueueEventsWaitWithBarrierExt urEnqueueKernelLaunch + urEnqueueKernelLaunchWithArgsExp urEnqueueMemBufferCopy urEnqueueMemBufferCopyRect urEnqueueMemBufferFill @@ -288,6 +289,7 @@ EXPORTS urPrintEnqueueEventsWaitWithBarrierExtParams urPrintEnqueueEventsWaitWithBarrierParams urPrintEnqueueKernelLaunchParams + urPrintEnqueueKernelLaunchWithArgsExpParams urPrintEnqueueMemBufferCopyParams urPrintEnqueueMemBufferCopyRectParams urPrintEnqueueMemBufferFillParams @@ -349,6 +351,9 @@ EXPORTS urPrintExpImageCopyInputTypes urPrintExpImageCopyRegion urPrintExpImageMemType + urPrintExpKernelArgMemObjTuple + urPrintExpKernelArgProperties + urPrintExpKernelArgType urPrintExpPeerInfo urPrintExpSamplerAddrModes urPrintExpSamplerCubemapFilterMode diff --git a/unified-runtime/source/loader/loader.map.in b/unified-runtime/source/loader/loader.map.in index 6a30c9186f674..023032aecea01 100644 --- a/unified-runtime/source/loader/loader.map.in +++ b/unified-runtime/source/loader/loader.map.in @@ -76,6 +76,7 @@ urEnqueueEventsWaitWithBarrier; urEnqueueEventsWaitWithBarrierExt; urEnqueueKernelLaunch; + urEnqueueKernelLaunchWithArgsExp; urEnqueueMemBufferCopy; urEnqueueMemBufferCopyRect; urEnqueueMemBufferFill; @@ -288,6 +289,7 @@ urPrintEnqueueEventsWaitWithBarrierExtParams; urPrintEnqueueEventsWaitWithBarrierParams; urPrintEnqueueKernelLaunchParams; + urPrintEnqueueKernelLaunchWithArgsExpParams; urPrintEnqueueMemBufferCopyParams; urPrintEnqueueMemBufferCopyRectParams; urPrintEnqueueMemBufferFillParams; @@ -349,6 +351,9 @@ urPrintExpImageCopyInputTypes; urPrintExpImageCopyRegion; urPrintExpImageMemType; + urPrintExpKernelArgMemObjTuple; + urPrintExpKernelArgProperties; + urPrintExpKernelArgType; urPrintExpPeerInfo; urPrintExpSamplerAddrModes; urPrintExpSamplerCubemapFilterMode; diff --git a/unified-runtime/source/loader/ur_ldrddi.cpp b/unified-runtime/source/loader/ur_ldrddi.cpp index 80ff806b0e116..e0779fa823d49 100644 --- a/unified-runtime/source/loader/ur_ldrddi.cpp +++ b/unified-runtime/source/loader/ur_ldrddi.cpp @@ -5737,6 +5737,65 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( pPropValue, pPropSizeRet); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueKernelLaunchWithArgsExp +__urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] handle of the kernel object + ur_kernel_handle_t hKernel, + /// [in] number of dimensions, from 1 to 3, to specify the global and + /// work-group work-items + uint32_t workDim, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of workDim unsigned values that specify the + /// number of global work-items in workDim that will execute the kernel + /// function + const size_t *pGlobalWorkSize, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the number of local work-items forming a work-group that will + /// execute the kernel function. + /// If nullptr, the runtime implementation will choose the work-group size. + const size_t *pLocalWorkSize, + /// [in] Number of entries in pArgs + uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. + const ur_exp_kernel_arg_properties_t *pArgs, + /// [in] size of the launch prop list + uint32_t numPropsInLaunchPropList, + /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list + /// of launch properties + const ur_kernel_launch_property_t *launchPropList, + /// [in] size of the event wait list + uint32_t numEventsInWaitList, + /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of + /// events that must be complete before the kernel execution. + /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait + /// event. + const ur_event_handle_t *phEventWaitList, + /// [out][optional][alloc] return an event object that identifies this + /// particular kernel execution instance. If phEventWaitList and phEvent + /// are not NULL, phEvent must not refer to an element of the + /// phEventWaitList array. + ur_event_handle_t *phEvent) { + + auto *dditable = *reinterpret_cast(hQueue); + + auto *pfnKernelLaunchWithArgsExp = + dditable->EnqueueExp.pfnKernelLaunchWithArgsExp; + if (nullptr == pfnKernelLaunchWithArgsExp) + return UR_RESULT_ERROR_UNINITIALIZED; + + // forward to device-platform + return pfnKernelLaunchWithArgsExp( + hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, numArgs, pArgs, numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urEnqueueEventsWaitWithBarrierExt __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrierExt( @@ -6247,6 +6306,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( if (ur_loader::getContext()->platforms.size() != 1 || ur_loader::getContext()->forceIntercept) { // return pointers to loader's DDIs + pDdiTable->pfnKernelLaunchWithArgsExp = + ur_loader::urEnqueueKernelLaunchWithArgsExp; pDdiTable->pfnUSMDeviceAllocExp = ur_loader::urEnqueueUSMDeviceAllocExp; pDdiTable->pfnUSMSharedAllocExp = ur_loader::urEnqueueUSMSharedAllocExp; pDdiTable->pfnUSMHostAllocExp = ur_loader::urEnqueueUSMHostAllocExp; diff --git a/unified-runtime/source/loader/ur_libapi.cpp b/unified-runtime/source/loader/ur_libapi.cpp index 85a085c8fb6bf..647f1607735f7 100644 --- a/unified-runtime/source/loader/ur_libapi.cpp +++ b/unified-runtime/source/loader/ur_libapi.cpp @@ -10580,6 +10580,104 @@ ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Enqueue a command to execute a kernel +/// +/// @remarks +/// _Analogues_ +/// - **clEnqueueNDRangeKernel** +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hQueue` +/// + `NULL == hKernel` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pGlobalWorkSize` +/// + `launchPropList == NULL && numPropsInLaunchPropList > 0` +/// + `pArgs == NULL && numArgs > 0` +/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION +/// + `NULL != pArgs && ::UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type` +/// - ::UR_RESULT_ERROR_INVALID_QUEUE +/// - ::UR_RESULT_ERROR_INVALID_KERNEL +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS +/// + An event in `phEventWaitList` has ::UR_EVENT_STATUS_ERROR. +/// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION +/// + `pGlobalWorkSize[0] == 0 || pGlobalWorkSize[1] == 0 || +/// pGlobalWorkSize[2] == 0` +/// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE +/// + `pLocalWorkSize && (pLocalWorkSize[0] == 0 || pLocalWorkSize[1] == +/// 0 || pLocalWorkSize[2] == 0)` +/// - ::UR_RESULT_ERROR_INVALID_VALUE +/// - ::UR_RESULT_ERROR_INVALID_KERNEL_ARGS - "The kernel argument values +/// have not been specified." +/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +/// - ::UR_RESULT_ERROR_INVALID_OPERATION +/// + If any property in `launchPropList` isn't supported by the device. +ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] handle of the kernel object + ur_kernel_handle_t hKernel, + /// [in] number of dimensions, from 1 to 3, to specify the global and + /// work-group work-items + uint32_t workDim, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of workDim unsigned values that specify the + /// number of global work-items in workDim that will execute the kernel + /// function + const size_t *pGlobalWorkSize, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the number of local work-items forming a work-group that will + /// execute the kernel function. + /// If nullptr, the runtime implementation will choose the work-group size. + const size_t *pLocalWorkSize, + /// [in] Number of entries in pArgs + uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. + const ur_exp_kernel_arg_properties_t *pArgs, + /// [in] size of the launch prop list + uint32_t numPropsInLaunchPropList, + /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list + /// of launch properties + const ur_kernel_launch_property_t *launchPropList, + /// [in] size of the event wait list + uint32_t numEventsInWaitList, + /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of + /// events that must be complete before the kernel execution. + /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait + /// event. + const ur_event_handle_t *phEventWaitList, + /// [out][optional][alloc] return an event object that identifies this + /// particular kernel execution instance. If phEventWaitList and phEvent + /// are not NULL, phEvent must not refer to an element of the + /// phEventWaitList array. + ur_event_handle_t *phEvent) try { + auto pfnKernelLaunchWithArgsExp = + ur_lib::getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp; + if (nullptr == pfnKernelLaunchWithArgsExp) + return UR_RESULT_ERROR_UNINITIALIZED; + + return pfnKernelLaunchWithArgsExp( + hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, numArgs, pArgs, numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent); +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Enqueue a barrier command which waits a list of events to complete /// before it completes, with optional extended properties diff --git a/unified-runtime/source/loader/ur_print.cpp b/unified-runtime/source/loader/ur_print.cpp index 06619c8f7f625..81e41cb849688 100644 --- a/unified-runtime/source/loader/ur_print.cpp +++ b/unified-runtime/source/loader/ur_print.cpp @@ -1147,6 +1147,30 @@ ur_result_t urPrintExpPeerInfo(enum ur_exp_peer_info_t value, char *buffer, return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t urPrintExpKernelArgType(enum ur_exp_kernel_arg_type_t value, + char *buffer, const size_t buff_size, + size_t *out_size) { + std::stringstream ss; + ss << value; + return str_copy(&ss, buffer, buff_size, out_size); +} + +ur_result_t urPrintExpKernelArgMemObjTuple( + const struct ur_exp_kernel_arg_mem_obj_tuple_t params, char *buffer, + const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + +ur_result_t urPrintExpKernelArgProperties( + const struct ur_exp_kernel_arg_properties_t params, char *buffer, + const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintExpEnqueueExtFlags(enum ur_exp_enqueue_ext_flag_t value, char *buffer, const size_t buff_size, size_t *out_size) { @@ -1878,6 +1902,14 @@ ur_result_t urPrintEnqueueWriteHostPipeParams( return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t urPrintEnqueueKernelLaunchWithArgsExpParams( + const struct ur_enqueue_kernel_launch_with_args_exp_params_t *params, + char *buffer, const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintEnqueueEventsWaitWithBarrierExtParams( const struct ur_enqueue_events_wait_with_barrier_ext_params_t *params, char *buffer, const size_t buff_size, size_t *out_size) { diff --git a/unified-runtime/source/ur_api.cpp b/unified-runtime/source/ur_api.cpp index ebcdbe1a7882d..5bfb2bb96b51b 100644 --- a/unified-runtime/source/ur_api.cpp +++ b/unified-runtime/source/ur_api.cpp @@ -9210,6 +9210,95 @@ ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Enqueue a command to execute a kernel +/// +/// @remarks +/// _Analogues_ +/// - **clEnqueueNDRangeKernel** +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hQueue` +/// + `NULL == hKernel` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pGlobalWorkSize` +/// + `launchPropList == NULL && numPropsInLaunchPropList > 0` +/// + `pArgs == NULL && numArgs > 0` +/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION +/// + `NULL != pArgs && ::UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type` +/// - ::UR_RESULT_ERROR_INVALID_QUEUE +/// - ::UR_RESULT_ERROR_INVALID_KERNEL +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS +/// + An event in `phEventWaitList` has ::UR_EVENT_STATUS_ERROR. +/// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION +/// + `pGlobalWorkSize[0] == 0 || pGlobalWorkSize[1] == 0 || +/// pGlobalWorkSize[2] == 0` +/// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE +/// + `pLocalWorkSize && (pLocalWorkSize[0] == 0 || pLocalWorkSize[1] == +/// 0 || pLocalWorkSize[2] == 0)` +/// - ::UR_RESULT_ERROR_INVALID_VALUE +/// - ::UR_RESULT_ERROR_INVALID_KERNEL_ARGS - "The kernel argument values +/// have not been specified." +/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +/// - ::UR_RESULT_ERROR_INVALID_OPERATION +/// + If any property in `launchPropList` isn't supported by the device. +ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] handle of the kernel object + ur_kernel_handle_t hKernel, + /// [in] number of dimensions, from 1 to 3, to specify the global and + /// work-group work-items + uint32_t workDim, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of workDim unsigned values that specify the + /// number of global work-items in workDim that will execute the kernel + /// function + const size_t *pGlobalWorkSize, + /// [in][optional] pointer to an array of workDim unsigned values that + /// specify the number of local work-items forming a work-group that will + /// execute the kernel function. + /// If nullptr, the runtime implementation will choose the work-group size. + const size_t *pLocalWorkSize, + /// [in] Number of entries in pArgs + uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. + const ur_exp_kernel_arg_properties_t *pArgs, + /// [in] size of the launch prop list + uint32_t numPropsInLaunchPropList, + /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list + /// of launch properties + const ur_kernel_launch_property_t *launchPropList, + /// [in] size of the event wait list + uint32_t numEventsInWaitList, + /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of + /// events that must be complete before the kernel execution. + /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait + /// event. + const ur_event_handle_t *phEventWaitList, + /// [out][optional][alloc] return an event object that identifies this + /// particular kernel execution instance. If phEventWaitList and phEvent + /// are not NULL, phEvent must not refer to an element of the + /// phEventWaitList array. + ur_event_handle_t *phEvent) { + ur_result_t result = UR_RESULT_SUCCESS; + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Enqueue a barrier command which waits a list of events to complete /// before it completes, with optional extended properties diff --git a/unified-runtime/test/conformance/CMakeLists.txt b/unified-runtime/test/conformance/CMakeLists.txt index 5d579dbbf506b..c1ca49f8e992a 100644 --- a/unified-runtime/test/conformance/CMakeLists.txt +++ b/unified-runtime/test/conformance/CMakeLists.txt @@ -79,6 +79,7 @@ set(TEST_SUBDIRECTORIES_DPCXX "integration" "exp_command_buffer" "exp_enqueue_native" + "exp_enqueue_kernel_launch_with_args" "exp_usm_p2p" "memory-migrate" "usm" diff --git a/unified-runtime/test/conformance/exp_enqueue_kernel_launch_with_args/CMakeLists.txt b/unified-runtime/test/conformance/exp_enqueue_kernel_launch_with_args/CMakeLists.txt new file mode 100644 index 0000000000000..d03e5ef7c072f --- /dev/null +++ b/unified-runtime/test/conformance/exp_enqueue_kernel_launch_with_args/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (C) 2025 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +add_conformance_kernels_test( + exp_kernel_launch_with_args + urEnqueueKernelLaunchWithArgsExp.cpp +) diff --git a/unified-runtime/test/conformance/exp_enqueue_kernel_launch_with_args/urEnqueueKernelLaunchWithArgsExp.cpp b/unified-runtime/test/conformance/exp_enqueue_kernel_launch_with_args/urEnqueueKernelLaunchWithArgsExp.cpp new file mode 100644 index 0000000000000..093bc56004e9a --- /dev/null +++ b/unified-runtime/test/conformance/exp_enqueue_kernel_launch_with_args/urEnqueueKernelLaunchWithArgsExp.cpp @@ -0,0 +1,303 @@ +// Copyright (C) 2025 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +#include + +// This test runs a kernel with a mix of local memory, pointer and value args. +struct urEnqueueKernelLaunchWithArgsTest : uur::urKernelExecutionTest { + void SetUp() override { + program_name = "saxpy_usm_local_mem"; + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); + + ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, + sizeof(backend), &backend, nullptr)); + + // HIP has extra args for local memory so we define an offset for arg + // indices here for updating + hip_arg_offset = backend == UR_BACKEND_HIP ? 3 : 0; + ur_device_usm_access_capability_flags_t shared_usm_flags; + ASSERT_SUCCESS( + uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); + if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { + GTEST_SKIP() << "Shared USM is not supported."; + } + + const size_t allocation_size = + sizeof(uint32_t) * global_size[0] * local_size[0]; + for (auto &shared_ptr : shared_ptrs) { + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + allocation_size, &shared_ptr)); + ASSERT_NE(shared_ptr, nullptr); + + std::vector pattern(allocation_size); + uur::generateMemFillPattern(pattern); + std::memcpy(shared_ptr, pattern.data(), allocation_size); + } + uint32_t current_index = 0; + // Index 0 is local_mem_a arg + args.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, + nullptr, + UR_EXP_KERNEL_ARG_TYPE_LOCAL, + current_index++, + local_mem_a_size, + {nullptr}}); + + // Hip has extra args for local mem at index 1-3 + ur_exp_kernel_arg_value_t argValue = {}; + if (backend == UR_BACKEND_HIP) { + argValue.value = &hip_local_offset; + ur_exp_kernel_arg_properties_t local_offset = { + UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, + nullptr, + UR_EXP_KERNEL_ARG_TYPE_VALUE, + current_index++, + sizeof(hip_local_offset), + argValue}; + args.push_back(local_offset); + local_offset.index = current_index++; + args.push_back(local_offset); + local_offset.index = current_index++; + args.push_back(local_offset); + } + + // Index 1 is local_mem_b arg + args.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, + nullptr, + UR_EXP_KERNEL_ARG_TYPE_LOCAL, + current_index++, + local_mem_b_size, + {nullptr}}); + + if (backend == UR_BACKEND_HIP) { + argValue.value = &hip_local_offset; + ur_exp_kernel_arg_properties_t local_offset = { + UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, + nullptr, + UR_EXP_KERNEL_ARG_TYPE_VALUE, + current_index++, + sizeof(hip_local_offset), + argValue}; + args.push_back(local_offset); + local_offset.index = current_index++; + args.push_back(local_offset); + local_offset.index = current_index++; + args.push_back(local_offset); + } + + // Index 2 is output + argValue.pointer = shared_ptrs[0]; + args.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr, + UR_EXP_KERNEL_ARG_TYPE_POINTER, current_index++, + sizeof(shared_ptrs[0]), argValue}); + // Index 3 is A + argValue.value = &A; + args.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr, + UR_EXP_KERNEL_ARG_TYPE_VALUE, current_index++, sizeof(A), + argValue}); + // Index 4 is X + argValue.pointer = shared_ptrs[1]; + args.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr, + UR_EXP_KERNEL_ARG_TYPE_POINTER, current_index++, + sizeof(shared_ptrs[1]), argValue}); + // Index 5 is Y + argValue.pointer = shared_ptrs[2]; + args.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr, + UR_EXP_KERNEL_ARG_TYPE_POINTER, current_index++, + sizeof(shared_ptrs[2]), argValue}); + } + + void Validate(uint32_t *output, uint32_t *X, uint32_t *Y, uint32_t A, + size_t length, size_t local_size) { + for (size_t i = 0; i < length; i++) { + uint32_t result = A * X[i] + Y[i] + local_size; + ASSERT_EQ(result, output[i]); + } + } + + virtual void TearDown() override { + for (auto &shared_ptr : shared_ptrs) { + if (shared_ptr) { + EXPECT_SUCCESS(urUSMFree(context, shared_ptr)); + } + } + + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::TearDown()); + } + + static constexpr size_t local_size[3] = {4, 1, 1}; + static constexpr size_t local_mem_a_size = local_size[0] * sizeof(uint32_t); + static constexpr size_t local_mem_b_size = local_mem_a_size * 2; + static constexpr size_t global_size[3] = {16, 1, 1}; + static constexpr size_t global_offset[3] = {0, 0, 0}; + static constexpr uint32_t workDim = 3; + static constexpr uint32_t A = 42; + std::array shared_ptrs = {nullptr, nullptr, nullptr, nullptr, + nullptr}; + + uint32_t hip_arg_offset = 0; + static constexpr uint64_t hip_local_offset = 0; + ur_backend_t backend{}; + std::vector args; +}; +UUR_INSTANTIATE_DEVICE_TEST_SUITE(urEnqueueKernelLaunchWithArgsTest); + +TEST_P(urEnqueueKernelLaunchWithArgsTest, Success) { + ASSERT_SUCCESS(urEnqueueKernelLaunchWithArgsExp( + queue, kernel, workDim, global_offset, global_size, local_size, + args.size(), args.data(), 0, nullptr, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptrs[0]; + uint32_t *X = (uint32_t *)shared_ptrs[1]; + uint32_t *Y = (uint32_t *)shared_ptrs[2]; + Validate(output, X, Y, A, global_size[0], local_size[0]); +} + +TEST_P(urEnqueueKernelLaunchWithArgsTest, InvalidNullHandleQueue) { + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE, + urEnqueueKernelLaunchWithArgsExp( + nullptr, kernel, workDim, global_offset, global_size, + local_size, args.size(), args.data(), 0, nullptr, 0, + nullptr, nullptr)); +} + +TEST_P(urEnqueueKernelLaunchWithArgsTest, InvalidNullHandleKernel) { + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE, + urEnqueueKernelLaunchWithArgsExp( + queue, nullptr, workDim, global_offset, global_size, + local_size, args.size(), args.data(), 0, nullptr, 0, + nullptr, nullptr)); +} + +TEST_P(urEnqueueKernelLaunchWithArgsTest, InvalidNullPointerGlobalSize) { + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, + urEnqueueKernelLaunchWithArgsExp( + queue, kernel, workDim, global_offset, nullptr, + local_size, args.size(), args.data(), 0, nullptr, 0, + nullptr, nullptr)); +} + +TEST_P(urEnqueueKernelLaunchWithArgsTest, InvalidNullPointerProperties) { + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, + urEnqueueKernelLaunchWithArgsExp( + queue, kernel, workDim, global_offset, global_size, + local_size, args.size(), args.data(), 1, nullptr, 0, + nullptr, nullptr)); +} + +TEST_P(urEnqueueKernelLaunchWithArgsTest, InvalidNullPointerArgs) { + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, + urEnqueueKernelLaunchWithArgsExp( + queue, kernel, workDim, global_offset, global_size, + local_size, args.size(), nullptr, 0, nullptr, 0, nullptr, + nullptr)); +} + +TEST_P(urEnqueueKernelLaunchWithArgsTest, InvalidEventWaitList) { + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST, + urEnqueueKernelLaunchWithArgsExp( + queue, kernel, workDim, global_offset, global_size, + local_size, args.size(), args.data(), 0, nullptr, 1, + nullptr, nullptr)); + ur_event_handle_t event = nullptr; + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST, + urEnqueueKernelLaunchWithArgsExp( + queue, kernel, workDim, global_offset, global_size, + local_size, args.size(), args.data(), 0, nullptr, 0, + &event, nullptr)); +} + +// This test runs a kernel with a buffer (MEM_OBJ) arg. +struct urEnqueueKernelLaunchWithArgsMemObjTest : uur::urKernelExecutionTest { + void SetUp() override { + program_name = "fill"; + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); + + ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, + sizeof(backend), &backend, nullptr)); + + ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, + sizeof(val) * global_size[0], nullptr, + &buffer)); + + char zero = 0; + ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, buffer, &zero, sizeof(zero), 0, + buffer_size, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + // First argument is buffer to fill + unsigned current_arg_index = 0; + ur_exp_kernel_arg_mem_obj_tuple_t buffer_and_properties = {buffer, 0}; + ur_exp_kernel_arg_properties_t arg = { + UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, + nullptr, + UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ, + current_arg_index++, + sizeof(buffer), + {nullptr}}; + arg.value.memObjTuple = buffer_and_properties; + args.push_back(arg); + + // Add accessor arguments depending on backend. + // HIP has 3 offset parameters and other backends only have 1. + if (backend == UR_BACKEND_HIP) { + arg.type = UR_EXP_KERNEL_ARG_TYPE_VALUE; + arg.size = sizeof(hip_local_offset); + arg.value.value = &hip_local_offset; + arg.index = current_arg_index++; + args.push_back(arg); + arg.index = current_arg_index++; + args.push_back(arg); + arg.index = current_arg_index++; + args.push_back(arg); + } else { + arg.type = UR_EXP_KERNEL_ARG_TYPE_VALUE; + arg.index = current_arg_index++; + arg.size = sizeof(accessor); + arg.value.value = &accessor; + args.push_back(arg); + } + + // Second user defined argument is scalar to fill with. + arg.type = UR_EXP_KERNEL_ARG_TYPE_VALUE; + arg.index = current_arg_index++; + arg.size = sizeof(val); + arg.value.value = &val; + args.push_back(arg); + } + + void TearDown() override { + if (buffer) { + EXPECT_SUCCESS(urMemRelease(buffer)); + } + + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::TearDown()); + } + + static constexpr uint32_t val = 42; + static constexpr size_t global_size[3] = {32, 1, 1}; + static constexpr uint32_t workDim = 3; + static constexpr size_t buffer_size = sizeof(val) * global_size[0]; + static constexpr uint64_t hip_local_offset = 0; + ur_backend_t backend{}; + ur_mem_handle_t buffer = nullptr; + // This is the accessor offset struct sycl kernels expect to accompany buffer args. + struct { + size_t offsets[1] = {0}; + } accessor; + std::vector args; +}; +UUR_INSTANTIATE_DEVICE_TEST_SUITE(urEnqueueKernelLaunchWithArgsMemObjTest); + +TEST_P(urEnqueueKernelLaunchWithArgsMemObjTest, Success) { + ASSERT_SUCCESS(urEnqueueKernelLaunchWithArgsExp( + queue, kernel, workDim, nullptr, global_size, nullptr, args.size(), + args.data(), 0, nullptr, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + ValidateBuffer(buffer, buffer_size, val); +}