diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index f0908eb9742b5..8b81e68b6fd3c 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -54,7 +54,7 @@ event_impl::~event_impl() { void event_impl::waitInternal(bool *Success) { auto Handle = this->getHandle(); - if (!MIsHostEvent && Handle) { + if (Handle) { // Wait for the native event ur_result_t Err = getAdapter()->call_nocheck(1, &Handle); @@ -92,7 +92,7 @@ void event_impl::waitInternal(bool *Success) { } void event_impl::setComplete() { - if (MIsHostEvent || !this->getHandle()) { + if (!this->getHandle()) { { std::unique_lock lock(MMutex); #ifndef NDEBUG @@ -138,7 +138,6 @@ const AdapterPtr &event_impl::getAdapter() { void event_impl::setStateIncomplete() { MState = HES_NotComplete; } void event_impl::setContextImpl(const ContextImplPtr &Context) { - MIsHostEvent = Context == nullptr; MContext = Context; } @@ -159,6 +158,14 @@ event_impl::event_impl(ur_event_handle_t Event, const context &SyclContext) } } +void event_impl::allocateHostProfilingInfo() { + MHostProfilingInfo.reset(new HostProfilingInfo()); + if (!MHostProfilingInfo) + throw sycl::exception(sycl::make_error_code(sycl::errc::runtime), + "Out of host memory " + + codeToString(UR_RESULT_ERROR_OUT_OF_HOST_MEMORY)); +} + event_impl::event_impl(const QueueImplPtr &Queue) : MQueue{Queue}, MIsProfilingEnabled{!Queue || Queue->MIsProfilingEnabled}, MFallbackProfiling{MIsProfilingEnabled && Queue && @@ -167,12 +174,7 @@ event_impl::event_impl(const QueueImplPtr &Queue) this->setContextImpl(Queue->getContextImplPtr()); else { MState.store(HES_NotComplete); - MHostProfilingInfo.reset(new HostProfilingInfo()); - if (!MHostProfilingInfo) - throw sycl::exception( - sycl::make_error_code(sycl::errc::runtime), - "Out of host memory " + - codeToString(UR_RESULT_ERROR_OUT_OF_HOST_MEMORY)); + allocateHostProfilingInfo(); return; } MState.store(HES_Complete); @@ -400,7 +402,7 @@ uint64_t event_impl::get_profiling_info() { template <> uint32_t event_impl::get_info() { auto Handle = this->getHandle(); - if (!MIsHostEvent && Handle) { + if (Handle) { return get_event_info(Handle, this->getAdapter()); } @@ -496,18 +498,20 @@ void HostProfilingInfo::start() { StartTime = getTimestamp(); } void HostProfilingInfo::end() { EndTime = getTimestamp(); } ur_native_handle_t event_impl::getNative() { - if (isHost()) + auto Handle = getHandle(); + if (MIsHostEvent && !Handle) return {}; - initContextIfNeeded(); + initContextIfNeeded(); auto Adapter = getAdapter(); - auto Handle = getHandle(); + if (MIsDefaultConstructed && !Handle) { auto TempContext = MContext.get()->getHandleRef(); ur_event_native_properties_t NativeProperties{}; ur_event_handle_t UREvent = nullptr; Adapter->call( 0, TempContext, &NativeProperties, &UREvent); + Adapter->call(UREvent); this->setHandle(UREvent); Handle = UREvent; } @@ -627,8 +631,14 @@ bool event_impl::isCompleted() { void event_impl::setCommand(void *Cmd) { MCommand = Cmd; auto TypedCommand = static_cast(Cmd); - if (TypedCommand) - MIsHostEvent = TypedCommand->getWorkerContext() == nullptr; + if (TypedCommand && TypedCommand->getWorkerContext() == nullptr) + markAsHost(); +} + +void event_impl::markAsHost() { + MIsHostEvent = true; + if (!MHostProfilingInfo) + allocateHostProfilingInfo(); } } // namespace detail diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index e07098200b57d..3e419d1842e8b 100644 --- a/sycl/source/detail/event_impl.hpp +++ 
b/sycl/source/detail/event_impl.hpp @@ -349,6 +349,8 @@ class event_impl { return MEvent && MQueue.expired() && !MIsEnqueued && !MCommand; } + void markAsHost(); + protected: // When instrumentation is enabled emits trace event for event wait begin and // returns the telemetry event generated for the wait @@ -358,6 +360,7 @@ class event_impl { void instrumentationEpilog(void *TelementryEvent, const std::string &Name, int32_t StreamID, uint64_t IId) const; void checkProfilingPreconditions() const; + void allocateHostProfilingInfo(); std::atomic MEvent = nullptr; // Stores submission time of command associated with event diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 6df1e1ea4bf67..d9d3549219567 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -315,8 +315,10 @@ void queue_impl::addEvent(const event &Event) { } // As long as the queue supports urQueueFinish we only need to store events // for undiscarded, unenqueued commands and host tasks. - else if (MEmulateOOO || - (EImpl->getHandle() == nullptr && !EImpl->isDiscarded())) { + // On L0 with native user events a host task event can have a non-null + // handle, but we still need to sync host tasks explicitly. + else if (MEmulateOOO || ((EImpl->getHandle() == nullptr || EImpl->isHost()) && + !EImpl->isDiscarded())) { std::weak_ptr EventWeakPtr{EImpl}; std::lock_guard Lock{MMutex}; MEventsWeak.push_back(std::move(EventWeakPtr)); @@ -637,7 +639,8 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { EventImplWeakPtrIt->lock()) { // A nullptr UR event indicates that urQueueFinish will not cover it, // either because it's a host task event or an unenqueued one. - if (!SupportsPiFinish || nullptr == EventImplSharedPtr->getHandle()) { + if (!SupportsPiFinish || nullptr == EventImplSharedPtr->getHandle() || + EventImplSharedPtr->isHost()) { EventImplSharedPtr->wait(EventImplSharedPtr); } } diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index d42cb1c87e6cf..a28be75156cf5 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -755,6 +755,11 @@ class queue_impl { return ResEvent; } + bool nativeHostTaskHandling() { + return std::getenv("SYCL_ENABLE_USER_EVENTS_PATH") && + (MDevice->getBackend() == backend::ext_oneapi_level_zero); + } + #ifndef __INTEL_PREVIEW_BREAKING_CHANGES // CMPLRLLVM-66082 // These methods are for accessing a member that should live in the diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 93b78628c13b1..7ea5a68da5aa9 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -233,7 +233,7 @@ Command::getUrEvents(const std::vector &EventImpls, // current one is a host task. In this case we should not skip ur event due // to different sync mechanisms for different task types on in-order queue. if (CommandQueue && EventImpl->getWorkerQueue() == CommandQueue && - CommandQueue->isInOrder() && !IsHostTaskCommand) + CommandQueue->isInOrder() && (!IsHostTaskCommand)) continue; RetUrEvents.push_back(Handle); @@ -285,7 +285,7 @@ Command::getUrEventsBlocking(const std::vector &EventImpls, // kept.
if (!HasEventMode && MWorkerQueue && EventImpl->getWorkerQueue() == MWorkerQueue && - MWorkerQueue->isInOrder() && !isHostTask()) + MWorkerQueue->isInOrder() && (!isHostTask())) continue; RetUrEvents.push_back(EventImpl->getHandle()); @@ -502,7 +502,7 @@ void Command::waitForEvents(QueueImplPtr Queue, ur_event_handle_t &Event) { #ifndef NDEBUG for (const EventImplPtr &Event : EventImpls) - assert(!Event->isHost() && + assert(Event->getHandle() && "Only non-host events are expected to be waited for here"); #endif if (!EventImpls.empty()) { @@ -759,10 +759,11 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, // 2. Some types of commands do not produce UR events after they are // enqueued (e.g. alloca). Note that we can't check the ur event to make that // distinction since the command might still be unenqueued at this point. - bool PiEventExpected = - (!DepEvent->isHost() && !DepEvent->isDefaultConstructed()); + bool PiEventExpected = !DepEvent->isDefaultConstructed(); if (auto *DepCmd = static_cast(DepEvent->getCommand())) PiEventExpected &= DepCmd->producesPiEvent(); + else + PiEventExpected &= DepEvent->getHandle() != nullptr; if (!PiEventExpected) { // call to waitInternal() is in waitForPreparedHostEvents() as it's called @@ -774,7 +775,7 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, Command *ConnectionCmd = nullptr; ContextImplPtr DepEventContext = DepEvent->getContextImpl(); - // If contexts don't match we'll connect them using host task + // If contexts don't match we'll connect them using host task. if (DepEventContext != WorkerContext && WorkerContext) { Scheduler::GraphBuilder &GB = Scheduler::getInstance().MGraphBuilder; ConnectionCmd = GB.connectDepEvent(this, DepEvent, Dep, ToCleanUp); @@ -785,9 +786,9 @@ } ContextImplPtr Command::getWorkerContext() const { - if (!MQueue) + if (!MWorkerQueue) return nullptr; - return MQueue->getContextImplPtr(); + return MWorkerQueue->getContextImplPtr(); } bool Command::producesPiEvent() const { return true; } @@ -928,7 +929,7 @@ bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking, else { MEvent->setEnqueued(); if (MShouldCompleteEventIfPossible && !MEvent->isDiscarded() && - (MEvent->isHost() || MEvent->getHandle() == nullptr)) + (MEvent->getHandle() == nullptr)) MEvent->setComplete(); // Consider the command is successfully enqueued if return code is @@ -1059,8 +1060,6 @@ void AllocaCommandBase::emitInstrumentationData() { #endif } -bool AllocaCommandBase::producesPiEvent() const { return false; } - bool AllocaCommandBase::supportsPostEnqueueCleanup() const { return false; } bool AllocaCommandBase::readyForCleanup() const { return false; } @@ -1092,6 +1091,14 @@ void AllocaCommand::emitInstrumentationData() { #endif } +bool AllocaCommand::producesPiEvent() const { + // For reference, see enqueueImp(). + auto TypedSyclMemObj = static_cast(getSYCLMemObj()); + // Event presence implies interop context existence. + return (TypedSyclMemObj->hasInteropEvent() && + (getContext(MQueue) == TypedSyclMemObj->getInteropContext())); +} + ur_result_t AllocaCommand::enqueueImp() { waitForPreparedHostEvents(); std::vector EventImpls = MPreparedDepsEvents; @@ -1104,7 +1111,8 @@ ur_result_t AllocaCommand::enqueueImp() { if (!MQueue) { // Do not need to make allocation if we have a linked device allocation Command::waitForEvents(MQueue, EventImpls, UREvent); - MEvent->setHandle(UREvent); + assert(UREvent
== nullptr && "AllocaCommand: waitForEvents without Queue " + "shouldn't produce native event."); return UR_RESULT_SUCCESS; } @@ -1119,6 +1127,10 @@ ur_result_t AllocaCommand::enqueueImp() { Result != UR_RESULT_SUCCESS) return Result; + assert((!!UREvent == producesPiEvent()) && + "AllocaCommand: native event is expected only when it is for interop " + "memory object with native event provided."); + MEvent->setHandle(UREvent); return UR_RESULT_SUCCESS; } @@ -1161,6 +1173,8 @@ AllocaSubBufCommand::AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req, ToEnqueue.push_back(ConnectionCmd); } +bool AllocaSubBufCommand::producesPiEvent() const { return false; } + void AllocaSubBufCommand::emitInstrumentationData() { #ifdef XPTI_ENABLE_INSTRUMENTATION if (!xptiCheckTraceEnabled(MStreamID)) @@ -1200,8 +1214,8 @@ ur_result_t AllocaSubBufCommand::enqueueImp() { MRequirement.MAccessRange, std::move(EventImpls), UREvent); Result != UR_RESULT_SUCCESS) return Result; - - MEvent->setHandle(UREvent); + assert(UREvent == nullptr && + "AllocaSubBufCommand: it shouldn't produce native event."); XPTIRegistry::bufferAssociateNotification(MParentAlloca->getSYCLMemObj(), MMemAllocation); @@ -1312,6 +1326,7 @@ ur_result_t ReleaseCommand::enqueueImp() { EventImpls.push_back(UnmapEventImpl); } ur_event_handle_t UREvent = nullptr; + // Synchronous wait if (SkipRelease) Command::waitForEvents(MQueue, EventImpls, UREvent); else { @@ -1322,7 +1337,8 @@ ur_result_t ReleaseCommand::enqueueImp() { Result != UR_RESULT_SUCCESS) return Result; } - MEvent->setHandle(UREvent); + assert(!UREvent && "ReleaseCommand: release shouldn't produce native event."); + return UR_RESULT_SUCCESS; } @@ -1392,6 +1408,7 @@ ur_result_t MapMemObject::enqueueImp() { MSrcReq.MElemSize, std::move(RawEvents), UREvent); Result != UR_RESULT_SUCCESS) return Result; + assert(UREvent && "MapMemObject command must produce native event"); MEvent->setHandle(UREvent); return UR_RESULT_SUCCESS; @@ -1440,7 +1457,8 @@ void UnMapMemObject::emitInstrumentationData() { #endif } -bool UnMapMemObject::producesPiEvent() const { +static bool checkNativeEventForWA(const QueueImplPtr &Queue, + const ur_event_handle_t &NativeEvent) { // TODO remove this workaround once the batching issue is addressed in Level // Zero adapter. // Consider the following scenario on Level Zero: @@ -1456,9 +1474,13 @@ bool UnMapMemObject::producesPiEvent() const { // an event waitlist and Level Zero adapter attempts to batch these commands, // so the execution of kernel B starts only on step 4. This workaround // restores the old behavior in this case until this is resolved. 
- return MQueue && (MQueue->getDeviceImplPtr()->getBackend() != - backend::ext_oneapi_level_zero || - MEvent->getHandle() != nullptr); + return Queue && (Queue->getDeviceImplPtr()->getBackend() != + backend::ext_oneapi_level_zero || + NativeEvent != nullptr); +} + +bool UnMapMemObject::producesPiEvent() const { + return checkNativeEventForWA(MQueue, MEvent->getHandle()); } ur_result_t UnMapMemObject::enqueueImp() { @@ -1475,6 +1497,8 @@ ur_result_t UnMapMemObject::enqueueImp() { Result != UR_RESULT_SUCCESS) return Result; + assert((!!UREvent == checkNativeEventForWA(MQueue, UREvent)) && + "UnMapMemObject command must produce native event"); MEvent->setHandle(UREvent); return UR_RESULT_SUCCESS; @@ -1540,32 +1564,9 @@ void MemCpyCommand::emitInstrumentationData() { #endif } -ContextImplPtr MemCpyCommand::getWorkerContext() const { - if (!MWorkerQueue) - return nullptr; - return MWorkerQueue->getContextImplPtr(); -} - bool MemCpyCommand::producesPiEvent() const { - // TODO remove this workaround once the batching issue is addressed in Level - // Zero adapter. - // Consider the following scenario on Level Zero: - // 1. Kernel A, which uses buffer A, is submitted to queue A. - // 2. Kernel B, which uses buffer B, is submitted to queue B. - // 3. queueA.wait(). - // 4. queueB.wait(). - // DPCPP runtime used to treat unmap/write commands for buffer A/B as host - // dependencies (i.e. they were waited for prior to enqueueing any command - // that's dependent on them). This allowed Level Zero adapter to detect that - // each queue is idle on steps 1/2 and submit the command list right away. - // This is no longer the case since we started passing these dependencies in - // an event waitlist and Level Zero adapter attempts to batch these commands, - // so the execution of kernel B starts only on step 4. This workaround - // restores the old behavior in this case until this is resolved. - return !MQueue || - MQueue->getDeviceImplPtr()->getBackend() != - backend::ext_oneapi_level_zero || - MEvent->getHandle() != nullptr; + return checkNativeEventForWA(MSrcQueue ? MSrcQueue : MQueue, + MEvent->getHandle()); } ur_result_t MemCpyCommand::enqueueImp() { @@ -1587,6 +1588,9 @@ ur_result_t MemCpyCommand::enqueueImp() { MEvent); Result != UR_RESULT_SUCCESS) return Result; + assert((!!UREvent == + checkNativeEventForWA(MSrcQueue ? MSrcQueue : MQueue, UREvent)) && + "MemCpyCommand must produce native event"); MEvent->setHandle(UREvent); return UR_RESULT_SUCCESS; @@ -1639,7 +1643,8 @@ ur_result_t UpdateHostRequirementCommand::enqueueImp() { std::vector EventImpls = MPreparedDepsEvents; ur_event_handle_t UREvent = nullptr; Command::waitForEvents(MQueue, EventImpls, UREvent); - MEvent->setHandle(UREvent); + assert((!!UREvent == producesPiEvent()) && + "UpdateHostRequirementCommand doesn't produce native event"); assert(MSrcAllocaCmd && "Expected valid alloca command"); assert(MSrcAllocaCmd->getMemAllocation() && "Expected valid source pointer"); @@ -1672,25 +1677,17 @@ void UpdateHostRequirementCommand::printDot(std::ostream &Stream) const { } } -MemCpyCommandHost::MemCpyCommandHost(Requirement SrcReq, - AllocaCommandBase *SrcAllocaCmd, - Requirement DstReq, void **DstPtr, - QueueImplPtr SrcQueue, - QueueImplPtr DstQueue) - : Command(CommandType::COPY_MEMORY, std::move(DstQueue)), - MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)), - MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)), MDstPtr(DstPtr) { - if (MSrcQueue) { - MEvent->setContextImpl(MSrcQueue->getContextImplPtr()); - } - - MWorkerQueue = !MQueue ? 
MSrcQueue : MQueue; - MEvent->setWorkerQueue(MWorkerQueue); - +MemCpyToHostCommand::MemCpyToHostCommand(Requirement SrcReq, + AllocaCommandBase *SrcAllocaCmd, + Requirement DstReq, void **DstPtr, + QueueImplPtr SrcQueue) + : Command(CommandType::COPY_MEMORY, std::move(SrcQueue)), + MSrcReq(std::move(SrcReq)), MSrcAllocaCmd(SrcAllocaCmd), + MDstReq(std::move(DstReq)), MDstPtr(DstPtr) { emitInstrumentationDataProxy(); } -void MemCpyCommandHost::emitInstrumentationData() { +void MemCpyToHostCommand::emitInstrumentationData() { #ifdef XPTI_ENABLE_INSTRUMENTATION if (!xptiCheckTraceEnabled(MStreamID)) return; @@ -1704,9 +1701,8 @@ void MemCpyCommandHost::emitInstrumentationData() { xpti::addMetadata(CmdTraceEvent, "memory_object", reinterpret_cast(MAddress)); xpti::addMetadata(CmdTraceEvent, "copy_from", - MSrcQueue ? deviceToID(MSrcQueue->get_device()) : 0); - xpti::addMetadata(CmdTraceEvent, "copy_to", MQueue ? deviceToID(MQueue->get_device()) : 0); + xpti::addMetadata(CmdTraceEvent, "copy_to", 0); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); @@ -1714,23 +1710,17 @@ void MemCpyCommandHost::emitInstrumentationData() { #endif } -ContextImplPtr MemCpyCommandHost::getWorkerContext() const { - if (!MWorkerQueue) - return nullptr; - return MWorkerQueue->getContextImplPtr(); -} - -ur_result_t MemCpyCommandHost::enqueueImp() { +ur_result_t MemCpyToHostCommand::enqueueImp() { const QueueImplPtr &Queue = MWorkerQueue; waitForPreparedHostEvents(); std::vector EventImpls = MPreparedDepsEvents; std::vector RawEvents = getUrEvents(EventImpls); - ur_event_handle_t UREvent = nullptr; // Omit copying if mode is discard one. // TODO: Handle this at the graph building time by, for example, creating // empty node instead of memcpy. 
- if (MDstReq.MAccessMode == access::mode::discard_read_write || + if (ur_event_handle_t UREvent = nullptr; + MDstReq.MAccessMode == access::mode::discard_read_write || MDstReq.MAccessMode == access::mode::discard_write) { Command::waitForEvents(Queue, EventImpls, UREvent); @@ -1738,17 +1728,18 @@ ur_result_t MemCpyCommandHost::enqueueImp() { flushCrossQueueDeps(EventImpls, MWorkerQueue); - + ur_event_handle_t UREvent = nullptr; if (auto Result = callMemOpHelper( MemoryManager::copy, MSrcAllocaCmd->getSYCLMemObj(), - MSrcAllocaCmd->getMemAllocation(), MSrcQueue, MSrcReq.MDims, + MSrcAllocaCmd->getMemAllocation(), MQueue, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange, MSrcReq.MOffset, - MSrcReq.MElemSize, *MDstPtr, MQueue, MDstReq.MDims, + MSrcReq.MElemSize, *MDstPtr, nullptr, MDstReq.MDims, MDstReq.MMemoryRange, MDstReq.MAccessRange, MDstReq.MOffset, MDstReq.MElemSize, std::move(RawEvents), UREvent, MEvent); Result != UR_RESULT_SUCCESS) return Result; - + assert((!!UREvent == producesPiEvent()) && + "MemCpyToHostCommand must produce native event"); MEvent->setHandle(UREvent); return UR_RESULT_SUCCESS; } @@ -1761,7 +1752,8 @@ ur_result_t EmptyCommand::enqueueImp() { waitForPreparedHostEvents(); ur_event_handle_t UREvent = nullptr; waitForEvents(MQueue, MPreparedDepsEvents, UREvent); - MEvent->setHandle(UREvent); + assert((!!UREvent == producesPiEvent()) && + "EmptyCommand doesn't produce native event"); return UR_RESULT_SUCCESS; } @@ -1826,7 +1818,7 @@ void EmptyCommand::printDot(std::ostream &Stream) const { bool EmptyCommand::producesPiEvent() const { return false; } -void MemCpyCommandHost::printDot(std::ostream &Stream) const { +void MemCpyToHostCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#B6A2EB\", label=\""; Stream << "ID = " << this << "\\n"; @@ -1844,9 +1836,8 @@ void MemCpyCommandHost::printDot(std::ostream &Stream) const { } UpdateHostRequirementCommand::UpdateHostRequirementCommand( - QueueImplPtr Queue, Requirement Req, AllocaCommandBase *SrcAllocaCmd, - void **DstPtr) - : Command(CommandType::UPDATE_REQUIREMENT, std::move(Queue)), + Requirement Req, AllocaCommandBase *SrcAllocaCmd, void **DstPtr) - : Command(CommandType::UPDATE_REQUIREMENT, nullptr), MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(Req)), MDstPtr(DstPtr) { emitInstrumentationDataProxy(); @@ -1951,7 +1942,11 @@ ExecCGCommand::ExecCGCommand( if (MCommandGroup->getType() == detail::CGType::CodeplayHostTask) { MEvent->setSubmittedQueue( static_cast(MCommandGroup.get())->MQueue); + MWorkerQueue = nullptr; + MEvent->setWorkerQueue(MWorkerQueue); + MEvent->markAsHost(); } + if (MCommandGroup->getType() == detail::CGType::ProfilingTag) MEvent->markAsProfilingTagEvent(); @@ -3137,6 +3132,8 @@ ur_result_t ExecCGCommand::enqueueImp() { } else { return enqueueImpQueue(); } + assert((!!MEvent->getHandle() == producesPiEvent()) && + "ExecCGCommand must produce native event"); } ur_result_t ExecCGCommand::enqueueImpQueue() { @@ -3430,7 +3427,14 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { // Host task is executed asynchronously so we should record where it was // submitted to report exception origin properly.
copySubmissionCodeLocation(); - + if (producesPiEvent()) { + auto TempContext = getContext(HostTask->MQueue)->getHandleRef(); + ur_event_native_properties_t NativeProperties{}; + auto &Adapter = MQueue->getAdapter(); + Adapter->call( + 0, TempContext, &NativeProperties, &UREvent); + MEvent->setHandle(UREvent); + } queue_impl::getThreadPool().submit( DispatchHostTask(this, std::move(ReqToMem), std::move(ReqUrMem))); @@ -3788,7 +3792,8 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { bool ExecCGCommand::producesPiEvent() const { return !MCommandBuffer && - MCommandGroup->getType() != CGType::CodeplayHostTask; + !( + MCommandGroup->getType() == CGType::CodeplayHostTask && !MQueue /* MQueue is set only when we have native sync with host task */); } bool ExecCGCommand::supportsPostEnqueueCleanup() const { @@ -3811,11 +3816,17 @@ UpdateCommandBufferCommand::UpdateCommandBufferCommand( : Command(CommandType::UPDATE_CMD_BUFFER, Queue), MGraph(Graph), MNodes(Nodes) {} +bool UpdateCommandBufferCommand::producesPiEvent() const { + return !MPreparedDepsEvents.empty(); +} + ur_result_t UpdateCommandBufferCommand::enqueueImp() { waitForPreparedHostEvents(); std::vector EventImpls = MPreparedDepsEvents; ur_event_handle_t UREvent = nullptr; Command::waitForEvents(MQueue, EventImpls, UREvent); + assert((!!UREvent == producesPiEvent()) && + "UpdateCommandBufferCommand produces native event"); MEvent->setHandle(UREvent); auto CheckAndFindAlloca = [](Requirement *Req, const DepDesc &Dep) { @@ -3885,7 +3896,6 @@ void UpdateCommandBufferCommand::printDot(std::ostream &Stream) const { } void UpdateCommandBufferCommand::emitInstrumentationData() {} -bool UpdateCommandBufferCommand::producesPiEvent() const { return false; } } // namespace detail } // namespace _V1 diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 12e605ac626d5..928a0ab99ec4e 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -168,7 +168,9 @@ class Command { } // Shows that command could be enqueued, but is blocking enqueue of all // commands depending on it. Regular usage - host task. - bool isBlocking() const { return isHostTask() && !MEvent->isCompleted(); } + bool isBlocking() const { + return isHostTask() && !producesPiEvent() && !MEvent->isCompleted(); + } void addBlockedUserUnique(const EventImplPtr &NewUser) { if (std::find(MBlockedUsers.begin(), MBlockedUsers.end(), NewUser) != @@ -225,7 +227,8 @@ class Command { /// Get the context of the queue this command will be submitted to. Could /// differ from the context of MQueue for memory copy commands. - virtual ContextImplPtr getWorkerContext() const; + ContextImplPtr getWorkerContext() const; + QueueImplPtr getWorkerQueue() const { return MWorkerQueue; } /// Returns true iff the command produces a UR event on non-host devices. 
virtual bool producesPiEvent() const; @@ -265,6 +268,7 @@ class Command { std::vector &MPreparedDepsEvents; std::vector &MPreparedHostDepsEvents; + // Event is nullptr when Queue == nullptr too void waitForEvents(QueueImplPtr Queue, std::vector &RawEvents, ur_event_handle_t &Event); @@ -460,8 +464,6 @@ class AllocaCommandBase : public Command { void emitInstrumentationData() override; - bool producesPiEvent() const final; - bool supportsPostEnqueueCleanup() const final; bool readyForCleanup() const final; @@ -500,6 +502,7 @@ class AllocaCommand : public AllocaCommandBase { void *getMemAllocation() const final { return MMemAllocation; } void printDot(std::ostream &Stream) const final; void emitInstrumentationData() override; + bool producesPiEvent() const final; private: ur_result_t enqueueImp() final; @@ -521,6 +524,7 @@ class AllocaSubBufCommand : public AllocaCommandBase { void printDot(std::ostream &Stream) const final; AllocaCommandBase *getParentAlloca() { return MParentAlloca; } void emitInstrumentationData() override; + bool producesPiEvent() const final; private: ur_result_t enqueueImp() final; @@ -577,7 +581,6 @@ class MemCpyCommand : public Command { void printDot(std::ostream &Stream) const final; const Requirement *getRequirement() const final { return &MDstReq; } void emitInstrumentationData() final; - ContextImplPtr getWorkerContext() const final; bool producesPiEvent() const final; private: @@ -592,21 +595,19 @@ class MemCpyCommand : public Command { /// The mem copy host command enqueues memory copy between two instances of /// memory object. -class MemCpyCommandHost : public Command { +class MemCpyToHostCommand : public Command { public: - MemCpyCommandHost(Requirement SrcReq, AllocaCommandBase *SrcAllocaCmd, - Requirement DstReq, void **DstPtr, QueueImplPtr SrcQueue, - QueueImplPtr DstQueue); + MemCpyToHostCommand(Requirement SrcReq, AllocaCommandBase *SrcAllocaCmd, + Requirement DstReq, void **DstPtr, QueueImplPtr SrcQueue); void printDot(std::ostream &Stream) const final; const Requirement *getRequirement() const final { return &MDstReq; } void emitInstrumentationData() final; - ContextImplPtr getWorkerContext() const final; + bool producesPiEvent() const final { return !!MQueue; } private: ur_result_t enqueueImp() final; - QueueImplPtr MSrcQueue; Requirement MSrcReq; AllocaCommandBase *MSrcAllocaCmd = nullptr; Requirement MDstReq; @@ -694,12 +695,13 @@ std::pair emitKernelInstrumentationData( class UpdateHostRequirementCommand : public Command { public: - UpdateHostRequirementCommand(QueueImplPtr Queue, Requirement Req, - AllocaCommandBase *SrcAllocaCmd, void **DstPtr); + UpdateHostRequirementCommand(Requirement Req, AllocaCommandBase *SrcAllocaCmd, + void **DstPtr); void printDot(std::ostream &Stream) const final; const Requirement *getRequirement() const final { return &MDstReq; } void emitInstrumentationData() final; + bool producesPiEvent() const final { return false; } private: ur_result_t enqueueImp() final; diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 85bc93f7d6a9a..7b99f3e0a0917 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -270,18 +270,16 @@ void Scheduler::GraphBuilder::addNodeToLeaves( } UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd( - MemObjRecord *Record, Requirement *Req, const QueueImplPtr &Queue, - std::vector &ToEnqueue) { - auto Context = queue_impl::getContext(Queue); - 
AllocaCommandBase *AllocaCmd = findAllocaForReq(Record, Req, Context); + MemObjRecord *Record, Requirement *Req, std::vector &ToEnqueue) { + AllocaCommandBase *AllocaCmd = findAllocaForReq(Record, Req, nullptr); assert(AllocaCmd && "There must be alloca for requirement!"); UpdateHostRequirementCommand *UpdateCommand = - new UpdateHostRequirementCommand(Queue, *Req, AllocaCmd, &Req->MData); + new UpdateHostRequirementCommand(*Req, AllocaCmd, &Req->MData); // Need copy of requirement because after host accessor destructor call // dependencies become invalid if requirement is stored by pointer. const Requirement *StoredReq = UpdateCommand->getRequirement(); - std::set Deps = findDepsForReq(Record, Req, Context); + std::set Deps = findDepsForReq(Record, Req, nullptr); std::vector ToCleanUp; for (Command *Dep : Deps) { Command *ConnCmd = @@ -306,10 +304,10 @@ static Command *insertMapUnmapForLinkedCmds(AllocaCommandBase *AllocaCmdSrc, assert(AllocaCmdSrc->MIsActive && "Expected source alloca command to be active"); - if (!AllocaCmdSrc->getQueue()) { + if (!AllocaCmdSrc->getWorkerQueue()) { UnMapMemObject *UnMapCmd = new UnMapMemObject( AllocaCmdDst, *AllocaCmdDst->getRequirement(), - &AllocaCmdSrc->MMemAllocation, AllocaCmdDst->getQueue()); + &AllocaCmdSrc->MMemAllocation, AllocaCmdDst->getWorkerQueue()); std::swap(AllocaCmdSrc->MIsActive, AllocaCmdDst->MIsActive); @@ -318,7 +316,7 @@ static Command *insertMapUnmapForLinkedCmds(AllocaCommandBase *AllocaCmdSrc, MapMemObject *MapCmd = new MapMemObject( AllocaCmdSrc, *AllocaCmdSrc->getRequirement(), - &AllocaCmdDst->MMemAllocation, AllocaCmdSrc->getQueue(), MapMode); + &AllocaCmdDst->MMemAllocation, AllocaCmdSrc->getWorkerQueue(), MapMode); std::swap(AllocaCmdSrc->MIsActive, AllocaCmdDst->MIsActive); @@ -351,9 +349,10 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( // current context, need to find a parent alloca command for it (it must be // there) auto IsSuitableAlloca = [Record](AllocaCommandBase *AllocaCmd) { - bool Res = isOnSameContext(Record->MCurContext, AllocaCmd->getQueue()) && - // Looking for a parent buffer alloca command - AllocaCmd->getType() == Command::CommandType::ALLOCA; + bool Res = + isOnSameContext(Record->MCurContext, AllocaCmd->getWorkerQueue()) && + // Looking for a parent buffer alloca command + AllocaCmd->getType() == Command::CommandType::ALLOCA; return Res; }; const auto It = @@ -391,10 +390,10 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( } else { // Full copy of buffer is needed to avoid loss of data that may be caused // by copying specific range from host to device and backwards. 
- NewCmd = - new MemCpyCommand(*AllocaCmdSrc->getRequirement(), AllocaCmdSrc, - *AllocaCmdDst->getRequirement(), AllocaCmdDst, - AllocaCmdSrc->getQueue(), AllocaCmdDst->getQueue()); + NewCmd = new MemCpyCommand(*AllocaCmdSrc->getRequirement(), AllocaCmdSrc, + *AllocaCmdDst->getRequirement(), AllocaCmdDst, + AllocaCmdSrc->getWorkerQueue(), + AllocaCmdDst->getWorkerQueue()); } } std::vector ToCleanUp; @@ -415,7 +414,7 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( Command *Scheduler::GraphBuilder::remapMemoryObject( MemObjRecord *Record, Requirement *Req, AllocaCommandBase *HostAllocaCmd, std::vector &ToEnqueue) { - assert(!HostAllocaCmd->getQueue() && "Host alloca command expected"); + assert(!HostAllocaCmd->getWorkerQueue() && "Host alloca command expected"); assert(HostAllocaCmd->MIsActive && "Active alloca command expected"); AllocaCommandBase *LinkedAllocaCmd = HostAllocaCmd->MLinkedAllocaCmd; @@ -425,15 +424,16 @@ Command *Scheduler::GraphBuilder::remapMemoryObject( UnMapMemObject *UnMapCmd = new UnMapMemObject( LinkedAllocaCmd, *LinkedAllocaCmd->getRequirement(), - &HostAllocaCmd->MMemAllocation, LinkedAllocaCmd->getQueue()); + &HostAllocaCmd->MMemAllocation, LinkedAllocaCmd->getWorkerQueue()); // Map write only as read-write access::mode MapMode = Req->MAccessMode; if (MapMode == access::mode::write) MapMode = access::mode::read_write; - MapMemObject *MapCmd = new MapMemObject( - LinkedAllocaCmd, *LinkedAllocaCmd->getRequirement(), - &HostAllocaCmd->MMemAllocation, LinkedAllocaCmd->getQueue(), MapMode); + MapMemObject *MapCmd = + new MapMemObject(LinkedAllocaCmd, *LinkedAllocaCmd->getRequirement(), + &HostAllocaCmd->MMemAllocation, + LinkedAllocaCmd->getWorkerQueue(), MapMode); std::vector ToCleanUp; for (Command *Dep : Deps) { @@ -474,15 +474,15 @@ Scheduler::GraphBuilder::addCopyBack(Requirement *Req, AllocaCommandBase *SrcAllocaCmd = findAllocaForReq(Record, Req, Record->MCurContext); - auto MemCpyCmdUniquePtr = std::make_unique( + auto MemCpyCmdUniquePtr = std::make_unique( *SrcAllocaCmd->getRequirement(), SrcAllocaCmd, *Req, &Req->MData, - SrcAllocaCmd->getQueue(), nullptr); + SrcAllocaCmd->getWorkerQueue()); if (!MemCpyCmdUniquePtr) throw exception(make_error_code(errc::memory_allocation), "Out of host memory"); - MemCpyCommandHost *MemCpyCmd = MemCpyCmdUniquePtr.release(); + MemCpyToHostCommand *MemCpyCmd = MemCpyCmdUniquePtr.release(); std::vector ToCleanUp; for (Command *Dep : Deps) { @@ -524,7 +524,7 @@ Scheduler::GraphBuilder::addHostAccessor(Requirement *Req, AllocaCommandBase *HostAllocaCmd = getOrCreateAllocaForReq(Record, Req, nullptr, ToEnqueue); - if (isOnSameContext(Record->MCurContext, HostAllocaCmd->getQueue())) { + if (isOnSameContext(Record->MCurContext, HostAllocaCmd->getWorkerQueue())) { if (!isAccessModeAllowed(Req->MAccessMode, Record->MHostAccess)) { remapMemoryObject(Record, Req, Req->MIsSubBuffer ? (static_cast( @@ -536,8 +536,7 @@ Scheduler::GraphBuilder::addHostAccessor(Requirement *Req, } else insertMemoryMove(Record, Req, nullptr, ToEnqueue); - Command *UpdateHostAccCmd = - insertUpdateHostReqCmd(Record, Req, nullptr, ToEnqueue); + Command *UpdateHostAccCmd = insertUpdateHostReqCmd(Record, Req, ToEnqueue); // Need empty command to be blocked until host accessor is destructed EmptyCommand *EmptyCmd = addEmptyCmd( @@ -606,7 +605,7 @@ Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record, // Going through copying memory between contexts is not supported. 
if (Dep.MDepCommand) { - auto DepQueue = Dep.MDepCommand->getQueue(); + auto DepQueue = Dep.MDepCommand->getWorkerQueue(); CanBypassDep &= isOnSameContext(Context, DepQueue); } @@ -647,7 +646,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::findAllocaForReq( bool AllowConst) { auto IsSuitableAlloca = [&Context, Req, AllowConst](AllocaCommandBase *AllocaCmd) { - bool Res = isOnSameContext(Context, AllocaCmd->getQueue()); + bool Res = isOnSameContext(Context, AllocaCmd->getWorkerQueue()); if (IsSuitableSubReq(Req)) { const Requirement *TmpReq = AllocaCmd->getRequirement(); Res &= AllocaCmd->getType() == Command::CommandType::ALLOCA_SUB_BUF; @@ -921,20 +920,20 @@ static void combineAccessModesOfReqs(std::vector &Reqs) { } Command *Scheduler::GraphBuilder::addCG( - std::unique_ptr CommandGroup, const QueueImplPtr &Queue, - std::vector &ToEnqueue, bool EventNeeded, - ur_exp_command_buffer_handle_t CommandBuffer, + std::unique_ptr CommandGroup, + const QueueImplPtr &SubmittedQueue, std::vector &ToEnqueue, + bool EventNeeded, ur_exp_command_buffer_handle_t CommandBuffer, const std::vector &Dependencies) { std::vector &Reqs = CommandGroup->getRequirements(); std::vector &Events = CommandGroup->getEvents(); - auto NewCmd = std::make_unique(std::move(CommandGroup), Queue, - EventNeeded, CommandBuffer, - std::move(Dependencies)); - + auto NewCmd = std::make_unique( + std::move(CommandGroup), SubmittedQueue, EventNeeded, CommandBuffer, + std::move(Dependencies)); if (!NewCmd) throw exception(make_error_code(errc::memory_allocation), "Out of host memory"); + auto Queue = NewCmd->getWorkerQueue(); bool isInteropTask = isInteropHostTask(NewCmd.get()); @@ -1225,7 +1224,7 @@ Command *Scheduler::GraphBuilder::connectDepEvent( try { std::shared_ptr HT(new detail::HostTask); std::unique_ptr ConnectCG(new detail::CGHostTask( - std::move(HT), /* Queue = */ Cmd->getQueue(), /* Context = */ {}, + std::move(HT), /* Queue = */ Cmd->getWorkerQueue(), /* Context = */ {}, /* Args = */ {}, detail::CG::StorageInitHelper( /* ArgsStorage = */ {}, /* AccStorage = */ {}, diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index c36ff2acbb21a..4584ab7acdb04 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -121,8 +121,10 @@ EventImplPtr Scheduler::addCG( MGraphBuilder.addCGUpdateHost(std::move(CommandGroup), AuxiliaryCmds); break; case CGType::CodeplayHostTask: { - NewCmd = MGraphBuilder.addCG(std::move(CommandGroup), nullptr, - AuxiliaryCmds, EventNeeded); + NewCmd = + MGraphBuilder.addCG(std::move(CommandGroup), + Queue->nativeHostTaskHandling() ? 
Queue : nullptr, + AuxiliaryCmds, EventNeeded); break; } default: @@ -467,12 +469,17 @@ void Scheduler::NotifyHostTaskCompletion(Command *Cmd) { ToCleanUp.push_back(Cmd); Cmd->MMarkedForCleanup = true; } - { - std::lock_guard Guard(Cmd->MBlockedUsersMutex); - // update self-event status - CmdEvent->setComplete(); + + if (auto NativeEvent = CmdEvent->getHandle()) { + QueueImpl->getAdapter()->call(NativeEvent); + } else { + { + std::lock_guard Guard(Cmd->MBlockedUsersMutex); + // update self-event status + CmdEvent->setComplete(); + } + Scheduler::enqueueUnblockedCommands(Cmd->MBlockedUsers, Lock, ToCleanUp); } - Scheduler::enqueueUnblockedCommands(Cmd->MBlockedUsers, Lock, ToCleanUp); } QueueImpl->revisitUnenqueuedCommandsState(CmdEvent); diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 43c70ea55e2fd..a87005b60da29 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -678,7 +678,6 @@ class Scheduler { UpdateHostRequirementCommand * insertUpdateHostReqCmd(MemObjRecord *Record, Requirement *Req, - const QueueImplPtr &Queue, std::vector &ToEnqueue); /// Finds dependencies for the requirement. diff --git a/sycl/source/detail/sycl_mem_obj_t.cpp b/sycl/source/detail/sycl_mem_obj_t.cpp index f289c1e62b371..3381261a5ab5a 100644 --- a/sycl/source/detail/sycl_mem_obj_t.cpp +++ b/sycl/source/detail/sycl_mem_obj_t.cpp @@ -246,6 +246,10 @@ void SYCLMemObjT::handleWriteAccessorCreation() { } } +bool SYCLMemObjT::hasInteropEvent() const { + return MInteropEvent && MInteropEvent->getHandle(); +} + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/sycl_mem_obj_t.hpp b/sycl/source/detail/sycl_mem_obj_t.hpp index cd3a717fdb8cf..4003aacd198f4 100644 --- a/sycl/source/detail/sycl_mem_obj_t.hpp +++ b/sycl/source/detail/sycl_mem_obj_t.hpp @@ -159,6 +159,8 @@ class SYCLMemObjT : public SYCLMemObjI { MHostPtrProvided = true; } + bool hasInteropEvent() const; + protected: void updateHostMemory(void *const Ptr); diff --git a/sycl/unittests/queue/ShortcutFunctions.cpp b/sycl/unittests/queue/ShortcutFunctions.cpp index f6b8dd99dd2f5..c573bc069ff22 100644 --- a/sycl/unittests/queue/ShortcutFunctions.cpp +++ b/sycl/unittests/queue/ShortcutFunctions.cpp @@ -34,8 +34,10 @@ ur_result_t redefinedEnqueueMemBufferWrite(void *) { return UR_RESULT_SUCCESS; } -ur_result_t redefinedEnqueueMemBufferRead(void *) { +ur_result_t redefinedEnqueueMemBufferRead(void *pParams) { TestContext->BufferReadCalled = true; + auto params = *static_cast(pParams); + **params.pphEvent = mock::createDummyHandle(); return UR_RESULT_SUCCESS; } diff --git a/sycl/unittests/scheduler/CMakeLists.txt b/sycl/unittests/scheduler/CMakeLists.txt index 0bb1905ba8e6e..ef0ca5ff98b21 100644 --- a/sycl/unittests/scheduler/CMakeLists.txt +++ b/sycl/unittests/scheduler/CMakeLists.txt @@ -1,3 +1,4 @@ +add_definitions(-g -O0) add_sycl_unittest(SchedulerTests OBJECT BlockedCommands.cpp Commands.cpp diff --git a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp index cf0c91e1478c5..a03a75cffbf99 100644 --- a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp @@ -9,6 +9,7 @@ #include "SchedulerTest.hpp" #include "SchedulerTestUtils.hpp" +#include #include #include @@ -23,19 +24,55 @@ using namespace sycl; size_t GEventsWaitCounter = 0; +auto DummyHostTaskEvent = mock::createDummyHandle(); +bool 
HostTaskReady = false; +std::mutex HostTaskMutex; +std::condition_variable HostTaskCV; + +size_t EventSignaled = 0; +size_t EventCreated = 0; + inline ur_result_t redefinedEventsWait(void *pParams) { auto params = *static_cast(pParams); if (*params.pnumEvents > 0) { GEventsWaitCounter++; + + if (**params.pphEventWaitList == DummyHostTaskEvent) { + std::unique_lock lk(HostTaskMutex); + HostTaskCV.wait(lk, [] { return HostTaskReady; }); + } + } + + return UR_RESULT_SUCCESS; +} + +inline ur_result_t redefinedEventHostSignal(void *pParams) { + EventSignaled = true; + { + std::unique_lock lk(HostTaskMutex); + HostTaskReady = true; } + HostTaskCV.notify_one(); + return UR_RESULT_SUCCESS; +} + +inline ur_result_t redefinedEventCreateWithNativeHandle(void *pParams) { + EventCreated = true; + auto params = + *static_cast(pParams); + **params.pphEvent = DummyHostTaskEvent; return UR_RESULT_SUCCESS; } -TEST_F(SchedulerTest, InOrderQueueHostTaskDeps) { +template void InOrderQueueHostTaskDepsTestBody() { GEventsWaitCounter = 0; - sycl::unittest::UrMock<> Mock; + sycl::unittest::UrMock Mock; sycl::platform Plt = sycl::platform(); mock::getCallbacks().set_before_callback("urEventWait", &redefinedEventsWait); + mock::getCallbacks().set_replace_callback("urEventHostSignal", + &redefinedEventHostSignal); + mock::getCallbacks().set_replace_callback( + "urEventCreateWithNativeHandle", &redefinedEventCreateWithNativeHandle); context Ctx{Plt}; queue InOrderQueue{Ctx, default_selector_v, property::queue::in_order()}; @@ -45,8 +82,21 @@ TEST_F(SchedulerTest, InOrderQueueHostTaskDeps) { [&](sycl::handler &CGH) { CGH.memset(buf, 0, sizeof(buf[0])); }); InOrderQueue.submit([&](sycl::handler &CGH) { CGH.host_task([=] {}); }) .wait(); + EXPECT_EQ(EventCreated, Backend == sycl::backend::ext_oneapi_level_zero); + EXPECT_EQ(GEventsWaitCounter, + 1u + size_t(Backend == sycl::backend::ext_oneapi_level_zero)); + EXPECT_EQ(EventSignaled, Backend == sycl::backend::ext_oneapi_level_zero); +} + +TEST_F(SchedulerTest, InOrderQueueHostTaskDepsOCL) { + InOrderQueueHostTaskDepsTestBody(); +} - EXPECT_EQ(GEventsWaitCounter, 1u); +TEST_F(SchedulerTest, InOrderQueueHostTaskDepsL0) { + std::function DoNothing = [] {}; + unittest::ScopedEnvVar HostTaskViaNativeEvent{"SYCL_ENABLE_USER_EVENTS_PATH", + "1", DoNothing}; + InOrderQueueHostTaskDepsTestBody(); } enum class CommandType { KERNEL = 1, MEMSET = 2 }; diff --git a/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp b/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp index 1a5fa726170b8..371ad6a08e7c3 100644 --- a/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp +++ b/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp @@ -13,12 +13,12 @@ using namespace sycl; -class MemObjMock : public sycl::detail::SYCLMemObjI { +class MemObjMock : public sycl::detail::SYCLMemObjT { public: using ContextImplPtr = std::shared_ptr; MemObjMock(const std::shared_ptr &Record) - : SYCLMemObjI() { + : SYCLMemObjT({}, nullptr) { MRecord = Record; } @@ -38,6 +38,7 @@ class MemObjMock : public sycl::detail::SYCLMemObjI { bool hasUserDataPtr() const override { return false; } bool isHostPointerReadOnly() const override { return false; } bool usesPinnedHostMemory() const override { return false; } + bool hasInteropEvent() const { return false; } detail::ContextImplPtr getInteropContext() const override { return nullptr; } }; @@ -62,7 +63,9 @@ TEST_F(SchedulerTest, LinkedAllocaDependencies) { // Commands are linked only if the device supports host unified memory. 
sycl::queue Queue1{Dev}; + sycl::queue Queue2{Dev}; sycl::detail::QueueImplPtr Q1 = sycl::detail::getSyclObjImpl(Queue1); + sycl::detail::QueueImplPtr Q2 = sycl::detail::getSyclObjImpl(Queue2); auto AllocaDep = [](sycl::detail::Command *, sycl::detail::Command *, sycl::detail::MemObjRecord *, @@ -77,8 +80,8 @@ TEST_F(SchedulerTest, LinkedAllocaDependencies) { sycl::detail::AllocaCommand AllocaCmd1(nullptr, Req, false); Record->MAllocaCommands.push_back(&AllocaCmd1); - MockCommand DepCmd(nullptr, Req); - MockCommand DepDepCmd(nullptr, Req); + MockCommand DepCmd(Q2, Req); + MockCommand DepDepCmd(Q2, Req); DepCmd.MDeps.push_back({&DepDepCmd, DepDepCmd.getRequirement(), &AllocaCmd1}); DepDepCmd.MUsers.insert(&DepCmd); std::vector ToEnqueue; @@ -87,7 +90,7 @@ TEST_F(SchedulerTest, LinkedAllocaDependencies) { MockScheduler MS; sycl::detail::Command *AllocaCmd2 = MS.getOrCreateAllocaForReq(Record.get(), &Req, Q1, ToEnqueue); - + ASSERT_NE(AllocaCmd2, &AllocaCmd1); ASSERT_TRUE(!!AllocaCmd1.MLinkedAllocaCmd) << "No link appeared in existing command"; ASSERT_EQ(AllocaCmd1.MLinkedAllocaCmd, AllocaCmd2) << "Invalid link appeared"; diff --git a/sycl/unittests/scheduler/QueueFlushing.cpp b/sycl/unittests/scheduler/QueueFlushing.cpp index 82cda17f1fa95..f05779f632078 100644 --- a/sycl/unittests/scheduler/QueueFlushing.cpp +++ b/sycl/unittests/scheduler/QueueFlushing.cpp @@ -20,6 +20,13 @@ static bool QueueFlushed = false; static bool EventStatusQueried = false; static ur_event_status_t EventStatus = UR_EVENT_STATUS_QUEUED; +static ur_result_t redefinedMemBufferReadRect(void *pParams) { + auto params = + *static_cast(pParams); + **params.pphEvent = mock::createDummyHandle(); + return UR_RESULT_SUCCESS; +} + static ur_result_t redefinedQueueFlush(void *pParams) { auto params = *static_cast(pParams); EXPECT_EQ(ExpectedDepQueue, *params.phQueue); @@ -87,6 +94,8 @@ TEST_F(SchedulerTest, QueueFlushing) { &redefinedQueueFlush); mock::getCallbacks().set_after_callback("urEventGetInfo", &redefinedEventGetInfoAfter); + mock::getCallbacks().set_after_callback("urEnqueueMemBufferReadRect", + &redefinedMemBufferReadRect); context Ctx{Plt}; queue QueueA{Ctx, default_selector_v}; @@ -124,8 +133,8 @@ TEST_F(SchedulerTest, QueueFlushing) { &HostAllocaCmd, QueueImplA, nullptr}; testCommandEnqueue(&MemCpyCmd, QueueImplB, MockReq); - detail::MemCpyCommandHost MemCpyCmdHost{MockReq, &AllocaCmd, MockReq, - &MockHostPtr, QueueImplA, nullptr}; + detail::MemCpyToHostCommand MemCpyCmdHost{MockReq, &AllocaCmd, MockReq, + &MockHostPtr, QueueImplA}; testCommandEnqueue(&MemCpyCmdHost, QueueImplB, MockReq); std::unique_ptr CG{ diff --git a/sycl/unittests/scheduler/SchedulerTestUtils.hpp b/sycl/unittests/scheduler/SchedulerTestUtils.hpp index ffc0567ba7daa..7a670c35124b5 100644 --- a/sycl/unittests/scheduler/SchedulerTestUtils.hpp +++ b/sycl/unittests/scheduler/SchedulerTestUtils.hpp @@ -180,9 +180,8 @@ class MockScheduler : public sycl::detail::Scheduler { sycl::detail::UpdateHostRequirementCommand * insertUpdateHostReqCmd(sycl::detail::MemObjRecord *Record, sycl::detail::Requirement *Req, - const sycl::detail::QueueImplPtr &Queue, std::vector &ToEnqueue) { - return MGraphBuilder.insertUpdateHostReqCmd(Record, Req, Queue, ToEnqueue); + return MGraphBuilder.insertUpdateHostReqCmd(Record, Req, ToEnqueue); } sycl::detail::EmptyCommand * diff --git a/unified-runtime/include/ur_api.h b/unified-runtime/include/ur_api.h index 814cdd58413fd..50d8d3fbbb5ee 100644 --- a/unified-runtime/include/ur_api.h +++ b/unified-runtime/include/ur_api.h @@ 
-457,6 +457,8 @@ typedef enum ur_function_t { UR_FUNCTION_COMMAND_BUFFER_GET_NATIVE_HANDLE_EXP = 264, /// Enumerator for ::urUSMPoolSetInfoExp UR_FUNCTION_USM_POOL_SET_INFO_EXP = 265, + /// Enumerator for ::urEventHostSignal + UR_FUNCTION_EVENT_HOST_SIGNAL = 266, /// @cond UR_FUNCTION_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -7426,6 +7428,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetNativeHandle( /// [out] a pointer to the native handle of the event. ur_native_handle_t *phNativeEvent); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Signals an event from host +/// +/// @remarks +/// _Analogues_ +/// - **clSetUserEventStatus** +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hEvent` +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY +UR_APIEXPORT ur_result_t UR_APICALL urEventHostSignal( + /// [in] handle of the event object + ur_event_handle_t hEvent); + /////////////////////////////////////////////////////////////////////////////// /// @brief Properties for for ::urEventCreateWithNativeHandle. typedef struct ur_event_native_properties_t { @@ -12932,6 +12955,14 @@ typedef struct ur_event_get_native_handle_params_t { ur_native_handle_t **pphNativeEvent; } ur_event_get_native_handle_params_t; +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function parameters for urEventHostSignal +/// @details Each entry is a pointer to the parameter passed to the function; +/// allowing the callback the ability to modify the parameter's value +typedef struct ur_event_host_signal_params_t { + ur_event_handle_t *phEvent; +} ur_event_host_signal_params_t; + /////////////////////////////////////////////////////////////////////////////// /// @brief Function parameters for urEventCreateWithNativeHandle /// @details Each entry is a pointer to the parameter passed to the function; diff --git a/unified-runtime/include/ur_api_funcs.def b/unified-runtime/include/ur_api_funcs.def index de3e0a5e38d92..c78edf3f94927 100644 --- a/unified-runtime/include/ur_api_funcs.def +++ b/unified-runtime/include/ur_api_funcs.def @@ -36,6 +36,7 @@ _UR_API(urEventWait) _UR_API(urEventRetain) _UR_API(urEventRelease) _UR_API(urEventGetNativeHandle) +_UR_API(urEventHostSignal) _UR_API(urEventCreateWithNativeHandle) _UR_API(urEventSetCallback) _UR_API(urProgramCreateWithIL) diff --git a/unified-runtime/include/ur_ddi.h b/unified-runtime/include/ur_ddi.h index 68dc0a265d284..051a705eac989 100644 --- a/unified-runtime/include/ur_ddi.h +++ b/unified-runtime/include/ur_ddi.h @@ -188,6 +188,10 @@ typedef ur_result_t(UR_APICALL *ur_pfnEventRelease_t)(ur_event_handle_t); typedef ur_result_t(UR_APICALL *ur_pfnEventGetNativeHandle_t)( ur_event_handle_t, ur_native_handle_t *); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urEventHostSignal +typedef ur_result_t(UR_APICALL *ur_pfnEventHostSignal_t)(ur_event_handle_t); + /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urEventCreateWithNativeHandle typedef ur_result_t(UR_APICALL *ur_pfnEventCreateWithNativeHandle_t)( @@ -210,6 +214,7 @@ typedef struct ur_event_dditable_t { ur_pfnEventRetain_t pfnRetain; ur_pfnEventRelease_t 
pfnRelease; ur_pfnEventGetNativeHandle_t pfnGetNativeHandle; + ur_pfnEventHostSignal_t pfnHostSignal; ur_pfnEventCreateWithNativeHandle_t pfnCreateWithNativeHandle; ur_pfnEventSetCallback_t pfnSetCallback; } ur_event_dditable_t; diff --git a/unified-runtime/include/ur_print.h b/unified-runtime/include/ur_print.h index d42a2eab16289..55efa0198a884 100644 --- a/unified-runtime/include/ur_print.h +++ b/unified-runtime/include/ur_print.h @@ -1685,6 +1685,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintEventGetNativeHandleParams( const struct ur_event_get_native_handle_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_event_host_signal_params_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintEventHostSignalParams( + const struct ur_event_host_signal_params_t *params, char *buffer, + const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_event_create_with_native_handle_params_t struct /// @returns diff --git a/unified-runtime/include/ur_print.hpp b/unified-runtime/include/ur_print.hpp index 98ed3d9990c1d..def47cfed825e 100644 --- a/unified-runtime/include/ur_print.hpp +++ b/unified-runtime/include/ur_print.hpp @@ -1234,6 +1234,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_USM_POOL_SET_INFO_EXP: os << "UR_FUNCTION_USM_POOL_SET_INFO_EXP"; break; + case UR_FUNCTION_EVENT_HOST_SIGNAL: + os << "UR_FUNCTION_EVENT_HOST_SIGNAL"; + break; default: os << "unknown enumerator"; break; @@ -12859,6 +12862,21 @@ inline std::ostream &operator<<( return os; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_event_host_signal_params_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<( + std::ostream &os, + [[maybe_unused]] const struct ur_event_host_signal_params_t *params) { + + os << ".hEvent = "; + + ur::details::printPtr(os, *(params->phEvent)); + + return os; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_event_create_with_native_handle_params_t /// type @@ -20663,6 +20681,9 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, case UR_FUNCTION_EVENT_GET_NATIVE_HANDLE: { os << (const struct ur_event_get_native_handle_params_t *)params; } break; + case UR_FUNCTION_EVENT_HOST_SIGNAL: { + os << (const struct ur_event_host_signal_params_t *)params; + } break; case UR_FUNCTION_EVENT_CREATE_WITH_NATIVE_HANDLE: { os << (const struct ur_event_create_with_native_handle_params_t *)params; } break; diff --git a/unified-runtime/scripts/core/event.yml b/unified-runtime/scripts/core/event.yml index 1b0eeca23e9de..d06d1da7537af 100644 --- a/unified-runtime/scripts/core/event.yml +++ b/unified-runtime/scripts/core/event.yml @@ -282,6 +282,22 @@ returns: - $X_RESULT_ERROR_UNSUPPORTED_FEATURE: - "If the adapter has no underlying equivalent handle." 
--- #-------------------------------------------------------------------------- +type: function +desc: "Signals an event from host" +class: $xEvent +name: HostSignal +ordinal: "0" +analogue: + - "**clSetUserEventStatus**" +params: + - type: $x_event_handle_t + name: hEvent + desc: "[in] handle of the event object" +returns: + - $X_RESULT_ERROR_INVALID_EVENT + - $X_RESULT_ERROR_OUT_OF_RESOURCES + - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY +--- #-------------------------------------------------------------------------- type: struct desc: "Properties for for $xEventCreateWithNativeHandle." class: $xQueue diff --git a/unified-runtime/scripts/core/registry.yml b/unified-runtime/scripts/core/registry.yml index 9c8a61184dfbb..1107ba89c0522 100644 --- a/unified-runtime/scripts/core/registry.yml +++ b/unified-runtime/scripts/core/registry.yml @@ -643,6 +643,9 @@ etors: - name: USM_POOL_SET_INFO_EXP desc: Enumerator for $xUSMPoolSetInfoExp value: '265' +- name: EVENT_HOST_SIGNAL + desc: Enumerator for $xEventHostSignal + value: '266' --- type: enum desc: Defines structure types diff --git a/unified-runtime/source/adapters/cuda/event.cpp b/unified-runtime/source/adapters/cuda/event.cpp index d440567ffaac6..d52a35fe40446 100644 --- a/unified-runtime/source/adapters/cuda/event.cpp +++ b/unified-runtime/source/adapters/cuda/event.cpp @@ -305,3 +305,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle( return UR_RESULT_SUCCESS; } + +UR_APIEXPORT ur_result_t UR_APICALL urEventHostSignal(ur_event_handle_t) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/unified-runtime/source/adapters/hip/event.cpp b/unified-runtime/source/adapters/hip/event.cpp index b91340b4ac679..a39595f1050a6 100644 --- a/unified-runtime/source/adapters/hip/event.cpp +++ b/unified-runtime/source/adapters/hip/event.cpp @@ -248,6 +248,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventSetCallback(ur_event_handle_t, return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } +UR_APIEXPORT ur_result_t UR_APICALL urEventHostSignal(ur_event_handle_t) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(ur_event_handle_t hEvent) { const auto RefCount = hEvent->incrementReferenceCount(); diff --git a/unified-runtime/source/adapters/level_zero/event.cpp b/unified-runtime/source/adapters/level_zero/event.cpp index 11c0502a55939..79e8f84bc47e2 100644 --- a/unified-runtime/source/adapters/level_zero/event.cpp +++ b/unified-runtime/source/adapters/level_zero/event.cpp @@ -895,6 +895,11 @@ urEventRetain(/** [in] handle of the event object */ ur_event_handle_t Event) { return UR_RESULT_SUCCESS; } +ur_result_t urEventHostSignal(ur_event_handle_t Event) { + auto ZeResult = ZE_CALL_NOCHECK(zeEventHostSignal, (Event->ZeEvent)); + return ze2urResult(ZeResult); +} + ur_result_t urEventRelease(/** [in] handle of the event object */ ur_event_handle_t Event) { @@ -948,8 +953,6 @@ ur_result_t urExtEventCreate( false /*ForceDisableProfiling*/, false)); (*Event)->RefCountExternal++; - if (!(*Event)->CounterBasedEventsEnabled) - ZE2UR_CALL(zeEventHostSignal, ((*Event)->ZeEvent)); return UR_RESULT_SUCCESS; } @@ -970,8 +973,6 @@ ur_result_t urEventCreateWithNativeHandle( false /*ForceDisableProfiling*/, false)); (*Event)->RefCountExternal++; - if (!(*Event)->CounterBasedEventsEnabled) - ZE2UR_CALL(zeEventHostSignal, ((*Event)->ZeEvent)); return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp 
b/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp index ede76fa63baf8..7f38f529b7ab7 100644 --- a/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp @@ -240,6 +240,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urGetEventProcAddrTable( pDdiTable->pfnRetain = ur::level_zero::urEventRetain; pDdiTable->pfnRelease = ur::level_zero::urEventRelease; pDdiTable->pfnGetNativeHandle = ur::level_zero::urEventGetNativeHandle; + pDdiTable->pfnHostSignal = ur::level_zero::urEventHostSignal; pDdiTable->pfnCreateWithNativeHandle = ur::level_zero::urEventCreateWithNativeHandle; pDdiTable->pfnSetCallback = ur::level_zero::urEventSetCallback; diff --git a/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp b/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp index 78eb006d4d2ff..7af0cc09c7293 100644 --- a/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp +++ b/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp @@ -320,6 +320,7 @@ ur_result_t urEventRetain(ur_event_handle_t hEvent); ur_result_t urEventRelease(ur_event_handle_t hEvent); ur_result_t urEventGetNativeHandle(ur_event_handle_t hEvent, ur_native_handle_t *phNativeEvent); +ur_result_t urEventHostSignal(ur_event_handle_t hEvent); ur_result_t urEventCreateWithNativeHandle(ur_native_handle_t hNativeEvent, ur_context_handle_t hContext, diff --git a/unified-runtime/source/adapters/level_zero/v2/event.cpp b/unified-runtime/source/adapters/level_zero/v2/event.cpp index 68c0d439dce7d..560fc68980a14 100644 --- a/unified-runtime/source/adapters/level_zero/v2/event.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/event.cpp @@ -232,6 +232,13 @@ ur_result_t urEventRelease(ur_event_handle_t hEvent) try { return exceptionToResult(std::current_exception()); } +ur_result_t urEventHostSignal(ur_event_handle_t hEvent) try { + auto ZeResult = ZE_CALL_NOCHECK(zeEventHostSignal, (hEvent->getZeEvent())); + return ze2urResult(ZeResult); +} catch (...) 
{ + return exceptionToResult(std::current_exception()); +} + ur_result_t urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) try { for (uint32_t i = 0; i < numEvents; ++i) { diff --git a/unified-runtime/source/adapters/mock/ur_mockddi.cpp b/unified-runtime/source/adapters/mock/ur_mockddi.cpp index 805b612dd69a5..5b025b3eaa3b0 100644 --- a/unified-runtime/source/adapters/mock/ur_mockddi.cpp +++ b/unified-runtime/source/adapters/mock/ur_mockddi.cpp @@ -5377,6 +5377,48 @@ __urdlllocal ur_result_t UR_APICALL urEventGetNativeHandle( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEventHostSignal +__urdlllocal ur_result_t UR_APICALL urEventHostSignal( + /// [in] handle of the event object + ur_event_handle_t hEvent) try { + ur_result_t result = UR_RESULT_SUCCESS; + + ur_event_host_signal_params_t params = {&hEvent}; + + auto beforeCallback = reinterpret_cast( + mock::getCallbacks().get_before_callback("urEventHostSignal")); + if (beforeCallback) { + result = beforeCallback(&params); + if (result != UR_RESULT_SUCCESS) { + return result; + } + } + + auto replaceCallback = reinterpret_cast( + mock::getCallbacks().get_replace_callback("urEventHostSignal")); + if (replaceCallback) { + result = replaceCallback(&params); + } else { + + result = UR_RESULT_SUCCESS; + } + + if (result != UR_RESULT_SUCCESS) { + return result; + } + + auto afterCallback = reinterpret_cast( + mock::getCallbacks().get_after_callback("urEventHostSignal")); + if (afterCallback) { + return afterCallback(&params); + } + + return result; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urEventCreateWithNativeHandle __urdlllocal ur_result_t UR_APICALL urEventCreateWithNativeHandle( @@ -12041,6 +12083,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEventProcAddrTable( pDdiTable->pfnGetNativeHandle = driver::urEventGetNativeHandle; + pDdiTable->pfnHostSignal = driver::urEventHostSignal; + pDdiTable->pfnCreateWithNativeHandle = driver::urEventCreateWithNativeHandle; pDdiTable->pfnSetCallback = driver::urEventSetCallback; diff --git a/unified-runtime/source/adapters/native_cpu/event.cpp b/unified-runtime/source/adapters/native_cpu/event.cpp index f981d24f42453..b5f23ad8d105a 100644 --- a/unified-runtime/source/adapters/native_cpu/event.cpp +++ b/unified-runtime/source/adapters/native_cpu/event.cpp @@ -121,6 +121,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( DIE_NO_IMPLEMENTATION; } +UR_APIEXPORT ur_result_t UR_APICALL urEventHostSignal(ur_event_handle_t) { + DIE_NO_IMPLEMENTATION; +} + ur_event_handle_t_::ur_event_handle_t_(ur_queue_handle_t queue, ur_command_t command_type) : queue(queue), context(queue->getContext()), command_type(command_type), diff --git a/unified-runtime/source/adapters/opencl/event.cpp b/unified-runtime/source/adapters/opencl/event.cpp index e039350c81dad..d6d74566c0e48 100644 --- a/unified-runtime/source/adapters/opencl/event.cpp +++ b/unified-runtime/source/adapters/opencl/event.cpp @@ -264,3 +264,7 @@ urEnqueueTimestampRecordingExp(ur_queue_handle_t, bool, uint32_t, const ur_event_handle_t *, ur_event_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + +UR_APIEXPORT ur_result_t UR_APICALL urEventHostSignal(ur_event_handle_t) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +}
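With the specification entry and the adapter entry points above in place, the sketch below shows how a host thread might drive the new API end to end. It is illustration only, not part of the patch: the imported native handle, the zero-initialized properties struct, and the omission of error checking are simplifying assumptions.

// Illustration only (not part of the patch): import an event, complete it from
// the host with the new entry point, then wait on it.
#include <ur_api.h>

void completeFromHost(ur_context_handle_t hContext,
                      ur_native_handle_t hNativeEvent) {
  ur_event_handle_t hEvent = nullptr;
  ur_event_native_properties_t props{}; // assumed defaults are sufficient here

  // Wrap the native event. With this change the Level Zero adapter no longer
  // signals it eagerly, so the host has to complete it explicitly.
  urEventCreateWithNativeHandle(hNativeEvent, hContext, &props, &hEvent);

  // ... host-side work the event is meant to gate ...

  urEventHostSignal(hEvent); // analogous to clSetUserEventStatus
  urEventWait(1, &hEvent);   // returns once the event is signaled
  urEventRelease(hEvent);
}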
diff --git a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp index 47b804becb41e..72baf3dbf1ee5 100644 --- a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp +++ b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp @@ -4381,6 +4381,39 @@ __urdlllocal ur_result_t UR_APICALL urEventGetNativeHandle( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEventHostSignal +__urdlllocal ur_result_t UR_APICALL urEventHostSignal( + /// [in] handle of the event object + ur_event_handle_t hEvent) { + auto pfnHostSignal = getContext()->urDdiTable.Event.pfnHostSignal; + + if (nullptr == pfnHostSignal) + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + + ur_event_host_signal_params_t params = {&hEvent}; + uint64_t instance = getContext()->notify_begin(UR_FUNCTION_EVENT_HOST_SIGNAL, + "urEventHostSignal", &params); + + auto &logger = getContext()->logger; + logger.info(" ---> urEventHostSignal\n"); + + ur_result_t result = pfnHostSignal(hEvent); + + getContext()->notify_end(UR_FUNCTION_EVENT_HOST_SIGNAL, "urEventHostSignal", + &params, &result, instance); + + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_EVENT_HOST_SIGNAL, + &params); + logger.info(" <--- urEventHostSignal({}) -> {};\n", args_str.str(), + result); + } + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urEventCreateWithNativeHandle __urdlllocal ur_result_t UR_APICALL urEventCreateWithNativeHandle( @@ -10270,6 +10303,9 @@ __urdlllocal ur_result_t UR_APICALL urGetEventProcAddrTable( dditable.pfnGetNativeHandle = pDdiTable->pfnGetNativeHandle; pDdiTable->pfnGetNativeHandle = ur_tracing_layer::urEventGetNativeHandle; + dditable.pfnHostSignal = pDdiTable->pfnHostSignal; + pDdiTable->pfnHostSignal = ur_tracing_layer::urEventHostSignal; + dditable.pfnCreateWithNativeHandle = pDdiTable->pfnCreateWithNativeHandle; pDdiTable->pfnCreateWithNativeHandle = ur_tracing_layer::urEventCreateWithNativeHandle; diff --git a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp index 5fad2472419ff..73e10b5bd0a3f 100644 --- a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp +++ b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp @@ -4310,6 +4310,32 @@ __urdlllocal ur_result_t UR_APICALL urEventGetNativeHandle( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEventHostSignal +__urdlllocal ur_result_t UR_APICALL urEventHostSignal( + /// [in] handle of the event object + ur_event_handle_t hEvent) { + auto pfnHostSignal = getContext()->urDdiTable.Event.pfnHostSignal; + + if (nullptr == pfnHostSignal) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + if (getContext()->enableParameterValidation) { + if (NULL == hEvent) + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + + if (getContext()->enableLifetimeValidation && + !getContext()->refCountContext->isReferenceValid(hEvent)) { + getContext()->refCountContext->logInvalidReference(hEvent); + } + + ur_result_t result = pfnHostSignal(hEvent); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urEventCreateWithNativeHandle __urdlllocal
ur_result_t UR_APICALL urEventCreateWithNativeHandle( @@ -10985,6 +11011,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEventProcAddrTable( dditable.pfnGetNativeHandle = pDdiTable->pfnGetNativeHandle; pDdiTable->pfnGetNativeHandle = ur_validation_layer::urEventGetNativeHandle; + dditable.pfnHostSignal = pDdiTable->pfnHostSignal; + pDdiTable->pfnHostSignal = ur_validation_layer::urEventHostSignal; + dditable.pfnCreateWithNativeHandle = pDdiTable->pfnCreateWithNativeHandle; pDdiTable->pfnCreateWithNativeHandle = ur_validation_layer::urEventCreateWithNativeHandle; diff --git a/unified-runtime/source/loader/loader.def.in b/unified-runtime/source/loader/loader.def.in index 2c1d2203ad31a..9f677c28ed531 100644 --- a/unified-runtime/source/loader/loader.def.in +++ b/unified-runtime/source/loader/loader.def.in @@ -101,6 +101,7 @@ EXPORTS urEventGetInfo urEventGetNativeHandle urEventGetProfilingInfo + urEventHostSignal urEventRelease urEventRetain urEventSetCallback @@ -305,6 +306,7 @@ EXPORTS urPrintEventGetInfoParams urPrintEventGetNativeHandleParams urPrintEventGetProfilingInfoParams + urPrintEventHostSignalParams urPrintEventInfo urPrintEventNativeProperties urPrintEventReleaseParams diff --git a/unified-runtime/source/loader/loader.map.in b/unified-runtime/source/loader/loader.map.in index 687f97b283506..f173482512f8e 100644 --- a/unified-runtime/source/loader/loader.map.in +++ b/unified-runtime/source/loader/loader.map.in @@ -101,6 +101,7 @@ urEventGetInfo; urEventGetNativeHandle; urEventGetProfilingInfo; + urEventHostSignal; urEventRelease; urEventRetain; urEventSetCallback; @@ -305,6 +306,7 @@ urPrintEventGetInfoParams; urPrintEventGetNativeHandleParams; urPrintEventGetProfilingInfoParams; + urPrintEventHostSignalParams; urPrintEventInfo; urPrintEventNativeProperties; urPrintEventReleaseParams; diff --git a/unified-runtime/source/loader/ur_ldrddi.cpp b/unified-runtime/source/loader/ur_ldrddi.cpp index 842e93969f3e0..c51f81f72b306 100644 --- a/unified-runtime/source/loader/ur_ldrddi.cpp +++ b/unified-runtime/source/loader/ur_ldrddi.cpp @@ -4308,6 +4308,30 @@ __urdlllocal ur_result_t UR_APICALL urEventGetNativeHandle( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEventHostSignal +__urdlllocal ur_result_t UR_APICALL urEventHostSignal( + /// [in] handle of the event object + ur_event_handle_t hEvent) { + ur_result_t result = UR_RESULT_SUCCESS; + + [[maybe_unused]] auto context = getContext(); + + // extract platform's function pointer table + auto dditable = reinterpret_cast(hEvent)->dditable; + auto pfnHostSignal = dditable->ur.Event.pfnHostSignal; + if (nullptr == pfnHostSignal) + return UR_RESULT_ERROR_UNINITIALIZED; + + // convert loader handle to platform handle + hEvent = reinterpret_cast(hEvent)->handle; + + // forward to device-platform + result = pfnHostSignal(hEvent); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urEventCreateWithNativeHandle __urdlllocal ur_result_t UR_APICALL urEventCreateWithNativeHandle( @@ -10431,6 +10455,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEventProcAddrTable( pDdiTable->pfnRetain = ur_loader::urEventRetain; pDdiTable->pfnRelease = ur_loader::urEventRelease; pDdiTable->pfnGetNativeHandle = ur_loader::urEventGetNativeHandle; + pDdiTable->pfnHostSignal = ur_loader::urEventHostSignal; pDdiTable->pfnCreateWithNativeHandle = ur_loader::urEventCreateWithNativeHandle; 
pDdiTable->pfnSetCallback = ur_loader::urEventSetCallback; diff --git a/unified-runtime/source/loader/ur_libapi.cpp b/unified-runtime/source/loader/ur_libapi.cpp index 8019681b288d8..e45824adab185 100644 --- a/unified-runtime/source/loader/ur_libapi.cpp +++ b/unified-runtime/source/loader/ur_libapi.cpp @@ -4898,6 +4898,35 @@ ur_result_t UR_APICALL urEventGetNativeHandle( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Signals an event from host +/// +/// @remarks +/// _Analogues_ +/// - **clSetUserEventStatus** +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hEvent` +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY +ur_result_t UR_APICALL urEventHostSignal( + /// [in] handle of the event object + ur_event_handle_t hEvent) try { + auto pfnHostSignal = ur_lib::getContext()->urDdiTable.Event.pfnHostSignal; + if (nullptr == pfnHostSignal) + return UR_RESULT_ERROR_UNINITIALIZED; + + return pfnHostSignal(hEvent); +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Create runtime event object from native event handle. /// diff --git a/unified-runtime/source/loader/ur_print.cpp b/unified-runtime/source/loader/ur_print.cpp index 50af3986b2f38..32a1ab5e02d75 100644 --- a/unified-runtime/source/loader/ur_print.cpp +++ b/unified-runtime/source/loader/ur_print.cpp @@ -1913,6 +1913,15 @@ ur_result_t urPrintEventGetNativeHandleParams( return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t +urPrintEventHostSignalParams(const struct ur_event_host_signal_params_t *params, + char *buffer, const size_t buff_size, + size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintEventCreateWithNativeHandleParams( const struct ur_event_create_with_native_handle_params_t *params, char *buffer, const size_t buff_size, size_t *out_size) { diff --git a/unified-runtime/source/ur_api.cpp b/unified-runtime/source/ur_api.cpp index 2bc2da22f256a..25e9c97228899 100644 --- a/unified-runtime/source/ur_api.cpp +++ b/unified-runtime/source/ur_api.cpp @@ -4275,6 +4275,30 @@ ur_result_t UR_APICALL urEventGetNativeHandle( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Signals an event from host +/// +/// @remarks +/// _Analogues_ +/// - **clSetUserEventStatus** +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hEvent` +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY +ur_result_t UR_APICALL urEventHostSignal( + /// [in] handle of the event object + ur_event_handle_t hEvent) { + ur_result_t result = UR_RESULT_SUCCESS; + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Create runtime event object from native event handle. ///