diff --git a/.github/intel-llvm-mirror-base-commit b/.github/intel-llvm-mirror-base-commit index 92a6c05337..9d93df2036 100644 --- a/.github/intel-llvm-mirror-base-commit +++ b/.github/intel-llvm-mirror-base-commit @@ -1 +1 @@ -63c70a1425d2c91fa54ec6495aae8ecfa7a5a10c +05538e008ad1e1b348e79bbb40888387288a2428 diff --git a/source/adapters/offload/device.cpp b/source/adapters/offload/device.cpp index fb18073da4..ece73a199c 100644 --- a/source/adapters/offload/device.cpp +++ b/source/adapters/offload/device.cpp @@ -121,6 +121,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: return ReturnValue(uint32_t{0}); case UR_DEVICE_INFO_QUEUE_PROPERTIES: + case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: + case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: + return ReturnValue( + ur_queue_flags_t{UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE}); case UR_DEVICE_INFO_KERNEL_LAUNCH_CAPABILITIES: return ReturnValue(0); case UR_DEVICE_INFO_SUPPORTED_PARTITIONS: { diff --git a/source/adapters/offload/enqueue.cpp b/source/adapters/offload/enqueue.cpp index d35d2bd40e..7dd6754b92 100644 --- a/source/adapters/offload/enqueue.cpp +++ b/source/adapters/offload/enqueue.cpp @@ -68,10 +68,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( LaunchArgs.DynSharedMemory = 0; ol_event_handle_t EventOut; + ol_queue_handle_t Queue; + OL_RETURN_ON_ERR(hQueue->nextQueue(Queue)); OL_RETURN_ON_ERR( - olLaunchKernel(hQueue->OffloadQueue, hQueue->OffloadDevice, - hKernel->OffloadKernel, hKernel->Args.getStorage(), - hKernel->Args.getStorageSize(), &LaunchArgs, &EventOut)); + olLaunchKernel(Queue, hQueue->OffloadDevice, hKernel->OffloadKernel, + hKernel->Args.getStorage(), hKernel->Args.getStorageSize(), + &LaunchArgs, &EventOut)); if (phEvent) { auto *Event = new ur_event_handle_t_(UR_COMMAND_KERNEL_LAUNCH, hQueue); @@ -105,15 +107,20 @@ ur_result_t doMemcpy(ur_command_t Command, ur_queue_handle_t hQueue, 
(void)phEventWaitList; // - ol_event_handle_t EventOut = nullptr; - - OL_RETURN_ON_ERR(olMemcpy(hQueue->OffloadQueue, DestPtr, DestDevice, SrcPtr, - SrcDevice, size, phEvent ? &EventOut : nullptr)); - if (blocking) { - OL_RETURN_ON_ERR(olSyncQueue(hQueue->OffloadQueue)); + OL_RETURN_ON_ERR(olMemcpy(nullptr, DestPtr, DestDevice, SrcPtr, SrcDevice, + size, nullptr)); + if (phEvent) { + *phEvent = ur_event_handle_t_::createEmptyEvent(Command, hQueue); + } + return UR_RESULT_SUCCESS; } + ol_event_handle_t EventOut = nullptr; + ol_queue_handle_t Queue; + OL_RETURN_ON_ERR(hQueue->nextQueue(Queue)); + OL_RETURN_ON_ERR(olMemcpy(Queue, DestPtr, DestDevice, SrcPtr, SrcDevice, size, + phEvent ? &EventOut : nullptr)); if (phEvent) { auto *Event = new ur_event_handle_t_(Command, hQueue); Event->OffloadEvent = EventOut; diff --git a/source/adapters/offload/queue.cpp b/source/adapters/offload/queue.cpp index b2e28ddd70..43647d0041 100644 --- a/source/adapters/offload/queue.cpp +++ b/source/adapters/offload/queue.cpp @@ -17,22 +17,30 @@ #include "queue.hpp" #include "ur2offload.hpp" -UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate(ur_context_handle_t hContext, - ur_device_handle_t hDevice, - const ur_queue_properties_t *, - ur_queue_handle_t *phQueue) { +UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate( + [[maybe_unused]] ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_queue_properties_t *pProps, ur_queue_handle_t *phQueue) { assert(hContext->Device == hDevice); - ur_queue_handle_t Queue = new ur_queue_handle_t_(); - auto Res = olCreateQueue(hDevice->OffloadDevice, &Queue->OffloadQueue); - if (Res != OL_SUCCESS) { - delete Queue; - return offloadResultToUR(Res); + ur_queue_flags_t URFlags = 0; + if (pProps && pProps->stype == UR_STRUCTURE_TYPE_QUEUE_PROPERTIES) { + URFlags = pProps->flags; } - Queue->OffloadDevice = hDevice->OffloadDevice; - Queue->UrContext = hContext; + ur_queue_handle_t Queue = + new ur_queue_handle_t_(hDevice->OffloadDevice, hContext, 
URFlags); + + // For in-order queues, create the ol queue on construction so we can report + // any errors earlier + if (!(URFlags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE)) { + [[maybe_unused]] ol_queue_handle_t InitQueue; + auto Res = Queue->nextQueue(InitQueue); + if (Res != OL_SUCCESS) { + delete Queue; + return offloadResultToUR(Res); + } + } *phQueue = Queue; @@ -47,6 +55,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue, UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); switch (propName) { + case UR_QUEUE_INFO_FLAGS: + return ReturnValue(hQueue->Flags); case UR_QUEUE_INFO_REFERENCE_COUNT: return ReturnValue(hQueue->RefCount.load()); default: @@ -63,7 +73,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain(ur_queue_handle_t hQueue) { UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) { if (--hQueue->RefCount == 0) { - OL_RETURN_ON_ERR(olDestroyQueue(hQueue->OffloadQueue)); + for (auto *Q : hQueue->OffloadQueues) { + if (!Q) { + break; + } + OL_RETURN_ON_ERR(olDestroyQueue(Q)); + } delete hQueue; } @@ -71,7 +86,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) { } UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) { - return offloadResultToUR(olSyncQueue(hQueue->OffloadQueue)); + for (auto *Q : hQueue->OffloadQueues) { + if (!Q) { + break; + } + OL_RETURN_ON_ERR(olSyncQueue(Q)); + } + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle( diff --git a/source/adapters/offload/queue.hpp b/source/adapters/offload/queue.hpp index e9c642f528..25585db273 100644 --- a/source/adapters/offload/queue.hpp +++ b/source/adapters/offload/queue.hpp @@ -15,8 +15,42 @@ #include "common.hpp" +constexpr size_t OOO_QUEUE_POOL_SIZE = 32; + struct ur_queue_handle_t_ : RefCounted { - ol_queue_handle_t OffloadQueue; + ur_queue_handle_t_(ol_device_handle_t Device, ur_context_handle_t UrContext, + ur_queue_flags_t 
Flags) + : OffloadQueues((Flags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) + ? OOO_QUEUE_POOL_SIZE + : 1), + QueueOffset(0), OffloadDevice(Device), UrContext(UrContext), + Flags(Flags) {} + + // In-order queues only have one element here, while out of order queues have + // a bank of queues to use. We rotate through them round robin instead of + // constantly creating new ones in case there is a long-running program that + // never destroys the ur queue. Out-of-order queues create ol queues when + // needed; any queues that are not yet created are nullptr. + // This is a simpler implementation of the HIP/Cuda queue pooling logic in + // `stream_queue_t`. In the future, if we want more performance or it + // simplifies the implementation of a feature, we can consider using it. + std::vector<ol_queue_handle_t> OffloadQueues; + size_t QueueOffset; ol_device_handle_t OffloadDevice; ur_context_handle_t UrContext; + ur_queue_flags_t Flags; + + ol_result_t nextQueue(ol_queue_handle_t &Handle) { + auto &Slot = OffloadQueues[QueueOffset++]; + QueueOffset %= OffloadQueues.size(); + + if (!Slot) { + if (auto Res = olCreateQueue(OffloadDevice, &Slot)) { + return Res; + } + } + + Handle = Slot; + return nullptr; + } }; diff --git a/source/loader/layers/sanitizer/asan/asan_buffer.cpp b/source/loader/layers/sanitizer/asan/asan_buffer.cpp index a5d6eb6269..a98cec7453 100644 --- a/source/loader/layers/sanitizer/asan/asan_buffer.cpp +++ b/source/loader/layers/sanitizer/asan/asan_buffer.cpp @@ -90,6 +90,7 @@ ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) { assert((void *)Device != nullptr && "Device cannot be nullptr"); std::scoped_lock Guard(Mutex); + auto CI = getAsanInterceptor()->getContextInfo(Context); auto &Allocation = Allocations[Device]; ur_result_t URes = UR_RESULT_SUCCESS; if (!Allocation) { @@ -106,9 +107,9 @@ ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) { } if (HostPtr) { - ManagedQueue Queue(Context, Device); + 
ur_queue_handle_t InternalQueue = CI->getInternalQueue(Device); URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( - Queue, true, Allocation, HostPtr, Size, 0, nullptr, nullptr); + InternalQueue, true, Allocation, HostPtr, Size, 0, nullptr, nullptr); if (URes != UR_RESULT_SUCCESS) { UR_LOG_L( getContext()->logger, ERR, @@ -147,10 +148,10 @@ ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) { // Copy data from last synced device to host { - ManagedQueue Queue(Context, LastSyncedDevice.hDevice); + ur_queue_handle_t InternalQueue = CI->getInternalQueue(LastSyncedDevice.hDevice); URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( - Queue, true, HostAllocation, LastSyncedDevice.MemHandle, Size, 0, - nullptr, nullptr); + InternalQueue, true, HostAllocation, LastSyncedDevice.MemHandle, Size, + 0, nullptr, nullptr); if (URes != UR_RESULT_SUCCESS) { UR_LOG_L(getContext()->logger, ERR, "Failed to migrate memory buffer data"); @@ -160,9 +161,10 @@ ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) { // Sync data back to device { - ManagedQueue Queue(Context, Device); + ur_queue_handle_t InternalQueue = CI->getInternalQueue(Device); URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( - Queue, true, Allocation, HostAllocation, Size, 0, nullptr, nullptr); + InternalQueue, true, Allocation, HostAllocation, Size, 0, nullptr, + nullptr); if (URes != UR_RESULT_SUCCESS) { UR_LOG_L(getContext()->logger, ERR, "Failed to migrate memory buffer data"); diff --git a/source/loader/layers/sanitizer/asan/asan_ddi.cpp b/source/loader/layers/sanitizer/asan/asan_ddi.cpp index 0b4a64c38a..899ff6a850 100644 --- a/source/loader/layers/sanitizer/asan/asan_ddi.cpp +++ b/source/loader/layers/sanitizer/asan/asan_ddi.cpp @@ -677,7 +677,7 @@ __urdlllocal ur_result_t UR_APICALL urMemBufferCreate( std::shared_ptr<ContextInfo> CtxInfo = getAsanInterceptor()->getContextInfo(hContext); for (const auto &hDevice : CtxInfo->DeviceList) { - ManagedQueue InternalQueue(hContext, hDevice); 
+ ur_queue_handle_t InternalQueue = CtxInfo->getInternalQueue(hDevice); char *Handle = nullptr; UR_CALL(pMemBuffer->getHandle(hDevice, Handle)); UR_CALL(getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp index 5e984d9b93..6c6eb65c92 100644 --- a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp @@ -270,17 +270,15 @@ ur_result_t AsanInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, auto ContextInfo = getContextInfo(Context); auto DeviceInfo = getDeviceInfo(Device); - ManagedQueue InternalQueue(Context, Device); - if (!InternalQueue) { - UR_LOG_L(getContext()->logger, ERR, "Failed to create internal queue"); - return UR_RESULT_ERROR_INVALID_QUEUE; - } + ur_queue_handle_t InternalQueue = ContextInfo->getInternalQueue(Device); UR_CALL(prepareLaunch(ContextInfo, DeviceInfo, InternalQueue, Kernel, LaunchInfo)); UR_CALL(updateShadowMemory(ContextInfo, DeviceInfo, InternalQueue)); + UR_CALL(getContext()->urDdiTable.Queue.pfnFinish(InternalQueue)); + return UR_RESULT_SUCCESS; } @@ -467,6 +465,7 @@ ur_result_t AsanInterceptor::unregisterProgram(ur_program_handle_t Program) { ur_result_t AsanInterceptor::registerSpirKernels(ur_program_handle_t Program) { auto Context = GetContext(Program); + auto CI = getContextInfo(Context); std::vector<ur_device_handle_t> Devices = GetDevices(Program); for (auto Device : Devices) { @@ -484,11 +483,11 @@ ur_result_t AsanInterceptor::registerSpirKernels(ur_program_handle_t Program) { assert((MetadataSize % sizeof(SpirKernelInfo) == 0) && "SpirKernelMetadata size is not correct"); - ManagedQueue Queue(Context, Device); + ur_queue_handle_t InternalQueue = CI->getInternalQueue(Device); std::vector<SpirKernelInfo> SKInfo(NumOfSpirKernel); Result = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( - Queue, true, &SKInfo[0], MetadataPtr, + 
sizeof(SpirKernelInfo) * NumOfSpirKernel, 0, nullptr, nullptr); if (Result != UR_RESULT_SUCCESS) { UR_LOG_L(getContext()->logger, ERR, "Can't read the value of <{}>: {}", @@ -504,7 +503,7 @@ ur_result_t AsanInterceptor::registerSpirKernels(ur_program_handle_t Program) { } std::vector<char> KernelNameV(SKI.Size); Result = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( - Queue, true, KernelNameV.data(), (void *)SKI.KernelName, + InternalQueue, true, KernelNameV.data(), (void *)SKI.KernelName, sizeof(char) * SKI.Size, 0, nullptr, nullptr); if (Result != UR_RESULT_SUCCESS) { UR_LOG_L(getContext()->logger, ERR, "Can't read kernel name: {}", @@ -537,7 +536,7 @@ AsanInterceptor::registerDeviceGlobals(ur_program_handle_t Program) { assert(ProgramInfo != nullptr && "unregistered program!"); for (auto Device : Devices) { - ManagedQueue Queue(Context, Device); + ur_queue_handle_t InternalQueue = ContextInfo->getInternalQueue(Device); size_t MetadataSize; void *MetadataPtr; @@ -554,7 +553,7 @@ AsanInterceptor::registerDeviceGlobals(ur_program_handle_t Program) { "DeviceGlobal metadata size is not correct"); std::vector<DeviceGlobalInfo> GVInfos(NumOfDeviceGlobal); Result = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( - Queue, true, &GVInfos[0], MetadataPtr, + InternalQueue, true, &GVInfos[0], MetadataPtr, sizeof(DeviceGlobalInfo) * NumOfDeviceGlobal, 0, nullptr, nullptr); if (Result != UR_RESULT_SUCCESS) { UR_LOG_L(getContext()->logger, ERR, "Device Global[{}] Read Failed: {}", @@ -932,6 +931,8 @@ bool ProgramInfo::isKernelInstrumented(ur_kernel_handle_t Kernel) const { ContextInfo::~ContextInfo() { Stats.Print(Handle); + InternalQueueMap.clear(); + [[maybe_unused]] ur_result_t URes; if (USMPool) { URes = getContext()->urDdiTable.USM.pfnPoolRelease(USMPool); @@ -971,6 +972,13 @@ ur_usm_pool_handle_t ContextInfo::getUSMPool() { return USMPool; } +ur_queue_handle_t ContextInfo::getInternalQueue(ur_device_handle_t Device) { + std::scoped_lock Guard(InternalQueueMapMutex); + if 
(!InternalQueueMap[Device]) + InternalQueueMap[Device].emplace(Handle, Device); + return *InternalQueueMap[Device]; +} + AsanRuntimeDataWrapper::~AsanRuntimeDataWrapper() { [[maybe_unused]] ur_result_t Result; if (Host.LocalArgs) { diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.hpp b/source/loader/layers/sanitizer/asan/asan_interceptor.hpp index bbf75d803b..8190bd9232 100644 --- a/source/loader/layers/sanitizer/asan/asan_interceptor.hpp +++ b/source/loader/layers/sanitizer/asan/asan_interceptor.hpp @@ -20,6 +20,7 @@ #include "asan_statistics.hpp" #include "sanitizer_common/sanitizer_common.hpp" #include "sanitizer_common/sanitizer_options.hpp" +#include "sanitizer_common/sanitizer_utils.hpp" #include "ur_sanitizer_layer.hpp" #include @@ -143,6 +144,10 @@ struct ContextInfo { std::vector<ur_device_handle_t> DeviceList; std::unordered_map AllocInfosMap; + ur_shared_mutex InternalQueueMapMutex; + std::unordered_map<ur_device_handle_t, std::optional<ManagedQueue>> + InternalQueueMap; + AsanStatsWrapper Stats; explicit ContextInfo(ur_context_handle_t Context) : Handle(Context) { @@ -163,6 +168,8 @@ struct ContextInfo { } ur_usm_pool_handle_t getUSMPool(); + + ur_queue_handle_t getInternalQueue(ur_device_handle_t); }; struct AsanRuntimeDataWrapper { diff --git a/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.cpp b/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.cpp index 2bd8733394..b81cf55b4d 100644 --- a/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.cpp +++ b/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.cpp @@ -31,9 +31,11 @@ ur_usm_type_t GetUSMType(ur_context_handle_t Context, const void *MemPtr) { } // namespace ManagedQueue::ManagedQueue(ur_context_handle_t Context, - ur_device_handle_t Device) { + ur_device_handle_t Device, bool IsOutOfOrder) { + ur_queue_properties_t Prop{UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, nullptr, + UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE}; [[maybe_unused]] auto Result = getContext()->urDdiTable.Queue.pfnCreate( - 
Context, Device, nullptr, &Handle); + Context, Device, IsOutOfOrder ? &Prop : nullptr, &Handle); assert(Result == UR_RESULT_SUCCESS && "Failed to create ManagedQueue"); UR_LOG_L(getContext()->logger, DEBUG, ">>> ManagedQueue {}", (void *)Handle); } diff --git a/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.hpp b/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.hpp index 6024c644a4..0199cc6f24 100644 --- a/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.hpp +++ b/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.hpp @@ -23,7 +23,8 @@ namespace ur_sanitizer_layer { struct ManagedQueue { - ManagedQueue(ur_context_handle_t Context, ur_device_handle_t Device); + ManagedQueue(ur_context_handle_t Context, ur_device_handle_t Device, + bool IsOutOfOrder = false); ~ManagedQueue(); // Disable copy semantics diff --git a/source/loader/layers/sanitizer/tsan/tsan_buffer.cpp b/source/loader/layers/sanitizer/tsan/tsan_buffer.cpp index b3ed7386a5..c42a39d7cc 100644 --- a/source/loader/layers/sanitizer/tsan/tsan_buffer.cpp +++ b/source/loader/layers/sanitizer/tsan/tsan_buffer.cpp @@ -99,6 +99,7 @@ ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) { std::scoped_lock Guard(Mutex); auto &Allocation = Allocations[Device]; + auto CI = getTsanInterceptor()->getContextInfo(Context); ur_result_t URes = UR_RESULT_SUCCESS; if (!Allocation) { ur_usm_desc_t USMDesc{}; @@ -114,7 +115,7 @@ ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) { } if (HostPtr) { - ManagedQueue Queue(Context, Device); + ur_queue_handle_t Queue = CI->getInternalQueue(Device); URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( Queue, true, Allocation, HostPtr, Size, 0, nullptr, nullptr); if (URes != UR_RESULT_SUCCESS) { @@ -155,7 +156,7 @@ ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) { // Copy data from last synced device to host { - ManagedQueue Queue(Context, 
LastSyncedDevice.hDevice); + ur_queue_handle_t Queue = CI->getInternalQueue(LastSyncedDevice.hDevice); URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( Queue, true, HostAllocation, LastSyncedDevice.MemHandle, Size, 0, nullptr, nullptr); @@ -168,7 +169,7 @@ ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) { // Sync data back to device { - ManagedQueue Queue(Context, Device); + ur_queue_handle_t Queue = CI->getInternalQueue(Device); URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( Queue, true, Allocation, HostAllocation, Size, 0, nullptr, nullptr); if (URes != UR_RESULT_SUCCESS) { diff --git a/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp b/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp index fa772dd7e8..61849ac0b3 100644 --- a/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp +++ b/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp @@ -402,7 +402,7 @@ ur_result_t urMemBufferCreate( std::shared_ptr<ContextInfo> CtxInfo = getTsanInterceptor()->getContextInfo(hContext); for (const auto &hDevice : CtxInfo->DeviceList) { - ManagedQueue InternalQueue(hContext, hDevice); + ur_queue_handle_t InternalQueue = CtxInfo->getInternalQueue(hDevice); char *Handle = nullptr; UR_CALL(pMemBuffer->getHandle(hDevice, Handle)); UR_CALL(getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( diff --git a/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp b/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp index 6c1ee4a62b..86c12f9e89 100644 --- a/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp +++ b/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp @@ -13,7 +13,6 @@ */ #include "tsan_interceptor.hpp" -#include "sanitizer_common/sanitizer_utils.hpp" #include "tsan_report.hpp" namespace ur_sanitizer_layer { @@ -107,6 +106,13 @@ void ContextInfo::insertAllocInfo(TsanAllocInfo AI) { AllocInfos.insert(std::move(AI)); } +ur_queue_handle_t ContextInfo::getInternalQueue(ur_device_handle_t Device) { + std::scoped_lock Guard(InternalQueueMapMutex); + if 
(!InternalQueueMap[Device]) + InternalQueueMap[Device].emplace(Handle, Device, true); + return *InternalQueueMap[Device]; +} + TsanInterceptor::~TsanInterceptor() { // We must release these objects before releasing adapters, since // they may use the adapter in their destructor @@ -190,7 +196,7 @@ TsanInterceptor::registerDeviceGlobals(ur_program_handle_t Program) { auto &ProgramInfo = getProgramInfo(Program); for (auto Device : Devices) { - ManagedQueue Queue(Context, Device); + ur_queue_handle_t Queue = ContextInfo->getInternalQueue(Device); size_t MetadataSize; void *MetadataPtr; @@ -333,16 +339,14 @@ ur_result_t TsanInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, auto CI = getContextInfo(GetContext(Queue)); auto DI = getDeviceInfo(GetDevice(Queue)); - ManagedQueue InternalQueue(CI->Handle, DI->Handle); - if (!InternalQueue) { - UR_LOG_L(getContext()->logger, ERR, "Failed to create internal queue"); - return UR_RESULT_ERROR_INVALID_QUEUE; - } + ur_queue_handle_t InternalQueue = CI->getInternalQueue(DI->Handle); UR_CALL(prepareLaunch(CI, DI, InternalQueue, Kernel, LaunchInfo)); UR_CALL(updateShadowMemory(CI, DI, Kernel, InternalQueue)); + UR_CALL(getContext()->urDdiTable.Queue.pfnFinish(InternalQueue)); + return UR_RESULT_SUCCESS; } diff --git a/source/loader/layers/sanitizer/tsan/tsan_interceptor.hpp b/source/loader/layers/sanitizer/tsan/tsan_interceptor.hpp index f6ada3a06b..e700e70294 100644 --- a/source/loader/layers/sanitizer/tsan/tsan_interceptor.hpp +++ b/source/loader/layers/sanitizer/tsan/tsan_interceptor.hpp @@ -15,6 +15,7 @@ #include "sanitizer_common/sanitizer_allocator.hpp" #include "sanitizer_common/sanitizer_common.hpp" +#include "sanitizer_common/sanitizer_utils.hpp" #include "tsan_buffer.hpp" #include "tsan_libdevice.hpp" #include "tsan_shadow.hpp" @@ -58,6 +59,10 @@ struct ContextInfo { ur_shared_mutex AllocInfosMutex; std::set AllocInfos; + ur_shared_mutex InternalQueueMapMutex; + std::unordered_map<ur_device_handle_t, std::optional<ManagedQueue>> + InternalQueueMap; + explicit 
ContextInfo(ur_context_handle_t Context) : Handle(Context) { [[maybe_unused]] auto Result = getContext()->urDdiTable.Context.pfnRetain(Context); @@ -65,6 +70,7 @@ struct ContextInfo { } ~ContextInfo() { + InternalQueueMap.clear(); [[maybe_unused]] auto Result = getContext()->urDdiTable.Context.pfnRelease(Handle); assert(Result == UR_RESULT_SUCCESS); @@ -75,6 +81,8 @@ struct ContextInfo { ContextInfo &operator=(const ContextInfo &) = delete; void insertAllocInfo(TsanAllocInfo AI); + + ur_queue_handle_t getInternalQueue(ur_device_handle_t); }; struct DeviceGlobalInfo {