Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/intel-llvm-mirror-base-commit
Original file line number Diff line number Diff line change
@@ -1 +1 @@
63c70a1425d2c91fa54ec6495aae8ecfa7a5a10c
05538e008ad1e1b348e79bbb40888387288a2428
4 changes: 4 additions & 0 deletions source/adapters/offload/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
case UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT:
return ReturnValue(uint32_t{0});
case UR_DEVICE_INFO_QUEUE_PROPERTIES:
case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES:
case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES:
return ReturnValue(
ur_queue_flags_t{UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE});
case UR_DEVICE_INFO_KERNEL_LAUNCH_CAPABILITIES:
return ReturnValue(0);
case UR_DEVICE_INFO_SUPPORTED_PARTITIONS: {
Expand Down
25 changes: 16 additions & 9 deletions source/adapters/offload/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
LaunchArgs.DynSharedMemory = 0;

ol_event_handle_t EventOut;
ol_queue_handle_t Queue;
OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
OL_RETURN_ON_ERR(
olLaunchKernel(hQueue->OffloadQueue, hQueue->OffloadDevice,
hKernel->OffloadKernel, hKernel->Args.getStorage(),
hKernel->Args.getStorageSize(), &LaunchArgs, &EventOut));
olLaunchKernel(Queue, hQueue->OffloadDevice, hKernel->OffloadKernel,
hKernel->Args.getStorage(), hKernel->Args.getStorageSize(),
&LaunchArgs, &EventOut));

if (phEvent) {
auto *Event = new ur_event_handle_t_(UR_COMMAND_KERNEL_LAUNCH, hQueue);
Expand Down Expand Up @@ -105,15 +107,20 @@ ur_result_t doMemcpy(ur_command_t Command, ur_queue_handle_t hQueue,
(void)phEventWaitList;
//

ol_event_handle_t EventOut = nullptr;

OL_RETURN_ON_ERR(olMemcpy(hQueue->OffloadQueue, DestPtr, DestDevice, SrcPtr,
SrcDevice, size, phEvent ? &EventOut : nullptr));

if (blocking) {
OL_RETURN_ON_ERR(olSyncQueue(hQueue->OffloadQueue));
OL_RETURN_ON_ERR(olMemcpy(nullptr, DestPtr, DestDevice, SrcPtr, SrcDevice,
size, nullptr));
if (phEvent) {
*phEvent = ur_event_handle_t_::createEmptyEvent(Command, hQueue);
}
return UR_RESULT_SUCCESS;
}

ol_event_handle_t EventOut = nullptr;
ol_queue_handle_t Queue;
OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
OL_RETURN_ON_ERR(olMemcpy(Queue, DestPtr, DestDevice, SrcPtr, SrcDevice, size,
phEvent ? &EventOut : nullptr));
if (phEvent) {
auto *Event = new ur_event_handle_t_(Command, hQueue);
Event->OffloadEvent = EventOut;
Expand Down
47 changes: 34 additions & 13 deletions source/adapters/offload/queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,22 +17,30 @@
#include "queue.hpp"
#include "ur2offload.hpp"

UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate(ur_context_handle_t hContext,
ur_device_handle_t hDevice,
const ur_queue_properties_t *,
ur_queue_handle_t *phQueue) {
UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate(
[[maybe_unused]] ur_context_handle_t hContext, ur_device_handle_t hDevice,
const ur_queue_properties_t *pProps, ur_queue_handle_t *phQueue) {

assert(hContext->Device == hDevice);

ur_queue_handle_t Queue = new ur_queue_handle_t_();
auto Res = olCreateQueue(hDevice->OffloadDevice, &Queue->OffloadQueue);
if (Res != OL_SUCCESS) {
delete Queue;
return offloadResultToUR(Res);
ur_queue_flags_t URFlags = 0;
if (pProps && pProps->stype == UR_STRUCTURE_TYPE_QUEUE_PROPERTIES) {
URFlags = pProps->flags;
}

Queue->OffloadDevice = hDevice->OffloadDevice;
Queue->UrContext = hContext;
ur_queue_handle_t Queue =
new ur_queue_handle_t_(hDevice->OffloadDevice, hContext, URFlags);

// For in-order queues, create the ol queue on construction so we can report
// any errors earlier
if (!(URFlags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE)) {
[[maybe_unused]] ol_queue_handle_t InitQueue;
auto Res = Queue->nextQueue(InitQueue);
if (Res != OL_SUCCESS) {
delete Queue;
return offloadResultToUR(Res);
}
}

*phQueue = Queue;

Expand All @@ -47,6 +55,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue,
UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);

switch (propName) {
case UR_QUEUE_INFO_FLAGS:
return ReturnValue(hQueue->Flags);
case UR_QUEUE_INFO_REFERENCE_COUNT:
return ReturnValue(hQueue->RefCount.load());
default:
Expand All @@ -63,15 +73,26 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain(ur_queue_handle_t hQueue) {

UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) {
if (--hQueue->RefCount == 0) {
OL_RETURN_ON_ERR(olDestroyQueue(hQueue->OffloadQueue));
for (auto *Q : hQueue->OffloadQueues) {
if (!Q) {
break;
}
OL_RETURN_ON_ERR(olDestroyQueue(Q));
}
delete hQueue;
}

return UR_RESULT_SUCCESS;
}

UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) {
return offloadResultToUR(olSyncQueue(hQueue->OffloadQueue));
for (auto *Q : hQueue->OffloadQueues) {
if (!Q) {
break;
}
OL_RETURN_ON_ERR(olSyncQueue(Q));
}
return UR_RESULT_SUCCESS;
}

UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle(
Expand Down
36 changes: 35 additions & 1 deletion source/adapters/offload/queue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,42 @@

#include "common.hpp"

constexpr size_t OOO_QUEUE_POOL_SIZE = 32;

struct ur_queue_handle_t_ : RefCounted {
ol_queue_handle_t OffloadQueue;
ur_queue_handle_t_(ol_device_handle_t Device, ur_context_handle_t UrContext,
ur_queue_flags_t Flags)
: OffloadQueues((Flags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE)
? 1
: OOO_QUEUE_POOL_SIZE),
QueueOffset(0), OffloadDevice(Device), UrContext(UrContext),
Flags(Flags) {}

// In-order queues only have one element here, while out of order queues have
// a bank of queues to use. We rotate through them round robin instead of
// constantly creating new ones in case there is a long-running program that
// never destroys the ur queue. Out-of-order queues create ol queues when
// needed; any queues that are not yet created are nullptr.
// This is a simpler implementation of the HIP/Cuda queue pooling logic in
// `stream_queue_t`. In the future, if we want more performance or it
// simplifies the implementation of a feature, we can consider using it.
std::vector<ol_queue_handle_t> OffloadQueues;
size_t QueueOffset;
ol_device_handle_t OffloadDevice;
ur_context_handle_t UrContext;
ur_queue_flags_t Flags;

ol_result_t nextQueue(ol_queue_handle_t &Handle) {
auto &Slot = OffloadQueues[QueueOffset++];
QueueOffset %= OffloadQueues.size();

if (!Slot) {
if (auto Res = olCreateQueue(OffloadDevice, &Slot)) {
return Res;
}
}

Handle = Slot;
return nullptr;
}
};
16 changes: 9 additions & 7 deletions source/loader/layers/sanitizer/asan/asan_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) {
assert((void *)Device != nullptr && "Device cannot be nullptr");

std::scoped_lock<ur_shared_mutex> Guard(Mutex);
auto CI = getAsanInterceptor()->getContextInfo(Context);
auto &Allocation = Allocations[Device];
ur_result_t URes = UR_RESULT_SUCCESS;
if (!Allocation) {
Expand All @@ -106,9 +107,9 @@ ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) {
}

if (HostPtr) {
ManagedQueue Queue(Context, Device);
ur_queue_handle_t InternalQueue = CI->getInternalQueue(Device);
URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy(
Queue, true, Allocation, HostPtr, Size, 0, nullptr, nullptr);
InternalQueue, true, Allocation, HostPtr, Size, 0, nullptr, nullptr);
if (URes != UR_RESULT_SUCCESS) {
UR_LOG_L(
getContext()->logger, ERR,
Expand Down Expand Up @@ -147,10 +148,10 @@ ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) {

// Copy data from last synced device to host
{
ManagedQueue Queue(Context, LastSyncedDevice.hDevice);
ur_queue_handle_t InternalQueue = CI->getInternalQueue(Device);
URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy(
Queue, true, HostAllocation, LastSyncedDevice.MemHandle, Size, 0,
nullptr, nullptr);
InternalQueue, true, HostAllocation, LastSyncedDevice.MemHandle, Size,
0, nullptr, nullptr);
if (URes != UR_RESULT_SUCCESS) {
UR_LOG_L(getContext()->logger, ERR,
"Failed to migrate memory buffer data");
Expand All @@ -160,9 +161,10 @@ ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) {

// Sync data back to device
{
ManagedQueue Queue(Context, Device);
ur_queue_handle_t InternalQueue = CI->getInternalQueue(Device);
URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy(
Queue, true, Allocation, HostAllocation, Size, 0, nullptr, nullptr);
InternalQueue, true, Allocation, HostAllocation, Size, 0, nullptr,
nullptr);
if (URes != UR_RESULT_SUCCESS) {
UR_LOG_L(getContext()->logger, ERR,
"Failed to migrate memory buffer data");
Expand Down
2 changes: 1 addition & 1 deletion source/loader/layers/sanitizer/asan/asan_ddi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -677,7 +677,7 @@ __urdlllocal ur_result_t UR_APICALL urMemBufferCreate(
std::shared_ptr<ContextInfo> CtxInfo =
getAsanInterceptor()->getContextInfo(hContext);
for (const auto &hDevice : CtxInfo->DeviceList) {
ManagedQueue InternalQueue(hContext, hDevice);
ur_queue_handle_t InternalQueue = CtxInfo->getInternalQueue(hDevice);
char *Handle = nullptr;
UR_CALL(pMemBuffer->getHandle(hDevice, Handle));
UR_CALL(getContext()->urDdiTable.Enqueue.pfnUSMMemcpy(
Expand Down
28 changes: 18 additions & 10 deletions source/loader/layers/sanitizer/asan/asan_interceptor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -270,17 +270,15 @@ ur_result_t AsanInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel,
auto ContextInfo = getContextInfo(Context);
auto DeviceInfo = getDeviceInfo(Device);

ManagedQueue InternalQueue(Context, Device);
if (!InternalQueue) {
UR_LOG_L(getContext()->logger, ERR, "Failed to create internal queue");
return UR_RESULT_ERROR_INVALID_QUEUE;
}
ur_queue_handle_t InternalQueue = ContextInfo->getInternalQueue(Device);

UR_CALL(prepareLaunch(ContextInfo, DeviceInfo, InternalQueue, Kernel,
LaunchInfo));

UR_CALL(updateShadowMemory(ContextInfo, DeviceInfo, InternalQueue));

UR_CALL(getContext()->urDdiTable.Queue.pfnFinish(InternalQueue));

return UR_RESULT_SUCCESS;
}

Expand Down Expand Up @@ -467,6 +465,7 @@ ur_result_t AsanInterceptor::unregisterProgram(ur_program_handle_t Program) {

ur_result_t AsanInterceptor::registerSpirKernels(ur_program_handle_t Program) {
auto Context = GetContext(Program);
auto CI = getContextInfo(Context);
std::vector<ur_device_handle_t> Devices = GetDevices(Program);

for (auto Device : Devices) {
Expand All @@ -484,11 +483,11 @@ ur_result_t AsanInterceptor::registerSpirKernels(ur_program_handle_t Program) {
assert((MetadataSize % sizeof(SpirKernelInfo) == 0) &&
"SpirKernelMetadata size is not correct");

ManagedQueue Queue(Context, Device);
ur_queue_handle_t InternalQueue = CI->getInternalQueue(Device);

std::vector<SpirKernelInfo> SKInfo(NumOfSpirKernel);
Result = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy(
Queue, true, &SKInfo[0], MetadataPtr,
InternalQueue, true, &SKInfo[0], MetadataPtr,
sizeof(SpirKernelInfo) * NumOfSpirKernel, 0, nullptr, nullptr);
if (Result != UR_RESULT_SUCCESS) {
UR_LOG_L(getContext()->logger, ERR, "Can't read the value of <{}>: {}",
Expand All @@ -504,7 +503,7 @@ ur_result_t AsanInterceptor::registerSpirKernels(ur_program_handle_t Program) {
}
std::vector<char> KernelNameV(SKI.Size);
Result = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy(
Queue, true, KernelNameV.data(), (void *)SKI.KernelName,
InternalQueue, true, KernelNameV.data(), (void *)SKI.KernelName,
sizeof(char) * SKI.Size, 0, nullptr, nullptr);
if (Result != UR_RESULT_SUCCESS) {
UR_LOG_L(getContext()->logger, ERR, "Can't read kernel name: {}",
Expand Down Expand Up @@ -537,7 +536,7 @@ AsanInterceptor::registerDeviceGlobals(ur_program_handle_t Program) {
assert(ProgramInfo != nullptr && "unregistered program!");

for (auto Device : Devices) {
ManagedQueue Queue(Context, Device);
ur_queue_handle_t InternalQueue = ContextInfo->getInternalQueue(Device);

size_t MetadataSize;
void *MetadataPtr;
Expand All @@ -554,7 +553,7 @@ AsanInterceptor::registerDeviceGlobals(ur_program_handle_t Program) {
"DeviceGlobal metadata size is not correct");
std::vector<DeviceGlobalInfo> GVInfos(NumOfDeviceGlobal);
Result = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy(
Queue, true, &GVInfos[0], MetadataPtr,
InternalQueue, true, &GVInfos[0], MetadataPtr,
sizeof(DeviceGlobalInfo) * NumOfDeviceGlobal, 0, nullptr, nullptr);
if (Result != UR_RESULT_SUCCESS) {
UR_LOG_L(getContext()->logger, ERR, "Device Global[{}] Read Failed: {}",
Expand Down Expand Up @@ -932,6 +931,8 @@ bool ProgramInfo::isKernelInstrumented(ur_kernel_handle_t Kernel) const {
ContextInfo::~ContextInfo() {
Stats.Print(Handle);

InternalQueueMap.clear();

[[maybe_unused]] ur_result_t URes;
if (USMPool) {
URes = getContext()->urDdiTable.USM.pfnPoolRelease(USMPool);
Expand Down Expand Up @@ -971,6 +972,13 @@ ur_usm_pool_handle_t ContextInfo::getUSMPool() {
return USMPool;
}

ur_queue_handle_t ContextInfo::getInternalQueue(ur_device_handle_t Device) {
std::scoped_lock<ur_shared_mutex> Guard(InternalQueueMapMutex);
if (!InternalQueueMap[Device])
InternalQueueMap[Device].emplace(Handle, Device);
return *InternalQueueMap[Device];
}

AsanRuntimeDataWrapper::~AsanRuntimeDataWrapper() {
[[maybe_unused]] ur_result_t Result;
if (Host.LocalArgs) {
Expand Down
7 changes: 7 additions & 0 deletions source/loader/layers/sanitizer/asan/asan_interceptor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "asan_statistics.hpp"
#include "sanitizer_common/sanitizer_common.hpp"
#include "sanitizer_common/sanitizer_options.hpp"
#include "sanitizer_common/sanitizer_utils.hpp"
#include "ur_sanitizer_layer.hpp"

#include <memory>
Expand Down Expand Up @@ -143,6 +144,10 @@ struct ContextInfo {
std::vector<ur_device_handle_t> DeviceList;
std::unordered_map<ur_device_handle_t, AllocInfoList> AllocInfosMap;

ur_shared_mutex InternalQueueMapMutex;
std::unordered_map<ur_device_handle_t, std::optional<ManagedQueue>>
InternalQueueMap;

AsanStatsWrapper Stats;

explicit ContextInfo(ur_context_handle_t Context) : Handle(Context) {
Expand All @@ -163,6 +168,8 @@ struct ContextInfo {
}

ur_usm_pool_handle_t getUSMPool();

ur_queue_handle_t getInternalQueue(ur_device_handle_t);
};

struct AsanRuntimeDataWrapper {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,11 @@ ur_usm_type_t GetUSMType(ur_context_handle_t Context, const void *MemPtr) {
} // namespace

ManagedQueue::ManagedQueue(ur_context_handle_t Context,
ur_device_handle_t Device) {
ur_device_handle_t Device, bool IsOutOfOrder) {
ur_queue_properties_t Prop{UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, nullptr,
UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE};
[[maybe_unused]] auto Result = getContext()->urDdiTable.Queue.pfnCreate(
Context, Device, nullptr, &Handle);
Context, Device, IsOutOfOrder ? &Prop : nullptr, &Handle);
assert(Result == UR_RESULT_SUCCESS && "Failed to create ManagedQueue");
UR_LOG_L(getContext()->logger, DEBUG, ">>> ManagedQueue {}", (void *)Handle);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
namespace ur_sanitizer_layer {

struct ManagedQueue {
ManagedQueue(ur_context_handle_t Context, ur_device_handle_t Device);
ManagedQueue(ur_context_handle_t Context, ur_device_handle_t Device,
bool IsOutOfOrder = false);
~ManagedQueue();

// Disable copy semantics
Expand Down
Loading