diff --git a/source/loader/layers/sanitizer/asan/asan_ddi.cpp b/source/loader/layers/sanitizer/asan/asan_ddi.cpp index 774ce3a61d..8f9411a736 100644 --- a/source/loader/layers/sanitizer/asan/asan_ddi.cpp +++ b/source/loader/layers/sanitizer/asan/asan_ddi.cpp @@ -471,10 +471,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( numEventsInWaitList, phEventWaitList, phEvent); } - USMLaunchInfo LaunchInfo(GetContext(hKernel), GetDevice(hQueue), - pGlobalWorkSize, pLocalWorkSize, pGlobalWorkOffset, - workDim); - UR_CALL(LaunchInfo.initialize()); + LaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), + pGlobalWorkSize, pLocalWorkSize, pGlobalWorkOffset, + workDim); UR_CALL(getAsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo)); diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp index 271d846990..708b73ec85 100644 --- a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp @@ -249,15 +249,11 @@ ur_result_t AsanInterceptor::releaseMemory(ur_context_handle_t Context, ur_result_t AsanInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, - USMLaunchInfo &LaunchInfo) { + LaunchInfo &LaunchInfo) { auto Context = GetContext(Queue); auto Device = GetDevice(Queue); auto ContextInfo = getContextInfo(Context); auto DeviceInfo = getDeviceInfo(Device); - auto KernelInfo = getKernelInfo(Kernel); - assert(KernelInfo && "Kernel should be instrumented"); - - UR_CALL(LaunchInfo.updateKernelInfo(*KernelInfo.get())); ManagedQueue InternalQueue(Context, Device); if (!InternalQueue) { @@ -275,12 +271,14 @@ ur_result_t AsanInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, ur_result_t AsanInterceptor::postLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, - USMLaunchInfo &LaunchInfo) { + LaunchInfo &LaunchInfo) { // FIXME: We must use block operation here, until we support urEventSetCallback auto Result = getContext()->urDdiTable.Queue.pfnFinish(Queue); + UR_CALL(LaunchInfo.Data.syncFromDevice(Queue)); + if (Result == UR_RESULT_SUCCESS) { - for (const auto &Report : LaunchInfo.Data->Report) { + for (const auto &Report : LaunchInfo.Data.Host.Report) { if (!Report.Flag) { continue; } @@ -688,7 +686,7 @@ AsanInterceptor::getMemBuffer(ur_mem_handle_t MemHandle) { ur_result_t AsanInterceptor::prepareLaunch( std::shared_ptr &ContextInfo, std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue, - ur_kernel_handle_t Kernel, USMLaunchInfo &LaunchInfo) { + ur_kernel_handle_t Kernel, LaunchInfo &LaunchInfo) { do { auto KernelInfo = getKernelInfo(Kernel); @@ -724,15 +722,13 @@ ur_result_t AsanInterceptor::prepareLaunch( } } - // Set launch info argument auto ArgNums = GetKernelNumArgs(Kernel); + // We must prepare all kernel args before call + // urKernelGetSuggestedLocalWorkSize, otherwise the call will fail on + // CPU device. if (ArgNums) { - getContext()->logger.debug( - "launch_info {} (numLocalArgs={}, localArgs={})", - (void *)LaunchInfo.Data, LaunchInfo.Data->NumLocalArgs, - (void *)LaunchInfo.Data->LocalArgs); ur_result_t URes = getContext()->urDdiTable.Kernel.pfnSetArgPointer( - Kernel, ArgNums - 1, nullptr, LaunchInfo.Data); + Kernel, ArgNums - 1, nullptr, LaunchInfo.Data.getDevicePtr()); if (URes != UR_RESULT_SUCCESS) { getContext()->logger.error("Failed to set launch info: {}", URes); @@ -740,11 +736,6 @@ ur_result_t AsanInterceptor::prepareLaunch( } } - LaunchInfo.Data->GlobalShadowOffset = DeviceInfo->Shadow->ShadowBegin; - LaunchInfo.Data->GlobalShadowOffsetEnd = DeviceInfo->Shadow->ShadowEnd; - LaunchInfo.Data->DeviceTy = DeviceInfo->Type; - LaunchInfo.Data->Debug = getOptions().Debug ? 1 : 0; - if (LaunchInfo.LocalWorkSize.empty()) { LaunchInfo.LocalWorkSize.resize(LaunchInfo.WorkDim); auto URes = @@ -771,6 +762,14 @@ ur_result_t AsanInterceptor::prepareLaunch( LocalWorkSize[Dim]; } + // Prepare asan runtime data + LaunchInfo.Data.Host.GlobalShadowOffset = + DeviceInfo->Shadow->ShadowBegin; + LaunchInfo.Data.Host.GlobalShadowOffsetEnd = + DeviceInfo->Shadow->ShadowEnd; + LaunchInfo.Data.Host.DeviceTy = DeviceInfo->Type; + LaunchInfo.Data.Host.Debug = getOptions().Debug ? 1 : 0; + auto EnqueueAllocateShadowMemory = [Context = ContextInfo->Handle, Device = DeviceInfo->Handle, Queue](size_t Size, uptr &Ptr) { @@ -819,7 +818,7 @@ ur_result_t AsanInterceptor::prepareLaunch( if (EnqueueAllocateShadowMemory( LocalShadowMemorySize, - LaunchInfo.Data->LocalShadowOffset) != + LaunchInfo.Data.Host.LocalShadowOffset) != UR_RESULT_SUCCESS) { getContext()->logger.warning( "Failed to allocate shadow memory for local " @@ -830,8 +829,8 @@ ur_result_t AsanInterceptor::prepareLaunch( "Skip checking local memory of kernel <{}>", GetKernelName(Kernel)); } else { - LaunchInfo.Data->LocalShadowOffsetEnd = - LaunchInfo.Data->LocalShadowOffset + + LaunchInfo.Data.Host.LocalShadowOffsetEnd = + LaunchInfo.Data.Host.LocalShadowOffset + LocalShadowMemorySize - 1; ContextInfo->Stats.UpdateShadowMalloced( @@ -839,8 +838,8 @@ ur_result_t AsanInterceptor::prepareLaunch( getContext()->logger.info( "ShadowMemory(Local, {} - {})", - (void *)LaunchInfo.Data->LocalShadowOffset, - (void *)LaunchInfo.Data->LocalShadowOffsetEnd); + (void *)LaunchInfo.Data.Host.LocalShadowOffset, + (void *)LaunchInfo.Data.Host.LocalShadowOffsetEnd); } } } @@ -848,7 +847,7 @@ ur_result_t AsanInterceptor::prepareLaunch( // Write shadow memory offset for private memory if (getOptions().DetectPrivates) { if (DeviceInfo->Type == DeviceType::CPU) { - LaunchInfo.Data->PrivateShadowOffset = + LaunchInfo.Data.Host.PrivateShadowOffset = DeviceInfo->Shadow->ShadowBegin; } else if (DeviceInfo->Type == DeviceType::GPU_PVC || DeviceInfo->Type == DeviceType::GPU_DG2) { @@ -861,7 +860,7 @@ ur_result_t AsanInterceptor::prepareLaunch( if (EnqueueAllocateShadowMemory( PrivateShadowMemorySize, - LaunchInfo.Data->PrivateShadowOffset) != + LaunchInfo.Data.Host.PrivateShadowOffset) != UR_RESULT_SUCCESS) { getContext()->logger.warning( "Failed to allocate shadow memory for private " @@ -872,8 +871,8 @@ ur_result_t AsanInterceptor::prepareLaunch( "Skip checking private memory of kernel <{}>", GetKernelName(Kernel)); } else { - LaunchInfo.Data->PrivateShadowOffsetEnd = - LaunchInfo.Data->PrivateShadowOffset + + LaunchInfo.Data.Host.PrivateShadowOffsetEnd = + LaunchInfo.Data.Host.PrivateShadowOffset + PrivateShadowMemorySize - 1; ContextInfo->Stats.UpdateShadowMalloced( @@ -881,11 +880,32 @@ ur_result_t AsanInterceptor::prepareLaunch( getContext()->logger.info( "ShadowMemory(Private, {} - {})", - (void *)LaunchInfo.Data->PrivateShadowOffset, - (void *)LaunchInfo.Data->PrivateShadowOffsetEnd); + (void *)LaunchInfo.Data.Host.PrivateShadowOffset, + (void *)LaunchInfo.Data.Host.PrivateShadowOffsetEnd); } } } + + // Write local arguments info + if (!KernelInfo->LocalArgs.empty()) { + std::vector LocalArgsInfo; + for (auto [ArgIndex, ArgInfo] : KernelInfo->LocalArgs) { + LocalArgsInfo.push_back(ArgInfo); + getContext()->logger.debug( + "local_args (argIndex={}, size={}, sizeWithRZ={})", + ArgIndex, ArgInfo.Size, ArgInfo.SizeWithRedZone); + } + UR_CALL(LaunchInfo.Data.importLocalArgsInfo(Queue, LocalArgsInfo)); + } + + // sync asan runtime data to device side + UR_CALL(LaunchInfo.Data.syncToDevice(Queue)); + + getContext()->logger.debug( + "launch_info {} (numLocalArgs={}, localArgs={})", + (void *)LaunchInfo.Data.getDevicePtr(), + LaunchInfo.Data.Host.NumLocalArgs, + (void *)LaunchInfo.Data.Host.LocalArgs); } while (false); return UR_RESULT_SUCCESS; @@ -945,63 +965,39 @@ ContextInfo::~ContextInfo() { } } -ur_result_t USMLaunchInfo::initialize() { - UR_CALL(getContext()->urDdiTable.Context.pfnRetain(Context)); - UR_CALL(getContext()->urDdiTable.Device.pfnRetain(Device)); - UR_CALL(getContext()->urDdiTable.USM.pfnSharedAlloc( - Context, Device, nullptr, nullptr, sizeof(LaunchInfo), (void **)&Data)); - *Data = LaunchInfo{}; - return UR_RESULT_SUCCESS; -} - -ur_result_t USMLaunchInfo::updateKernelInfo(const KernelInfo &KI) { - auto NumArgs = KI.LocalArgs.size(); - if (NumArgs) { - Data->NumLocalArgs = NumArgs; - UR_CALL(getContext()->urDdiTable.USM.pfnSharedAlloc( - Context, Device, nullptr, nullptr, sizeof(LocalArgsInfo) * NumArgs, - (void **)&Data->LocalArgs)); - uint32_t i = 0; - for (auto [ArgIndex, ArgInfo] : KI.LocalArgs) { - Data->LocalArgs[i++] = ArgInfo; - getContext()->logger.debug( - "local_args (argIndex={}, size={}, sizeWithRZ={})", ArgIndex, - ArgInfo.Size, ArgInfo.SizeWithRedZone); - } - } - return UR_RESULT_SUCCESS; -} - -USMLaunchInfo::~USMLaunchInfo() { +AsanRuntimeDataWrapper::~AsanRuntimeDataWrapper() { [[maybe_unused]] ur_result_t Result; - if (Data) { - auto Type = GetDeviceType(Context, Device); - auto ContextInfo = getAsanInterceptor()->getContextInfo(Context); - if (Type == DeviceType::GPU_PVC || Type == DeviceType::GPU_DG2) { - if (Data->PrivateShadowOffset) { - ContextInfo->Stats.UpdateShadowFreed( - Data->PrivateShadowOffsetEnd - Data->PrivateShadowOffset + - 1); - Result = getContext()->urDdiTable.USM.pfnFree( - Context, (void *)Data->PrivateShadowOffset); - assert(Result == UR_RESULT_SUCCESS); - } - if (Data->LocalShadowOffset) { - ContextInfo->Stats.UpdateShadowFreed( - Data->LocalShadowOffsetEnd - Data->LocalShadowOffset + 1); - Result = getContext()->urDdiTable.USM.pfnFree( - Context, (void *)Data->LocalShadowOffset); - assert(Result == UR_RESULT_SUCCESS); - } + auto Type = GetDeviceType(Context, Device); + auto ContextInfo = getAsanInterceptor()->getContextInfo(Context); + if (Type == DeviceType::GPU_PVC || Type == DeviceType::GPU_DG2) { + if (Host.PrivateShadowOffset) { + ContextInfo->Stats.UpdateShadowFreed(Host.PrivateShadowOffsetEnd - + Host.PrivateShadowOffset + 1); + Result = getContext()->urDdiTable.USM.pfnFree( + Context, (void *)Host.PrivateShadowOffset); + assert(Result == UR_RESULT_SUCCESS); } - if (Data->LocalArgs) { + if (Host.LocalShadowOffset) { + ContextInfo->Stats.UpdateShadowFreed(Host.LocalShadowOffsetEnd - + Host.LocalShadowOffset + 1); Result = getContext()->urDdiTable.USM.pfnFree( - Context, (void *)Data->LocalArgs); + Context, (void *)Host.LocalShadowOffset); assert(Result == UR_RESULT_SUCCESS); } - Result = getContext()->urDdiTable.USM.pfnFree(Context, (void *)Data); + } + if (Host.LocalArgs) { + Result = getContext()->urDdiTable.USM.pfnFree(Context, + (void *)Host.LocalArgs); + assert(Result == UR_RESULT_SUCCESS); + } + if (DevicePtr) { + Result = getContext()->urDdiTable.USM.pfnFree(Context, DevicePtr); assert(Result == UR_RESULT_SUCCESS); } +} + +LaunchInfo::~LaunchInfo() { + [[maybe_unused]] ur_result_t Result; Result = getContext()->urDdiTable.Context.pfnRelease(Context); assert(Result == UR_RESULT_SUCCESS); Result = getContext()->urDdiTable.Device.pfnRelease(Device); diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.hpp b/source/loader/layers/sanitizer/asan/asan_interceptor.hpp index 926be1388e..12dd230436 100644 --- a/source/loader/layers/sanitizer/asan/asan_interceptor.hpp +++ b/source/loader/layers/sanitizer/asan/asan_interceptor.hpp @@ -157,9 +157,72 @@ struct ContextInfo { } }; -struct USMLaunchInfo { - LaunchInfo *Data = nullptr; +struct AsanRuntimeDataWrapper { + AsanRuntimeData Host{}; + AsanRuntimeData *DevicePtr = nullptr; + + ur_context_handle_t Context{}; + + ur_device_handle_t Device{}; + + AsanRuntimeDataWrapper(ur_context_handle_t Context, + ur_device_handle_t Device) + : Context(Context), Device(Device) {} + + ~AsanRuntimeDataWrapper(); + + AsanRuntimeData *getDevicePtr() { + if (DevicePtr == nullptr) { + ur_result_t Result = getContext()->urDdiTable.USM.pfnDeviceAlloc( + Context, Device, nullptr, nullptr, sizeof(AsanRuntimeData), + (void **)&DevicePtr); + if (Result != UR_RESULT_SUCCESS) { + getContext()->logger.error( + "Failed to alloc device usm for asan runtime data: {}", + Result); + } + } + return DevicePtr; + } + + ur_result_t syncFromDevice(ur_queue_handle_t Queue) { + UR_CALL(getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( + Queue, true, ur_cast(&Host), getDevicePtr(), + sizeof(AsanRuntimeData), 0, nullptr, nullptr)); + + return UR_RESULT_SUCCESS; + } + + ur_result_t syncToDevice(ur_queue_handle_t Queue) { + UR_CALL(getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( + Queue, true, getDevicePtr(), ur_cast(&Host), + sizeof(AsanRuntimeData), 0, nullptr, nullptr)); + + return UR_RESULT_SUCCESS; + } + + ur_result_t + importLocalArgsInfo(ur_queue_handle_t Queue, + const std::vector &LocalArgs) { + assert(!LocalArgs.empty()); + + Host.NumLocalArgs = LocalArgs.size(); + const size_t LocalArgsInfoSize = + sizeof(LocalArgsInfo) * Host.NumLocalArgs; + UR_CALL(getContext()->urDdiTable.USM.pfnDeviceAlloc( + Context, Device, nullptr, nullptr, LocalArgsInfoSize, + ur_cast(&Host.LocalArgs))); + + UR_CALL(getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( + Queue, true, Host.LocalArgs, &LocalArgs[0], LocalArgsInfoSize, 0, + nullptr, nullptr)); + + return UR_RESULT_SUCCESS; + } +}; + +struct LaunchInfo { ur_context_handle_t Context = nullptr; ur_device_handle_t Device = nullptr; const size_t *GlobalWorkSize = nullptr; @@ -167,20 +230,25 @@ struct USMLaunchInfo { std::vector LocalWorkSize; uint32_t WorkDim = 0; - USMLaunchInfo(ur_context_handle_t Context, ur_device_handle_t Device, - const size_t *GlobalWorkSize, const size_t *LocalWorkSize, - const size_t *GlobalWorkOffset, uint32_t WorkDim) + AsanRuntimeDataWrapper Data; + + LaunchInfo(ur_context_handle_t Context, ur_device_handle_t Device, + const size_t *GlobalWorkSize, const size_t *LocalWorkSize, + const size_t *GlobalWorkOffset, uint32_t WorkDim) : Context(Context), Device(Device), GlobalWorkSize(GlobalWorkSize), - GlobalWorkOffset(GlobalWorkOffset), WorkDim(WorkDim) { + GlobalWorkOffset(GlobalWorkOffset), WorkDim(WorkDim), + Data(Context, Device) { if (LocalWorkSize) { this->LocalWorkSize = std::vector(LocalWorkSize, LocalWorkSize + WorkDim); } + [[maybe_unused]] auto Result = + getContext()->urDdiTable.Context.pfnRetain(Context); + assert(Result == UR_RESULT_SUCCESS); + Result = getContext()->urDdiTable.Device.pfnRetain(Device); + assert(Result == UR_RESULT_SUCCESS); } - ~USMLaunchInfo(); - - ur_result_t initialize(); - ur_result_t updateKernelInfo(const KernelInfo &KI); + ~LaunchInfo(); }; struct DeviceGlobalInfo { @@ -213,11 +281,11 @@ class AsanInterceptor { ur_result_t preLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, - USMLaunchInfo &LaunchInfo); + LaunchInfo &LaunchInfo); ur_result_t postLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, - USMLaunchInfo &LaunchInfo); + LaunchInfo &LaunchInfo); ur_result_t insertContext(ur_context_handle_t Context, std::shared_ptr &CI); @@ -301,7 +369,7 @@ class AsanInterceptor { std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue, ur_kernel_handle_t Kernel, - USMLaunchInfo &LaunchInfo); + LaunchInfo &LaunchInfo); ur_result_t allocShadowMemory(ur_context_handle_t Context, std::shared_ptr &DeviceInfo); diff --git a/source/loader/layers/sanitizer/asan/asan_libdevice.hpp b/source/loader/layers/sanitizer/asan/asan_libdevice.hpp index 53fa5e68ba..a2d5ecd6be 100644 --- a/source/loader/layers/sanitizer/asan/asan_libdevice.hpp +++ b/source/loader/layers/sanitizer/asan/asan_libdevice.hpp @@ -50,10 +50,9 @@ struct LocalArgsInfo { constexpr uint64_t ASAN_MAX_NUM_REPORTS = 10; -struct LaunchInfo { +struct AsanRuntimeData { uintptr_t GlobalShadowOffset = 0; uintptr_t GlobalShadowOffsetEnd = 0; - uintptr_t PrivateShadowOffset = 0; uintptr_t PrivateShadowOffsetEnd = 0;