diff --git a/.github/intel-llvm-mirror-base-commit b/.github/intel-llvm-mirror-base-commit
index fdb4d85658..92a6c05337 100644
--- a/.github/intel-llvm-mirror-base-commit
+++ b/.github/intel-llvm-mirror-base-commit
@@ -1 +1 @@
-3e95c0c70850b8b668116d9a491d25dd969c6329
+63c70a1425d2c91fa54ec6495aae8ecfa7a5a10c
diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp
index 32c3ea19b2..eb19063999 100644
--- a/source/adapters/level_zero/device.cpp
+++ b/source/adapters/level_zero/device.cpp
@@ -1139,9 +1139,31 @@ ur_result_t urDeviceGetInfo(
     return ReturnValue(Device->Platform->ZeBindlessImagesExtensionSupported &&
                        Device->ZeDeviceImageProperties->maxImageDims2D > 0);
   }
+  case UR_DEVICE_INFO_MAX_IMAGE_LINEAR_WIDTH_EXP: {
+    ze_device_image_properties_t imageProps = {};
+    imageProps.stype = ZE_STRUCTURE_TYPE_DEVICE_IMAGE_PROPERTIES;
+    ze_device_pitched_alloc_exp_properties_t imageAllocProps = {};
+    imageAllocProps.stype =
+        ZE_STRUCTURE_TYPE_PITCHED_ALLOC_DEVICE_EXP_PROPERTIES;
+    imageProps.pNext = (void *)&imageAllocProps;
+
+    ZE_CALL_NOCHECK(zeDeviceGetImageProperties, (ZeDevice, &imageProps));
+
+    return ReturnValue(imageAllocProps.maxImageLinearWidth);
+  }
+  case UR_DEVICE_INFO_MAX_IMAGE_LINEAR_HEIGHT_EXP: {
+    ze_device_image_properties_t imageProps = {};
+    imageProps.stype = ZE_STRUCTURE_TYPE_DEVICE_IMAGE_PROPERTIES;
+    ze_device_pitched_alloc_exp_properties_t imageAllocProps = {};
+    imageAllocProps.stype =
+        ZE_STRUCTURE_TYPE_PITCHED_ALLOC_DEVICE_EXP_PROPERTIES;
+    imageProps.pNext = (void *)&imageAllocProps;
+
+    ZE_CALL_NOCHECK(zeDeviceGetImageProperties, (ZeDevice, &imageProps));
+
+    return ReturnValue(imageAllocProps.maxImageLinearHeight);
+  }
   case UR_DEVICE_INFO_IMAGE_PITCH_ALIGN_EXP:
-  case UR_DEVICE_INFO_MAX_IMAGE_LINEAR_WIDTH_EXP:
-  case UR_DEVICE_INFO_MAX_IMAGE_LINEAR_HEIGHT_EXP:
   case UR_DEVICE_INFO_MAX_IMAGE_LINEAR_PITCH_EXP:
     UR_LOG(ERR, "Unsupported ParamName in urGetDeviceInfo");
     UR_LOG(ERR, "ParamName=%{}(0x{})", ParamName, logger::toHex(ParamName));
diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp
index d4203ab32a..5fecdc5b8f 100644
--- a/source/adapters/native_cpu/enqueue.cpp
+++ b/source/adapters/native_cpu/enqueue.cpp
@@ -52,17 +52,6 @@ struct NDRDescT {
 };
 } // namespace native_cpu
 
-#ifdef NATIVECPU_USE_OCK
-static native_cpu::state getResizedState(const native_cpu::NDRDescT &ndr,
-                                         size_t itemsPerThread) {
-  native_cpu::state resized_state(
-      ndr.GlobalSize[0], ndr.GlobalSize[1], ndr.GlobalSize[2], itemsPerThread,
-      ndr.LocalSize[1], ndr.LocalSize[2], ndr.GlobalOffset[0],
-      ndr.GlobalOffset[1], ndr.GlobalOffset[2]);
-  return resized_state;
-}
-#endif
-
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
     const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
@@ -112,6 +101,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   // TODO: add proper error checking
   native_cpu::NDRDescT ndr(workDim, pGlobalWorkOffset, pGlobalWorkSize,
                            pLocalWorkSize);
+  unsigned long long numWI;
+  auto umulll_overflow = [](unsigned long long a, unsigned long long b,
+                            unsigned long long *c) -> bool {
+#ifdef __GNUC__
+    return __builtin_umulll_overflow(a, b, c);
+#else
+    *c = a * b;
+    return a != 0 && b != *c / a;
+#endif
+  };
+  if (umulll_overflow(ndr.GlobalSize[0], ndr.GlobalSize[1], &numWI) ||
+      umulll_overflow(numWI, ndr.GlobalSize[2], &numWI) || numWI > SIZE_MAX) {
+    return UR_RESULT_ERROR_OUT_OF_RESOURCES;
+  }
+
   auto &tp = hQueue->getDevice()->tp;
   const size_t numParallelThreads = tp.num_threads();
   std::vector<std::future<void>> futures;
@@ -130,131 +134,56 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   auto kernel = std::make_unique<ur_kernel_handle_t_>(*hKernel);
   kernel->updateMemPool(numParallelThreads);
 
+  const size_t numWG = numWG0 * numWG1 * numWG2;
+  const size_t numWGPerThread = numWG / numParallelThreads;
+  const size_t remainderWG = numWG - numWGPerThread * numParallelThreads;
+  // The fourth value is the linearized value.
+  std::array<size_t, 4> rangeStart = {0, 0, 0, 0};
+  for (unsigned t = 0; t < numParallelThreads; ++t) {
+    auto rangeEnd = rangeStart;
+    rangeEnd[3] += numWGPerThread + (t < remainderWG);
+    if (rangeEnd[3] == rangeStart[3])
+      break;
+    rangeEnd[0] = rangeEnd[3] % numWG0;
+    rangeEnd[1] = (rangeEnd[3] / numWG0) % numWG1;
+    rangeEnd[2] = rangeEnd[3] / (numWG0 * numWG1);
+    futures.emplace_back(
+        tp.schedule_task([state, &kernel = *kernel, rangeStart,
+                          rangeEnd = rangeEnd[3], numWG0, numWG1,
 #ifndef NATIVECPU_USE_OCK
-  for (unsigned g2 = 0; g2 < numWG2; g2++) {
-    for (unsigned g1 = 0; g1 < numWG1; g1++) {
-      for (unsigned g0 = 0; g0 < numWG0; g0++) {
-        for (unsigned local2 = 0; local2 < ndr.LocalSize[2]; local2++) {
-          for (unsigned local1 = 0; local1 < ndr.LocalSize[1]; local1++) {
-            for (unsigned local0 = 0; local0 < ndr.LocalSize[0]; local0++) {
-              state.update(g0, g1, g2, local0, local1, local2);
-              kernel->_subhandler(kernel->getArgs(1, 0).data(), &state);
-            }
-          }
-        }
-      }
-    }
-  }
+                          localSize = ndr.LocalSize,
+#endif
+                          numParallelThreads](size_t threadId) mutable {
+          for (size_t g0 = rangeStart[0], g1 = rangeStart[1],
+                      g2 = rangeStart[2], g3 = rangeStart[3];
+               g3 < rangeEnd; ++g3) {
+#ifdef NATIVECPU_USE_OCK
+            state.update(g0, g1, g2);
+            kernel._subhandler(
+                kernel.getArgs(numParallelThreads, threadId).data(), &state);
 #else
-  bool isLocalSizeOne =
-      ndr.LocalSize[0] == 1 && ndr.LocalSize[1] == 1 && ndr.LocalSize[2] == 1;
-  if (isLocalSizeOne && ndr.GlobalSize[0] > numParallelThreads &&
-      !kernel->hasLocalArgs()) {
-    // If the local size is one, we make the assumption that we are running a
-    // parallel_for over a sycl::range.
-    // Todo: we could add more compiler checks and
-    // kernel properties for this (e.g. check that no barriers are called).
-
-    // Todo: this assumes that dim 0 is the best dimension over which we want to
-    // parallelize
-
-    // Since we also vectorize the kernel, and vectorization happens within the
-    // work group loop, it's better to have a large-ish local size. We can
-    // divide the global range by the number of threads, set that as the local
-    // size and peel everything else.
-
-    size_t new_num_work_groups_0 = numParallelThreads;
-    size_t itemsPerThread = ndr.GlobalSize[0] / numParallelThreads;
-
-    for (unsigned g2 = 0; g2 < numWG2; g2++) {
-      for (unsigned g1 = 0; g1 < numWG1; g1++) {
-        for (unsigned g0 = 0; g0 < new_num_work_groups_0; g0 += 1) {
-          futures.emplace_back(tp.schedule_task(
-              [ndr, itemsPerThread, &kernel = *kernel, g0, g1, g2](size_t) {
-                native_cpu::state resized_state =
-                    getResizedState(ndr, itemsPerThread);
-                resized_state.update(g0, g1, g2);
-                kernel._subhandler(kernel.getArgs().data(), &resized_state);
-              }));
-        }
-        // Peel the remaining work items. Since the local size is 1, we iterate
-        // over the work groups.
-        for (unsigned g0 = new_num_work_groups_0 * itemsPerThread; g0 < numWG0;
-             g0++) {
-          state.update(g0, g1, g2);
-          kernel->_subhandler(kernel->getArgs().data(), &state);
-        }
-      }
-    }
-
-  } else {
-    // We are running a parallel_for over an nd_range
-
-    if (numWG1 * numWG2 >= numParallelThreads) {
-      // Dimensions 1 and 2 have enough work, split them across the threadpool
-      for (unsigned g2 = 0; g2 < numWG2; g2++) {
-        for (unsigned g1 = 0; g1 < numWG1; g1++) {
-          futures.emplace_back(
-              tp.schedule_task([state, &kernel = *kernel, numWG0, g1, g2,
-                                numParallelThreads](size_t threadId) mutable {
-                for (unsigned g0 = 0; g0 < numWG0; g0++) {
-                  state.update(g0, g1, g2);
+            for (size_t local2 = 0; local2 < localSize[2]; ++local2) {
+              for (size_t local1 = 0; local1 < localSize[1]; ++local1) {
+                for (size_t local0 = 0; local0 < localSize[0]; ++local0) {
+                  state.update(g0, g1, g2, local0, local1, local2);
                   kernel._subhandler(
                       kernel.getArgs(numParallelThreads, threadId).data(),
                       &state);
                 }
-              }));
-        }
-      }
-    } else {
-      // Split dimension 0 across the threadpool
-      // Here we try to create groups of workgroups in order to reduce
-      // synchronization overhead
-      for (unsigned g2 = 0; g2 < numWG2; g2++) {
-        for (unsigned g1 = 0; g1 < numWG1; g1++) {
-          for (unsigned g0 = 0; g0 < numWG0; g0++) {
-            groups.push_back([state, g0, g1, g2, numParallelThreads](
-                                 size_t threadId,
-                                 ur_kernel_handle_t_ &kernel) mutable {
-              state.update(g0, g1, g2);
-              kernel._subhandler(
-                  kernel.getArgs(numParallelThreads, threadId).data(), &state);
-            });
-          }
-        }
-      }
-      auto numGroups = groups.size();
-      auto groupsPerThread = numGroups / numParallelThreads;
-      if (groupsPerThread) {
-        for (unsigned thread = 0; thread < numParallelThreads; thread++) {
-          futures.emplace_back(
-              tp.schedule_task([groups, thread, groupsPerThread,
-                                &kernel = *kernel](size_t threadId) {
-                for (unsigned i = 0; i < groupsPerThread; i++) {
-                  auto index = thread * groupsPerThread + i;
-                  groups[index](threadId, kernel);
-                }
-              }));
-        }
-      }
-
-      // schedule the remaining tasks
-      auto remainder = numGroups % numParallelThreads;
-      if (remainder) {
-        futures.emplace_back(
-            tp.schedule_task([groups, remainder,
-                              scheduled = numParallelThreads * groupsPerThread,
-                              &kernel = *kernel](size_t threadId) {
-              for (unsigned i = 0; i < remainder; i++) {
-                auto index = scheduled + i;
-                groups[index](threadId, kernel);
               }
-            }));
-      }
-    }
+            }
+#endif
+            if (++g0 == numWG0) {
+              g0 = 0;
+              if (++g1 == numWG1) {
+                g1 = 0;
+                ++g2;
+              }
+            }
+          }
+        }));
+    rangeStart = rangeEnd;
   }
-
-#endif // NATIVECPU_USE_OCK
 
   event->set_futures(futures);
 
   if (phEvent) {
diff --git a/source/adapters/offload/enqueue.cpp b/source/adapters/offload/enqueue.cpp
index 62b91f82af..d35d2bd40e 100644
--- a/source/adapters/offload/enqueue.cpp
+++ b/source/adapters/offload/enqueue.cpp
@@ -74,7 +74,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
       hKernel->Args.getStorageSize(), &LaunchArgs, &EventOut));
 
   if (phEvent) {
-    auto *Event = new ur_event_handle_t_();
+    auto *Event = new ur_event_handle_t_(UR_COMMAND_KERNEL_LAUNCH, hQueue);
     Event->OffloadEvent = EventOut;
     *phEvent = Event;
   }
@@ -94,10 +94,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D(
 }
 
 namespace {
-ur_result_t doMemcpy(ur_queue_handle_t hQueue, void *DestPtr,
-                     ol_device_handle_t DestDevice, const void *SrcPtr,
-                     ol_device_handle_t SrcDevice, size_t size, bool blocking,
-                     uint32_t numEventsInWaitList,
+ur_result_t doMemcpy(ur_command_t Command, ur_queue_handle_t hQueue,
+                     void *DestPtr, ol_device_handle_t DestDevice,
+                     const void *SrcPtr, ol_device_handle_t SrcDevice,
+                     size_t size, bool blocking, uint32_t numEventsInWaitList,
                      const ur_event_handle_t *phEventWaitList,
                      ur_event_handle_t *phEvent) {
   // Ignore wait list for now
@@ -111,11 +111,11 @@ ur_result_t doMemcpy(ur_queue_handle_t hQueue, void *DestPtr,
                            SrcDevice, size, phEvent ? &EventOut : nullptr));
 
   if (blocking) {
-    OL_RETURN_ON_ERR(olWaitQueue(hQueue->OffloadQueue));
+    OL_RETURN_ON_ERR(olSyncQueue(hQueue->OffloadQueue));
   }
 
   if (phEvent) {
-    auto *Event = new ur_event_handle_t_();
+    auto *Event = new ur_event_handle_t_(Command, hQueue);
     Event->OffloadEvent = EventOut;
     *phEvent = Event;
   }
@@ -131,8 +131,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead(
   char *DevPtr =
      reinterpret_cast<char *>(std::get<BufferMem>(hBuffer->Mem).Ptr);
 
-  return doMemcpy(hQueue, pDst, Adapter->HostDevice, DevPtr + offset,
-                  hQueue->OffloadDevice, size, blockingRead,
+  return doMemcpy(UR_COMMAND_MEM_BUFFER_READ, hQueue, pDst, Adapter->HostDevice,
+                  DevPtr + offset, hQueue->OffloadDevice, size, blockingRead,
                   numEventsInWaitList, phEventWaitList, phEvent);
 }
 
@@ -143,9 +143,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite(
   char *DevPtr =
      reinterpret_cast<char *>(std::get<BufferMem>(hBuffer->Mem).Ptr);
 
-  return doMemcpy(hQueue, DevPtr + offset, hQueue->OffloadDevice, pSrc,
-                  Adapter->HostDevice, size, blockingWrite, numEventsInWaitList,
-                  phEventWaitList, phEvent);
+  return doMemcpy(UR_COMMAND_MEM_BUFFER_WRITE, hQueue, DevPtr + offset,
+                  hQueue->OffloadDevice, pSrc, Adapter->HostDevice, size,
+                  blockingWrite, numEventsInWaitList, phEventWaitList, phEvent);
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead(
@@ -159,10 +159,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead(
     return Err;
   }
 
-  return doMemcpy(hQueue, pDst, Adapter->HostDevice,
-                  reinterpret_cast<const char *>(Ptr) + offset,
-                  hQueue->OffloadDevice, count, blockingRead,
-                  numEventsInWaitList, phEventWaitList, phEvent);
+  return doMemcpy(
+      UR_COMMAND_DEVICE_GLOBAL_VARIABLE_READ, hQueue, pDst, Adapter->HostDevice,
+      reinterpret_cast<const char *>(Ptr) + offset, hQueue->OffloadDevice,
+      count, blockingRead, numEventsInWaitList, phEventWaitList, phEvent);
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite(
@@ -176,18 +176,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite(
     return Err;
   }
 
-  return doMemcpy(hQueue, reinterpret_cast<char *>(Ptr) + offset,
-                  hQueue->OffloadDevice, pSrc, Adapter->HostDevice, count,
-                  blockingWrite, numEventsInWaitList, phEventWaitList, phEvent);
+  return doMemcpy(UR_COMMAND_DEVICE_GLOBAL_VARIABLE_WRITE, hQueue,
+                  reinterpret_cast<char *>(Ptr) + offset, hQueue->OffloadDevice,
+                  pSrc, Adapter->HostDevice, count, blockingWrite,
+                  numEventsInWaitList, phEventWaitList, phEvent);
 }
 
-ur_result_t enqueueNoOp(ur_queue_handle_t hQueue, ur_event_handle_t *phEvent) {
+ur_result_t enqueueNoOp(ur_command_t Type, ur_queue_handle_t hQueue,
+                        ur_event_handle_t *phEvent) {
   // This path is a no-op, but we can't output a real event because
   // Offload doesn't currently support creating arbitrary events, and we
   // don't know the last real event in the queue. Instead we just have to
   // wait on the whole queue and then return an empty (implicitly
   // finished) event.
-  *phEvent = ur_event_handle_t_::createEmptyEvent();
+  *phEvent = ur_event_handle_t_::createEmptyEvent(Type, hQueue);
   return urQueueFinish(hQueue);
 }
 
@@ -221,7 +223,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
     }
 
     if (phEvent) {
-      enqueueNoOp(hQueue, phEvent);
+      enqueueNoOp(UR_COMMAND_MEM_BUFFER_MAP, hQueue, phEvent);
     }
   }
   *ppRetMap = MapPtr;
@@ -255,7 +257,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
     }
 
     if (phEvent) {
-      enqueueNoOp(hQueue, phEvent);
+      enqueueNoOp(UR_COMMAND_MEM_UNMAP, hQueue, phEvent);
     }
   }
   BufferImpl.unmap(pMappedPtr);
diff --git a/source/adapters/offload/event.cpp b/source/adapters/offload/event.cpp
index b4bbce8034..aab41ed3d2 100644
--- a/source/adapters/offload/event.cpp
+++ b/source/adapters/offload/event.cpp
@@ -12,6 +12,7 @@
 #include 
 
 #include "event.hpp"
+#include "queue.hpp"
 #include "ur2offload.hpp"
 
 UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo(ur_event_handle_t hEvent,
@@ -22,6 +23,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo(ur_event_handle_t hEvent,
   UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);
 
   switch (propName) {
+  case UR_EVENT_INFO_CONTEXT:
+    return ReturnValue(hEvent->UrQueue->UrContext);
+  case UR_EVENT_INFO_COMMAND_QUEUE:
+    return ReturnValue(hEvent->UrQueue);
+  case UR_EVENT_INFO_COMMAND_TYPE:
+    return ReturnValue(hEvent->Type);
   case UR_EVENT_INFO_REFERENCE_COUNT:
     return ReturnValue(hEvent->RefCount.load());
   default:
@@ -43,7 +50,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventWait(uint32_t numEvents,
                                                 const ur_event_handle_t *phEventWaitList) {
   for (uint32_t i = 0; i < numEvents; i++) {
     if (phEventWaitList[i]->OffloadEvent) {
-      OL_RETURN_ON_ERR(olWaitEvent(phEventWaitList[i]->OffloadEvent));
+      OL_RETURN_ON_ERR(olSyncEvent(phEventWaitList[i]->OffloadEvent));
     }
   }
   return UR_RESULT_SUCCESS;
 }
@@ -61,9 +68,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) {
     if (Res) {
       return offloadResultToUR(Res);
     }
+    delete hEvent;
   }
-  delete hEvent;
 
   return UR_RESULT_SUCCESS;
 }
 
diff --git a/source/adapters/offload/event.hpp b/source/adapters/offload/event.hpp
index 642c61e532..42860e68be 100644
--- a/source/adapters/offload/event.hpp
+++ b/source/adapters/offload/event.hpp
@@ -18,9 +18,14 @@
 struct ur_event_handle_t_ : RefCounted {
   ol_event_handle_t OffloadEvent;
   ur_command_t Type;
+  ur_queue_handle_t UrQueue;
 
-  static ur_event_handle_t createEmptyEvent() {
-    auto *Event = new ur_event_handle_t_();
+  ur_event_handle_t_(ur_command_t Type, ur_queue_handle_t Queue)
+      : Type(Type), UrQueue(Queue) {}
+
+  static ur_event_handle_t createEmptyEvent(ur_command_t Type,
+                                            ur_queue_handle_t Queue) {
+    auto *Event = new ur_event_handle_t_(Type, Queue);
     // Null event represents an empty event. Waiting on it is a no-op.
     Event->OffloadEvent = nullptr;
diff --git a/source/adapters/offload/queue.cpp b/source/adapters/offload/queue.cpp
index 57a10fafa0..b2e28ddd70 100644
--- a/source/adapters/offload/queue.cpp
+++ b/source/adapters/offload/queue.cpp
@@ -17,9 +17,10 @@
 #include "queue.hpp"
 #include "ur2offload.hpp"
 
-UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate(
-    [[maybe_unused]] ur_context_handle_t hContext, ur_device_handle_t hDevice,
-    const ur_queue_properties_t *, ur_queue_handle_t *phQueue) {
+UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate(ur_context_handle_t hContext,
+                                                  ur_device_handle_t hDevice,
+                                                  const ur_queue_properties_t *,
+                                                  ur_queue_handle_t *phQueue) {
 
   assert(hContext->Device == hDevice);
 
@@ -31,6 +32,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate(
   }
 
   Queue->OffloadDevice = hDevice->OffloadDevice;
+  Queue->UrContext = hContext;
 
   *phQueue = Queue;
 
@@ -69,7 +71,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) {
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) {
-  return offloadResultToUR(olWaitQueue(hQueue->OffloadQueue));
+  return offloadResultToUR(olSyncQueue(hQueue->OffloadQueue));
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle(
diff --git a/source/adapters/offload/queue.hpp b/source/adapters/offload/queue.hpp
index 6afe4bf150..e9c642f528 100644
--- a/source/adapters/offload/queue.hpp
+++ b/source/adapters/offload/queue.hpp
@@ -18,4 +18,5 @@
 struct ur_queue_handle_t_ : RefCounted {
   ol_queue_handle_t OffloadQueue;
   ol_device_handle_t OffloadDevice;
+  ur_context_handle_t UrContext;
 };
diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index b7a5abac1e..4f20f4a29e 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -42,11 +42,11 @@ if (NOT DEFINED UMF_REPO)
 endif()
 
 if (NOT DEFINED UMF_TAG)
-    # commit c02a5525bd6165e137129f7c5691e0faf14b7ceb
+    # commit 1de269c00e46b7cbdbafa2247812c8c4bb4ed4a5
     # Author: Ɓukasz Stolarczuk
-    # Date:   Wed Jul 16 11:04:14 2025 +0200
-    # 1.0.0-rc2 release
-    set(UMF_TAG v1.0.0-rc2)
+    # Date:   Mon Jul 21 15:42:59 2025 +0200
+    # 1.0.0 release
+    set(UMF_TAG v1.0.0)
 endif()
 
 message(STATUS "Will fetch Unified Memory Framework from ${UMF_REPO}")
diff --git a/source/loader/CMakeLists.txt b/source/loader/CMakeLists.txt
index a36ff4f698..15dc3127d3 100644
--- a/source/loader/CMakeLists.txt
+++ b/source/loader/CMakeLists.txt
@@ -211,21 +211,10 @@ if(UR_ENABLE_SANITIZER)
     # In in-tree build, if LLVM is built with libc++, we also need to build
     # symbolizer.cpp with libc++ abi and link libc++ in.
     if(NOT UR_STANDALONE_BUILD AND LLVM_LIBCXX_USED)
-        execute_process(
-            COMMAND ${CMAKE_CXX_COMPILER} --print-file-name=libc++.a
-            OUTPUT_VARIABLE LIBCXX_PATH
-            OUTPUT_STRIP_TRAILING_WHITESPACE)
-        execute_process(
-            COMMAND ${CMAKE_CXX_COMPILER} --print-file-name=libc++abi.a
-            OUTPUT_VARIABLE LIBCXX_ABI_PATH
-            OUTPUT_STRIP_TRAILING_WHITESPACE)
         set_property(SOURCE ${symbolizer_sources} APPEND_STRING PROPERTY
             COMPILE_FLAGS " -stdlib=libc++ ")
-        if(NOT EXISTS ${LIBCXX_PATH} OR NOT EXISTS ${LIBCXX_ABI_PATH})
-            message(FATAL_ERROR "libc++ is required but can't find the libraries")
-        endif()
         # Link with gcc_s fisrt to avoid some symbols resolve to libc++/libc++abi/libunwind's one
         target_link_libraries(ur_loader PRIVATE
             gcc_s ${LIBCXX_PATH} ${LIBCXX_ABI_PATH})
     endif()
diff --git a/source/loader/layers/sanitizer/msan/msan_libdevice.hpp b/source/loader/layers/sanitizer/msan/msan_libdevice.hpp
index 575655fe99..d57d0c8553 100644
--- a/source/loader/layers/sanitizer/msan/msan_libdevice.hpp
+++ b/source/loader/layers/sanitizer/msan/msan_libdevice.hpp
@@ -14,6 +14,7 @@
 #pragma once
 
 #include "sanitizer_common/sanitizer_libdevice.hpp"
+#include 
 
 #if !defined(__SPIR__) && !defined(__SPIRV__)
 namespace ur_sanitizer_layer {
diff --git a/test/conformance/event/urEventGetProfilingInfo.cpp b/test/conformance/event/urEventGetProfilingInfo.cpp
index e4f3d1f894..87b02ca9af 100644
--- a/test/conformance/event/urEventGetProfilingInfo.cpp
+++ b/test/conformance/event/urEventGetProfilingInfo.cpp
@@ -5,6 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include "fixtures.h"
+#include "uur/checks.h"
 #include "uur/known_failure.h"
 
 using urEventGetProfilingInfoTest = uur::event::urEventTest;
@@ -179,8 +180,9 @@ TEST_P(urEventGetProfilingInfoTest, InvalidValue) {
   const ur_profiling_info_t property_name = UR_PROFILING_INFO_COMMAND_QUEUED;
   size_t property_size = 0;
 
-  ASSERT_SUCCESS(urEventGetProfilingInfo(event, property_name, 0, nullptr,
-                                         &property_size));
+  ASSERT_SUCCESS_OR_OPTIONAL_QUERY(
+      urEventGetProfilingInfo(event, property_name, 0, nullptr, &property_size),
+      property_name);
   ASSERT_NE(property_size, 0);
 
   uint64_t property_value = 0;
@@ -221,8 +223,10 @@ UUR_INSTANTIATE_DEVICE_TEST_SUITE(urEventGetProfilingInfoForWaitWithBarrier);
 
 TEST_P(urEventGetProfilingInfoForWaitWithBarrier, Success) {
   uint64_t submit_value = 0;
-  ASSERT_SUCCESS(urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_START,
-                                         size, &submit_value, nullptr));
+  ASSERT_SUCCESS_OR_OPTIONAL_QUERY(
+      urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_START, size,
+                              &submit_value, nullptr),
+      UR_PROFILING_INFO_COMMAND_START);
   ASSERT_NE(submit_value, 0);
 
   uint64_t complete_value = 0;
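
Note on the native_cpu overflow guard above: the launch is rejected when the product of the three global sizes overflows an unsigned long long. A minimal standalone sketch of the same check, not the adapter's code itself — the helper name and the sizes are illustrative; __builtin_umulll_overflow is the GCC/Clang builtin the hunk relies on, with the same portable division-based fallback:

#include <cstddef>
#include <cstdio>
#include <limits>

// Returns true if a * b overflows unsigned long long; *c receives the
// (possibly wrapped) product. Mirrors the lambda in the hunk above.
static bool umulllOverflow(unsigned long long a, unsigned long long b,
                           unsigned long long *c) {
#ifdef __GNUC__
  return __builtin_umulll_overflow(a, b, c);
#else
  *c = a * b;
  // If a != 0 and dividing the product by a does not give b back, it wrapped.
  return a != 0 && b != *c / a;
#endif
}

int main() {
  // 2^20 * 2^20 * 2^24 = 2^64 work-items: wraps a 64-bit unsigned long long.
  unsigned long long global[3] = {1ULL << 20, 1ULL << 20, 1ULL << 24};
  unsigned long long numWI;
  if (umulllOverflow(global[0], global[1], &numWI) ||
      umulllOverflow(numWI, global[2], &numWI) ||
      numWI > std::numeric_limits<std::size_t>::max()) {
    std::puts("launch rejected: total work-item count overflows");
    return 1;
  }
  std::printf("total work-items: %llu\n", numWI);
}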
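
Note on the new native_cpu scheduling loop: the linearized work-group range [0, numWG) is split into one contiguous chunk per thread, with the first remainderWG threads taking one extra group, and the chunk's 3-D start coordinate is decoded from the linear index. A minimal sketch of that partitioning under assumed, illustrative sizes (the real code hands each chunk to tp.schedule_task):

#include <array>
#include <cstddef>
#include <cstdio>

int main() {
  const size_t numWG0 = 5, numWG1 = 3, numWG2 = 2; // illustrative WG counts
  const size_t numThreads = 4;                     // illustrative pool size
  const size_t numWG = numWG0 * numWG1 * numWG2;   // 30 work-groups total
  const size_t perThread = numWG / numThreads;     // 7 per thread
  const size_t remainder = numWG - perThread * numThreads; // 2 left over

  // Indices 0..2 are the 3-D coordinates, index 3 the linearized value,
  // matching the std::array<size_t, 4> in the hunk above.
  std::array<size_t, 4> rangeStart = {0, 0, 0, 0};
  for (size_t t = 0; t < numThreads; ++t) {
    auto rangeEnd = rangeStart;
    rangeEnd[3] += perThread + (t < remainder); // early threads take the slack
    if (rangeEnd[3] == rangeStart[3])
      break; // more threads than work-groups: nothing left to hand out
    // Decode the linear end index back into (g0, g1, g2).
    rangeEnd[0] = rangeEnd[3] % numWG0;
    rangeEnd[1] = (rangeEnd[3] / numWG0) % numWG1;
    rangeEnd[2] = rangeEnd[3] / (numWG0 * numWG1);
    std::printf("thread %zu: linear [%zu, %zu) starting at (%zu, %zu, %zu)\n",
                t, rangeStart[3], rangeEnd[3], rangeStart[0], rangeStart[1],
                rangeStart[2]);
    // Each scheduled task then walks g3 from rangeStart[3] to rangeEnd[3],
    // carrying g0/g1/g2 forward like a 3-digit odometer, as the lambda does.
    rangeStart = rangeEnd;
  }
}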