Merge pull request #1533 from AllanZyne/sanitizer-buffer

kbenzie · web-flow · commit 9f783837089c · 2024-05-31T10:20:23.000+01:00
[DeviceSanitizer] Support detecting out-of-bounds errors on sycl::buffer
diff --git a/source/loader/CMakeLists.txt b/source/loader/CMakeLists.txt
@@ -108,6 +108,8 @@ if(UR_ENABLE_SANITIZER)
         ${CMAKE_CURRENT_SOURCE_DIR}/../ur/ur.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_allocator.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_allocator.hpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_buffer.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_buffer.hpp
         ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_interceptor.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_interceptor.hpp
         ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_libdevice.hpp
diff --git a/source/loader/layers/sanitizer/asan_buffer.cpp b/source/loader/layers/sanitizer/asan_buffer.cpp
@@ -0,0 +1,137 @@
+/*
+ *
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See LICENSE.TXT
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * @file asan_buffer.cpp
+ *
+ */
+
+#include "asan_buffer.hpp"
+#include "asan_interceptor.hpp"
+#include "ur_sanitizer_layer.hpp"
+#include "ur_sanitizer_utils.hpp"
+
+namespace ur_sanitizer_layer {
+
+ur_result_t EnqueueMemCopyRectHelper(
+    ur_queue_handle_t Queue, char *pSrc, char *pDst, ur_rect_offset_t SrcOffset,
+    ur_rect_offset_t DstOffset, ur_rect_region_t Region, size_t SrcRowPitch,
+    size_t SrcSlicePitch, size_t DstRowPitch, size_t DstSlicePitch,
+    bool Blocking, uint32_t NumEventsInWaitList,
+    const ur_event_handle_t *EventWaitList, ur_event_handle_t *Event) {
+    // If user doesn't determine src/dst row pitch and slice pitch, just use
+    // region for it.
+    if (SrcRowPitch == 0) {
+        SrcRowPitch = Region.width;
+    }
+
+    if (SrcSlicePitch == 0) {
+        SrcSlicePitch = SrcRowPitch * Region.height;
+    }
+
+    if (DstRowPitch == 0) {
+        DstRowPitch = Region.width;
+    }
+
+    if (DstSlicePitch == 0) {
+        DstSlicePitch = DstRowPitch * Region.height;
+    }
+
+    // Calculate the src and dst addresses that actually will be copied.
+    char *SrcOrigin = pSrc + SrcOffset.x + SrcRowPitch * SrcOffset.y +
+                      SrcSlicePitch * SrcOffset.z;
+    char *DstOrigin = pDst + DstOffset.x + DstRowPitch * DstOffset.y +
+                      DstSlicePitch * DstOffset.z;
+
+    std::vector<ur_event_handle_t> Events;
+    Events.reserve(Region.depth);
+    // For now, USM doesn't support 3D memory copy operation, so we can only
+    // loop call 2D memory copy function to implement it.
+    for (size_t i = 0; i < Region.depth; i++) {
+        ur_event_handle_t NewEvent{};
+        UR_CALL(context.urDdiTable.Enqueue.pfnUSMMemcpy2D(
+            Queue, Blocking, DstOrigin + (i * DstSlicePitch), DstRowPitch,
+            SrcOrigin + (i * SrcSlicePitch), SrcRowPitch, Region.width,
+            Region.height, NumEventsInWaitList, EventWaitList, &NewEvent));
+
+        Events.push_back(NewEvent);
+    }
+
+    UR_CALL(context.urDdiTable.Enqueue.pfnEventsWait(Queue, Events.size(),
+                                                     Events.data(), Event));
+
+    return UR_RESULT_SUCCESS;
+}
+
+ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) {
+    // Sub-buffers don't maintain own allocations but rely on parent buffer.
+    if (SubBuffer) {
+        UR_CALL(SubBuffer->Parent->getHandle(Device, Handle));
+        Handle += SubBuffer->Origin;
+        return UR_RESULT_SUCCESS;
+    }
+
+    auto &Allocation = Allocations[Device];
+    if (!Allocation) {
+        ur_usm_desc_t USMDesc{};
+        USMDesc.align = getAlignment();
+        ur_usm_pool_handle_t Pool{};
+        ur_result_t URes = context.interceptor->allocateMemory(
+            Context, Device, &USMDesc, Pool, Size, AllocType::MEM_BUFFER,
+            ur_cast<void **>(&Allocation));
+        if (URes != UR_RESULT_SUCCESS) {
+            context.logger.error(
+                "Failed to allocate {} bytes memory for buffer {}", Size, this);
+            return URes;
+        }
+
+        if (HostPtr) {
+            ManagedQueue Queue(Context, Device);
+            URes = context.urDdiTable.Enqueue.pfnUSMMemcpy(
+                Queue, true, Allocation, HostPtr, Size, 0, nullptr, nullptr);
+            if (URes != UR_RESULT_SUCCESS) {
+                context.logger.error("Failed to copy {} bytes data from host "
+                                     "pointer {} to buffer {}",
+                                     Size, HostPtr, this);
+                return URes;
+            }
+        }
+    }
+
+    Handle = Allocation;
+
+    return UR_RESULT_SUCCESS;
+}
+
+ur_result_t MemBuffer::free() {
+    for (const auto &[_, Ptr] : Allocations) {
+        ur_result_t URes = context.interceptor->releaseMemory(Context, Ptr);
+        if (URes != UR_RESULT_SUCCESS) {
+            context.logger.error("Failed to free buffer handle {}", Ptr);
+            return URes;
+        }
+    }
+    Allocations.clear();
+    return UR_RESULT_SUCCESS;
+}
+
+size_t MemBuffer::getAlignment() {
+    // Choose an alignment that is at most 128 and is the next power of 2
+    // for sizes less than 128.
+    // TODO: If we don't set the alignment size explicitly, the device will
+    // usually choose a very large size (more than 1k). Then sanitizer will
+    // allocate extra unnessary memory. Not sure if this will impact
+    // performance.
+    size_t MsbIdx = 63 - __builtin_clz(Size);
+    size_t Alignment = (1 << (MsbIdx + 1));
+    if (Alignment > 128) {
+        Alignment = 128;
+    }
+    return Alignment;
+}
+
+} // namespace ur_sanitizer_layer
diff --git a/source/loader/layers/sanitizer/asan_buffer.hpp b/source/loader/layers/sanitizer/asan_buffer.hpp
@@ -0,0 +1,74 @@
+/*
+ *
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See LICENSE.TXT
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * @file asan_buffer.hpp
+ *
+ */
+
+#pragma once
+
+#include <atomic>
+#include <memory>
+#include <optional>
+
+#include "common.hpp"
+
+namespace ur_sanitizer_layer {
+
+struct MemBuffer {
+    // Buffer constructor
+    MemBuffer(ur_context_handle_t Context, size_t Size, char *HostPtr)
+        : Context(Context), Size(Size), HostPtr(HostPtr) {}
+
+    // Sub-buffer constructor
+    MemBuffer(std::shared_ptr<MemBuffer> Parent, size_t Origin, size_t Size)
+        : Context(Parent->Context), Size(Size), SubBuffer{{Parent, Origin}} {}
+
+    ur_result_t getHandle(ur_device_handle_t Device, char *&Handle);
+
+    ur_result_t free();
+
+    size_t getAlignment();
+
+    std::unordered_map<ur_device_handle_t, char *> Allocations;
+
+    enum AccessMode { UNKNOWN, READ_WRITE, READ_ONLY, WRITE_ONLY };
+
+    struct Mapping {
+        size_t Offset;
+        size_t Size;
+    };
+
+    std::unordered_map<void *, Mapping> Mappings;
+
+    ur_context_handle_t Context;
+
+    size_t Size;
+
+    char *HostPtr{};
+
+    struct SubBuffer_t {
+        std::shared_ptr<MemBuffer> Parent;
+        size_t Origin;
+    };
+
+    std::optional<SubBuffer_t> SubBuffer;
+
+    std::atomic<int32_t> RefCount = 1;
+
+    ur_shared_mutex Mutex;
+};
+
+ur_result_t EnqueueMemCopyRectHelper(
+    ur_queue_handle_t Queue, char *pSrc, char *pDst, ur_rect_offset_t SrcOffset,
+    ur_rect_offset_t DstOffset, ur_rect_region_t Region, size_t SrcRowPitch,
+    size_t SrcSlicePitch, size_t DstRowPitch, size_t DstSlicePitch,
+    bool Blocking, uint32_t NumEventsInWaitList,
+    const ur_event_handle_t *EventWaitList, ur_event_handle_t *Event);
+
+} // namespace ur_sanitizer_layer
diff --git a/source/loader/layers/sanitizer/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan_interceptor.cpp
@@ -259,6 +259,9 @@ ur_result_t SanitizerInterceptor::allocateMemory(
     } else if (Type == AllocType::SHARED_USM) {
         UR_CALL(context.urDdiTable.USM.pfnSharedAlloc(
             Context, Device, Properties, Pool, NeededSize, &Allocated));
+    } else if (Type == AllocType::MEM_BUFFER) {
+        UR_CALL(context.urDdiTable.USM.pfnDeviceAlloc(
+            Context, Device, Properties, Pool, NeededSize, &Allocated));
     } else {
         context.logger.error("Unsupport memory type");
         return UR_RESULT_ERROR_INVALID_ARGUMENT;
@@ -662,13 +665,54 @@ ur_result_t SanitizerInterceptor::eraseKernel(ur_kernel_handle_t Kernel) {
     return UR_RESULT_SUCCESS;
 }
 
+ur_result_t
+SanitizerInterceptor::insertMemBuffer(std::shared_ptr<MemBuffer> MemBuffer) {
+    std::scoped_lock<ur_shared_mutex> Guard(m_MemBufferMapMutex);
+    assert(m_MemBufferMap.find(ur_cast<ur_mem_handle_t>(MemBuffer.get())) ==
+           m_MemBufferMap.end());
+    m_MemBufferMap.emplace(reinterpret_cast<ur_mem_handle_t>(MemBuffer.get()),
+                           MemBuffer);
+    return UR_RESULT_SUCCESS;
+}
+
+ur_result_t SanitizerInterceptor::eraseMemBuffer(ur_mem_handle_t MemHandle) {
+    std::scoped_lock<ur_shared_mutex> Guard(m_MemBufferMapMutex);
+    assert(m_MemBufferMap.find(MemHandle) != m_MemBufferMap.end());
+    m_MemBufferMap.erase(MemHandle);
+    return UR_RESULT_SUCCESS;
+}
+
+std::shared_ptr<MemBuffer>
+SanitizerInterceptor::getMemBuffer(ur_mem_handle_t MemHandle) {
+    std::shared_lock<ur_shared_mutex> Guard(m_MemBufferMapMutex);
+    if (m_MemBufferMap.find(MemHandle) != m_MemBufferMap.end()) {
+        return m_MemBufferMap[MemHandle];
+    }
+    return nullptr;
+}
+
 ur_result_t SanitizerInterceptor::prepareLaunch(
     ur_context_handle_t Context, std::shared_ptr<DeviceInfo> &DeviceInfo,
     ur_queue_handle_t Queue, ur_kernel_handle_t Kernel,
     USMLaunchInfo &LaunchInfo) {
     auto Program = GetProgram(Kernel);
 
     do {
+        // Set membuffer arguments
+        auto KernelInfo = getKernelInfo(Kernel);
+        for (const auto &[ArgIndex, MemBuffer] : KernelInfo->BufferArgs) {
+            char *ArgPointer = nullptr;
+            UR_CALL(MemBuffer->getHandle(DeviceInfo->Handle, ArgPointer));
+            ur_result_t URes = context.urDdiTable.Kernel.pfnSetArgPointer(
+                Kernel, ArgIndex, nullptr, &ArgPointer);
+            if (URes != UR_RESULT_SUCCESS) {
+                context.logger.error(
+                    "Failed to set buffer {} as the {} arg to kernel {}: {}",
+                    ur_cast<ur_mem_handle_t>(MemBuffer.get()), ArgIndex, Kernel,
+                    URes);
+            }
+        }
+
         // Set launch info argument
         auto ArgNums = GetKernelNumArgs(Kernel);
         if (ArgNums) {
diff --git a/source/loader/layers/sanitizer/asan_interceptor.hpp b/source/loader/layers/sanitizer/asan_interceptor.hpp
@@ -13,6 +13,7 @@
 #pragma once
 
 #include "asan_allocator.hpp"
+#include "asan_buffer.hpp"
 #include "asan_libdevice.hpp"
 #include "common.hpp"
 #include "ur_sanitizer_layer.hpp"
@@ -81,8 +82,10 @@ struct QueueInfo {
 
 struct KernelInfo {
     ur_kernel_handle_t Handle;
-
     ur_shared_mutex Mutex;
+    std::atomic<int32_t> RefCount = 1;
+    std::unordered_map<uint32_t, std::shared_ptr<MemBuffer>> BufferArgs;
+
     // Need preserve the order of local arguments
     std::map<uint32_t, LocalArgsInfo> LocalArgs;
 
@@ -194,6 +197,10 @@ class SanitizerInterceptor {
     ur_result_t insertKernel(ur_kernel_handle_t Kernel);
     ur_result_t eraseKernel(ur_kernel_handle_t Kernel);
 
+    ur_result_t insertMemBuffer(std::shared_ptr<MemBuffer> MemBuffer);
+    ur_result_t eraseMemBuffer(ur_mem_handle_t MemHandle);
+    std::shared_ptr<MemBuffer> getMemBuffer(ur_mem_handle_t MemHandle);
+
     std::optional<AllocationIterator> findAllocInfoByAddress(uptr Address);
 
     std::shared_ptr<ContextInfo> getContextInfo(ur_context_handle_t Context) {
@@ -245,6 +252,10 @@ class SanitizerInterceptor {
         m_KernelMap;
     ur_shared_mutex m_KernelMapMutex;
 
+    std::unordered_map<ur_mem_handle_t, std::shared_ptr<MemBuffer>>
+        m_MemBufferMap;
+    ur_shared_mutex m_MemBufferMapMutex;
+
     /// Assumption: all USM chunks are allocated in one VA
     AllocationMap m_AllocationMap;
     ur_shared_mutex m_AllocationMapMutex;
diff --git a/source/loader/layers/sanitizer/ur_sanddi.cpp b/source/loader/layers/sanitizer/ur_sanddi.cpp