intel
diff --git a/opencl/source/gtpin/gtpin_callbacks.cpp b/opencl/source/gtpin/gtpin_callbacks.cpp
Lines changed: 33 additions & 14 deletions
diff --git a/opencl/source/gtpin/gtpin_helpers.cpp b/opencl/source/gtpin/gtpin_helpers.cpp
Lines changed: 46 additions & 22 deletions
diff --git a/opencl/source/gtpin/gtpin_hw_helper.h b/opencl/source/gtpin/gtpin_hw_helper.h
Lines changed: 3 additions & 1 deletion
diff --git a/opencl/source/gtpin/gtpin_hw_helper.inl b/opencl/source/gtpin/gtpin_hw_helper.inl
Lines changed: 10 additions & 0 deletions
@@ -7,7 +7,7 @@
 
 #include "shared/source/command_stream/command_stream_receiver.h"
 #include "shared/source/memory_manager/surface.h"
-#include "shared/source/utilities/spinlock.h"
+#include "shared/source/memory_manager/unified_memory_manager.h"
 
 #include "opencl/source/cl_device/cl_device.h"
 #include "opencl/source/command_queue/command_queue.h"
@@ -30,13 +30,15 @@ using namespace gtpin;
 
 namespace NEO {
 
+using GTPinLockType = std::recursive_mutex;
+
 extern gtpin::ocl::gtpin_events_t GTPinCallbacks;
 
 igc_init_t *pIgcInit = nullptr;
 std::atomic<int> sequenceCount(1);
 CommandQueue *pCmdQueueForFlushTask = nullptr;
 std::deque<gtpinkexec_t> kernelExecQueue;
-SpinLock kernelExecQueueLock;
+GTPinLockType kernelExecQueueLock;
 
 void gtpinNotifyContextCreate(cl_context context) {
     if (isGTPinInitialized) {
@@ -131,7 +133,7 @@ void gtpinNotifyKernelSubmit(cl_kernel kernel, void *pCmdQueue) {
         kExec.gtpinResource = (cl_mem)resource;
         kExec.commandBuffer = commandBuffer;
         kExec.pCommandQueue = (CommandQueue *)pCmdQueue;
-        std::unique_lock<SpinLock> lock{kernelExecQueueLock};
+        std::unique_lock<GTPinLockType> lock{kernelExecQueueLock};
         kernelExecQueue.push_back(kExec);
         lock.unlock();
         // Patch SSH[gtpinBTI] with GT-Pin resource
@@ -142,10 +144,19 @@ void gtpinNotifyKernelSubmit(cl_kernel kernel, void *pCmdQueue) {
         GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(genFamily);
         size_t gtpinBTI = pKernel->getNumberOfBindingTableStates() - 1;
         void *pSurfaceState = gtpinHelper.getSurfaceState(pKernel, gtpinBTI);
-        cl_mem buffer = (cl_mem)resource;
-        auto pBuffer = castToObjectOrAbort<Buffer>(buffer);
-        pBuffer->setArgStateful(pSurfaceState, false, false, false, false, device,
-                                pKernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, pContext->getNumDevices());
+        if (gtpinHelper.canUseSharedAllocation(device.getHardwareInfo())) {
+            auto allocData = reinterpret_cast<SvmAllocationData *>(resource);
+            auto gpuAllocation = allocData->gpuAllocations.getGraphicsAllocation(rootDeviceIndex);
+            size_t size = gpuAllocation->getUnderlyingBufferSize();
+            Buffer::setSurfaceState(&device, pSurfaceState, false, false, size, gpuAllocation->getUnderlyingBuffer(), 0, gpuAllocation, 0, 0,
+                                    pKernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, pContext->getNumDevices());
+            pKernel->setUnifiedMemoryExecInfo(gpuAllocation);
+        } else {
+            cl_mem buffer = (cl_mem)resource;
+            auto pBuffer = castToObjectOrAbort<Buffer>(buffer);
+            pBuffer->setArgStateful(pSurfaceState, false, false, false, false, device,
+                                    pKernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, pContext->getNumDevices());
+        }
     }
 }
 
@@ -157,7 +168,7 @@ void gtpinNotifyPreFlushTask(void *pCmdQueue) {
 
 void gtpinNotifyFlushTask(uint32_t flushedTaskCount) {
     if (isGTPinInitialized) {
-        std::unique_lock<SpinLock> lock{kernelExecQueueLock};
+        std::unique_lock<GTPinLockType> lock{kernelExecQueueLock};
         size_t numElems = kernelExecQueue.size();
         for (size_t n = 0; n < numElems; n++) {
             if ((kernelExecQueue[n].pCommandQueue == pCmdQueueForFlushTask) && !kernelExecQueue[n].isTaskCountValid) {
@@ -173,7 +184,7 @@ void gtpinNotifyFlushTask(uint32_t flushedTaskCount) {
 
 void gtpinNotifyTaskCompletion(uint32_t completedTaskCount) {
     if (isGTPinInitialized) {
-        std::unique_lock<SpinLock> lock{kernelExecQueueLock};
+        std::unique_lock<GTPinLockType> lock{kernelExecQueueLock};
         size_t numElems = kernelExecQueue.size();
         for (size_t n = 0; n < numElems;) {
             if (kernelExecQueue[n].isTaskCountValid && (kernelExecQueue[n].taskCount <= completedTaskCount)) {
@@ -191,15 +202,23 @@ void gtpinNotifyTaskCompletion(uint32_t completedTaskCount) {
 
 void gtpinNotifyMakeResident(void *pKernel, void *pCSR) {
     if (isGTPinInitialized) {
-        std::unique_lock<SpinLock> lock{kernelExecQueueLock};
+        std::unique_lock<GTPinLockType> lock{kernelExecQueueLock};
+        Context &context = static_cast<Kernel *>(pKernel)->getContext();
+        GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(context.getDevice(0)->getHardwareInfo().platform.eRenderCoreFamily);
         size_t numElems = kernelExecQueue.size();
         for (size_t n = 0; n < numElems; n++) {
             if ((kernelExecQueue[n].pKernel == pKernel) && !kernelExecQueue[n].isResourceResident && kernelExecQueue[n].gtpinResource) {
                 // It's time for kernel to make resident its GT-Pin resource
                 CommandStreamReceiver *pCommandStreamReceiver = reinterpret_cast<CommandStreamReceiver *>(pCSR);
-                cl_mem gtpinBuffer = kernelExecQueue[n].gtpinResource;
-                auto pBuffer = castToObjectOrAbort<Buffer>(gtpinBuffer);
-                GraphicsAllocation *pGfxAlloc = pBuffer->getGraphicsAllocation(pCommandStreamReceiver->getRootDeviceIndex());
+                GraphicsAllocation *pGfxAlloc = nullptr;
+                if (gtpinHelper.canUseSharedAllocation(context.getDevice(0)->getHardwareInfo())) {
+                    auto allocData = reinterpret_cast<SvmAllocationData *>(kernelExecQueue[n].gtpinResource);
+                    pGfxAlloc = allocData->gpuAllocations.getGraphicsAllocation(pCommandStreamReceiver->getRootDeviceIndex());
+                } else {
+                    cl_mem gtpinBuffer = kernelExecQueue[n].gtpinResource;
+                    auto pBuffer = castToObjectOrAbort<Buffer>(gtpinBuffer);
+                    pGfxAlloc = pBuffer->getGraphicsAllocation(pCommandStreamReceiver->getRootDeviceIndex());
+                }
                 pCommandStreamReceiver->makeResident(*pGfxAlloc);
                 kernelExecQueue[n].isResourceResident = true;
                 break;
@@ -210,7 +229,7 @@ void gtpinNotifyMakeResident(void *pKernel, void *pCSR) {
 
 void gtpinNotifyUpdateResidencyList(void *pKernel, void *pResVec) {
     if (isGTPinInitialized) {
-        std::unique_lock<SpinLock> lock{kernelExecQueueLock};
+        std::unique_lock<GTPinLockType> lock{kernelExecQueueLock};
         size_t numElems = kernelExecQueue.size();
         for (size_t n = 0; n < numElems; n++) {
             if ((kernelExecQueue[n].pKernel == pKernel) && !kernelExecQueue[n].isResourceResident && kernelExecQueue[n].gtpinResource) {
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017-2020 Intel Corporation
+ * Copyright (C) 2017-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -8,8 +8,12 @@
 #include "gtpin_helpers.h"
 
 #include "shared/source/memory_manager/memory_manager.h"
+#include "shared/source/memory_manager/unified_memory_manager.h"
 
+#include "opencl/source/api/api.h"
+#include "opencl/source/cl_device/cl_device.h"
 #include "opencl/source/context/context.h"
+#include "opencl/source/gtpin/gtpin_hw_helper.h"
 #include "opencl/source/helpers/validators.h"
 #include "opencl/source/mem_obj/buffer.h"
 
@@ -27,27 +31,39 @@ GTPIN_DI_STATUS GTPIN_DRIVER_CALLCONV gtpinCreateBuffer(context_handle_t context
         return GTPIN_DI_ERROR_INVALID_ARGUMENT;
     }
     size_t size = alignUp(reqSize, MemoryConstants::cacheLineSize);
-    void *hostPtr = pContext->getMemoryManager()->allocateSystemMemory(size, MemoryConstants::pageSize);
-    if (hostPtr == nullptr) {
-        return GTPIN_DI_ERROR_ALLOCATION_FAILED;
+    GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(pContext->getDevice(0)->getHardwareInfo().platform.eRenderCoreFamily);
+    if (gtpinHelper.canUseSharedAllocation(pContext->getDevice(0)->getHardwareInfo())) {
+        void *unfiedMemorySharedAllocation = clSharedMemAllocINTEL(pContext, pContext->getDevice(0), 0, size, 0, &diag);
+        auto allocationsManager = pContext->getSVMAllocsManager();
+        auto graphicsAllocation = allocationsManager->getSVMAlloc(unfiedMemorySharedAllocation);
+        *pResource = (resource_handle_t)graphicsAllocation;
+    } else {
+        void *hostPtr = pContext->getMemoryManager()->allocateSystemMemory(size, MemoryConstants::pageSize);
+        if (hostPtr == nullptr) {
+            return GTPIN_DI_ERROR_ALLOCATION_FAILED;
+        }
+        cl_mem buffer = Buffer::create(pContext, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE | CL_MEM_FORCE_HOST_MEMORY_INTEL, size, hostPtr, diag);
+        *pResource = (resource_handle_t)buffer;
     }
-    cl_mem buffer = Buffer::create(pContext, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE | CL_MEM_FORCE_HOST_MEMORY_INTEL, size, hostPtr, diag);
-    *pResource = (resource_handle_t)buffer;
     return GTPIN_DI_SUCCESS;
 }
 
 GTPIN_DI_STATUS GTPIN_DRIVER_CALLCONV gtpinFreeBuffer(context_handle_t context, resource_handle_t resource) {
-    cl_mem buffer = (cl_mem)resource;
     Context *pContext = castToObject<Context>((cl_context)context);
-    if ((pContext == nullptr) || (buffer == nullptr)) {
+    // Releases a GT-Pin resource. Returns GTPIN_DI_ERROR_INVALID_ARGUMENT when the
+    // context handle does not resolve to a Context, the resource is null, or (on the
+    // buffer path) the resource does not resolve to a MemObj; GTPIN_DI_SUCCESS otherwise.
+    if ((pContext == nullptr) || (resource == nullptr)) {
         return GTPIN_DI_ERROR_INVALID_ARGUMENT;
     }
-    auto pMemObj = castToObject<MemObj>(buffer);
-    if (pMemObj == nullptr) {
-        return GTPIN_DI_ERROR_INVALID_ARGUMENT;
+    // Select the release path with the same predicate gtpinCreateBuffer uses to select
+    // the allocation path, so the handle is always interpreted as the type it was
+    // created with (SvmAllocationData* vs. cl_mem).
+    GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(pContext->getDevice(0)->getHardwareInfo().platform.eRenderCoreFamily);
+    if (gtpinHelper.canUseSharedAllocation(pContext->getDevice(0)->getHardwareInfo())) {
+        // Shared-allocation path: free through the CPU-side pointer of the allocation.
+        auto allocData = reinterpret_cast<SvmAllocationData *>(resource);
+        clMemFreeINTEL(pContext, allocData->cpuAllocation->getUnderlyingBuffer());
+    } else {
+        auto pMemObj = castToObject<MemObj>(resource);
+        if (pMemObj == nullptr) {
+            return GTPIN_DI_ERROR_INVALID_ARGUMENT;
+        }
+        // Buffer path: free the host memory behind the buffer, then drop the buffer itself.
+        alignedFree(pMemObj->getHostPtr());
+        pMemObj->release();
     }
-    alignedFree(pMemObj->getHostPtr());
-    pMemObj->release();
     return GTPIN_DI_SUCCESS;
 }
 
@@ -57,23 +73,31 @@ GTPIN_DI_STATUS GTPIN_DRIVER_CALLCONV gtpinMapBuffer(context_handle_t context, r
     if ((pContext == nullptr) || (buffer == nullptr) || (pAddress == nullptr)) {
         return GTPIN_DI_ERROR_INVALID_ARGUMENT;
     }
-    auto pMemObj = castToObject<MemObj>(buffer);
-    if (pMemObj == nullptr) {
-        return GTPIN_DI_ERROR_INVALID_ARGUMENT;
+    GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(pContext->getDevice(0)->getHardwareInfo().platform.eRenderCoreFamily);
+    if (gtpinHelper.canUseSharedAllocation(pContext->getDevice(0)->getHardwareInfo())) {
+        auto allocData = reinterpret_cast<SvmAllocationData *>(resource);
+        *pAddress = reinterpret_cast<uint8_t *>(allocData->cpuAllocation->getUnderlyingBuffer());
+    } else {
+        auto pMemObj = castToObject<MemObj>(buffer);
+        if (pMemObj == nullptr) {
+            return GTPIN_DI_ERROR_INVALID_ARGUMENT;
+        }
+        *pAddress = reinterpret_cast<uint8_t *>(pMemObj->getHostPtr());
     }
-    *pAddress = (uint8_t *)pMemObj->getHostPtr();
     return GTPIN_DI_SUCCESS;
 }
 
 GTPIN_DI_STATUS GTPIN_DRIVER_CALLCONV gtpinUnmapBuffer(context_handle_t context, resource_handle_t resource) {
-    cl_mem buffer = (cl_mem)resource;
     Context *pContext = castToObject<Context>((cl_context)context);
-    if ((pContext == nullptr) || (buffer == nullptr)) {
+    // Reject a context handle that does not resolve to a Context, or a null resource.
+    if ((pContext == nullptr) || (resource == nullptr)) {
         return GTPIN_DI_ERROR_INVALID_ARGUMENT;
     }
-    auto pMemObj = castToObject<MemObj>(buffer);
-    if (pMemObj == nullptr) {
-        return GTPIN_DI_ERROR_INVALID_ARGUMENT;
+    const auto &hwInfo = pContext->getDevice(0)->getHardwareInfo();
+    // On the shared-allocation path the handle is not validated any further.
+    if (GTPinHwHelper::get(hwInfo.platform.eRenderCoreFamily).canUseSharedAllocation(hwInfo)) {
+        return GTPIN_DI_SUCCESS;
+    }
+    // Otherwise the handle must resolve to a MemObj; no other work is done here.
+    if (castToObject<MemObj>(resource) == nullptr) {
+        return GTPIN_DI_ERROR_INVALID_ARGUMENT;
     }
     return GTPIN_DI_SUCCESS;
 }
 
@@ -17,6 +17,7 @@ class GTPinHwHelper {
     virtual uint32_t getGenVersion() = 0;
     virtual bool addSurfaceState(Kernel *pKernel) = 0;
     virtual void *getSurfaceState(Kernel *pKernel, size_t bti) = 0;
+    virtual bool canUseSharedAllocation(const HardwareInfo &hwInfo) const = 0;
 
   protected:
     GTPinHwHelper(){};
@@ -32,8 +33,9 @@ class GTPinHwHelperHw : public GTPinHwHelper {
     uint32_t getGenVersion() override;
     bool addSurfaceState(Kernel *pKernel) override;
     void *getSurfaceState(Kernel *pKernel, size_t bti) override;
+    bool canUseSharedAllocation(const HardwareInfo &hwInfo) const override;
 
-  private:
+  protected:
     GTPinHwHelperHw(){};
 };
 } // namespace NEO
@@ -56,4 +56,14 @@ void *GTPinHwHelperHw<GfxFamily>::getSurfaceState(Kernel *pKernel, size_t bti) {
     return pSurfaceState;
 }
 
+template <typename GfxFamily>
+bool GTPinHwHelperHw<GfxFamily>::canUseSharedAllocation(const HardwareInfo &hwInfo) const {
+    // The shared-allocation path is taken only when the GTPinAllocateBufferInSharedMemory
+    // debug flag is set to a value other than -1 and 0, and the hardware capability
+    // table reports ftrSvm.
+    const auto debugOverride = DebugManager.flags.GTPinAllocateBufferInSharedMemory.get();
+    const bool requestedByFlag = (debugOverride != -1) && (debugOverride != 0);
+    return requestedByFlag && hwInfo.capabilityTable.ftrSvm;
+}
+
 } // namespace NEO