Skip to content

Commit 255e85c

Browse files
Add GTPin feature to allocate buffer in shared memory
Related-To: NEO-5667 Signed-off-by: Milczarek, Slawomir <[email protected]>
1 parent 671d916 commit 255e85c

File tree

7 files changed

+279
-39
lines changed

7 files changed

+279
-39
lines changed

opencl/source/gtpin/gtpin_callbacks.cpp

Lines changed: 33 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
#include "shared/source/command_stream/command_stream_receiver.h"
99
#include "shared/source/memory_manager/surface.h"
10-
#include "shared/source/utilities/spinlock.h"
10+
#include "shared/source/memory_manager/unified_memory_manager.h"
1111

1212
#include "opencl/source/cl_device/cl_device.h"
1313
#include "opencl/source/command_queue/command_queue.h"
@@ -30,13 +30,15 @@ using namespace gtpin;
3030

3131
namespace NEO {
3232

33+
using GTPinLockType = std::recursive_mutex;
34+
3335
extern gtpin::ocl::gtpin_events_t GTPinCallbacks;
3436

3537
igc_init_t *pIgcInit = nullptr;
3638
std::atomic<int> sequenceCount(1);
3739
CommandQueue *pCmdQueueForFlushTask = nullptr;
3840
std::deque<gtpinkexec_t> kernelExecQueue;
39-
SpinLock kernelExecQueueLock;
41+
GTPinLockType kernelExecQueueLock;
4042

4143
void gtpinNotifyContextCreate(cl_context context) {
4244
if (isGTPinInitialized) {
@@ -131,7 +133,7 @@ void gtpinNotifyKernelSubmit(cl_kernel kernel, void *pCmdQueue) {
131133
kExec.gtpinResource = (cl_mem)resource;
132134
kExec.commandBuffer = commandBuffer;
133135
kExec.pCommandQueue = (CommandQueue *)pCmdQueue;
134-
std::unique_lock<SpinLock> lock{kernelExecQueueLock};
136+
std::unique_lock<GTPinLockType> lock{kernelExecQueueLock};
135137
kernelExecQueue.push_back(kExec);
136138
lock.unlock();
137139
// Patch SSH[gtpinBTI] with GT-Pin resource
@@ -142,10 +144,19 @@ void gtpinNotifyKernelSubmit(cl_kernel kernel, void *pCmdQueue) {
142144
GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(genFamily);
143145
size_t gtpinBTI = pKernel->getNumberOfBindingTableStates() - 1;
144146
void *pSurfaceState = gtpinHelper.getSurfaceState(pKernel, gtpinBTI);
145-
cl_mem buffer = (cl_mem)resource;
146-
auto pBuffer = castToObjectOrAbort<Buffer>(buffer);
147-
pBuffer->setArgStateful(pSurfaceState, false, false, false, false, device,
148-
pKernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, pContext->getNumDevices());
147+
if (gtpinHelper.canUseSharedAllocation(device.getHardwareInfo())) {
148+
auto allocData = reinterpret_cast<SvmAllocationData *>(resource);
149+
auto gpuAllocation = allocData->gpuAllocations.getGraphicsAllocation(rootDeviceIndex);
150+
size_t size = gpuAllocation->getUnderlyingBufferSize();
151+
Buffer::setSurfaceState(&device, pSurfaceState, false, false, size, gpuAllocation->getUnderlyingBuffer(), 0, gpuAllocation, 0, 0,
152+
pKernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, pContext->getNumDevices());
153+
pKernel->setUnifiedMemoryExecInfo(gpuAllocation);
154+
} else {
155+
cl_mem buffer = (cl_mem)resource;
156+
auto pBuffer = castToObjectOrAbort<Buffer>(buffer);
157+
pBuffer->setArgStateful(pSurfaceState, false, false, false, false, device,
158+
pKernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, pContext->getNumDevices());
159+
}
149160
}
150161
}
151162

@@ -157,7 +168,7 @@ void gtpinNotifyPreFlushTask(void *pCmdQueue) {
157168

158169
void gtpinNotifyFlushTask(uint32_t flushedTaskCount) {
159170
if (isGTPinInitialized) {
160-
std::unique_lock<SpinLock> lock{kernelExecQueueLock};
171+
std::unique_lock<GTPinLockType> lock{kernelExecQueueLock};
161172
size_t numElems = kernelExecQueue.size();
162173
for (size_t n = 0; n < numElems; n++) {
163174
if ((kernelExecQueue[n].pCommandQueue == pCmdQueueForFlushTask) && !kernelExecQueue[n].isTaskCountValid) {
@@ -173,7 +184,7 @@ void gtpinNotifyFlushTask(uint32_t flushedTaskCount) {
173184

174185
void gtpinNotifyTaskCompletion(uint32_t completedTaskCount) {
175186
if (isGTPinInitialized) {
176-
std::unique_lock<SpinLock> lock{kernelExecQueueLock};
187+
std::unique_lock<GTPinLockType> lock{kernelExecQueueLock};
177188
size_t numElems = kernelExecQueue.size();
178189
for (size_t n = 0; n < numElems;) {
179190
if (kernelExecQueue[n].isTaskCountValid && (kernelExecQueue[n].taskCount <= completedTaskCount)) {
@@ -191,15 +202,23 @@ void gtpinNotifyTaskCompletion(uint32_t completedTaskCount) {
191202

192203
void gtpinNotifyMakeResident(void *pKernel, void *pCSR) {
193204
if (isGTPinInitialized) {
194-
std::unique_lock<SpinLock> lock{kernelExecQueueLock};
205+
std::unique_lock<GTPinLockType> lock{kernelExecQueueLock};
206+
Context &context = static_cast<Kernel *>(pKernel)->getContext();
207+
GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(context.getDevice(0)->getHardwareInfo().platform.eRenderCoreFamily);
195208
size_t numElems = kernelExecQueue.size();
196209
for (size_t n = 0; n < numElems; n++) {
197210
if ((kernelExecQueue[n].pKernel == pKernel) && !kernelExecQueue[n].isResourceResident && kernelExecQueue[n].gtpinResource) {
198211
// It's time for kernel to make resident its GT-Pin resource
199212
CommandStreamReceiver *pCommandStreamReceiver = reinterpret_cast<CommandStreamReceiver *>(pCSR);
200-
cl_mem gtpinBuffer = kernelExecQueue[n].gtpinResource;
201-
auto pBuffer = castToObjectOrAbort<Buffer>(gtpinBuffer);
202-
GraphicsAllocation *pGfxAlloc = pBuffer->getGraphicsAllocation(pCommandStreamReceiver->getRootDeviceIndex());
213+
GraphicsAllocation *pGfxAlloc = nullptr;
214+
if (gtpinHelper.canUseSharedAllocation(context.getDevice(0)->getHardwareInfo())) {
215+
auto allocData = reinterpret_cast<SvmAllocationData *>(kernelExecQueue[n].gtpinResource);
216+
pGfxAlloc = allocData->gpuAllocations.getGraphicsAllocation(pCommandStreamReceiver->getRootDeviceIndex());
217+
} else {
218+
cl_mem gtpinBuffer = kernelExecQueue[n].gtpinResource;
219+
auto pBuffer = castToObjectOrAbort<Buffer>(gtpinBuffer);
220+
pGfxAlloc = pBuffer->getGraphicsAllocation(pCommandStreamReceiver->getRootDeviceIndex());
221+
}
203222
pCommandStreamReceiver->makeResident(*pGfxAlloc);
204223
kernelExecQueue[n].isResourceResident = true;
205224
break;
@@ -210,7 +229,7 @@ void gtpinNotifyMakeResident(void *pKernel, void *pCSR) {
210229

211230
void gtpinNotifyUpdateResidencyList(void *pKernel, void *pResVec) {
212231
if (isGTPinInitialized) {
213-
std::unique_lock<SpinLock> lock{kernelExecQueueLock};
232+
std::unique_lock<GTPinLockType> lock{kernelExecQueueLock};
214233
size_t numElems = kernelExecQueue.size();
215234
for (size_t n = 0; n < numElems; n++) {
216235
if ((kernelExecQueue[n].pKernel == pKernel) && !kernelExecQueue[n].isResourceResident && kernelExecQueue[n].gtpinResource) {

opencl/source/gtpin/gtpin_helpers.cpp

Lines changed: 46 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2017-2020 Intel Corporation
2+
* Copyright (C) 2017-2021 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -8,8 +8,12 @@
88
#include "gtpin_helpers.h"
99

1010
#include "shared/source/memory_manager/memory_manager.h"
11+
#include "shared/source/memory_manager/unified_memory_manager.h"
1112

13+
#include "opencl/source/api/api.h"
14+
#include "opencl/source/cl_device/cl_device.h"
1215
#include "opencl/source/context/context.h"
16+
#include "opencl/source/gtpin/gtpin_hw_helper.h"
1317
#include "opencl/source/helpers/validators.h"
1418
#include "opencl/source/mem_obj/buffer.h"
1519

@@ -27,27 +31,39 @@ GTPIN_DI_STATUS GTPIN_DRIVER_CALLCONV gtpinCreateBuffer(context_handle_t context
2731
return GTPIN_DI_ERROR_INVALID_ARGUMENT;
2832
}
2933
size_t size = alignUp(reqSize, MemoryConstants::cacheLineSize);
30-
void *hostPtr = pContext->getMemoryManager()->allocateSystemMemory(size, MemoryConstants::pageSize);
31-
if (hostPtr == nullptr) {
32-
return GTPIN_DI_ERROR_ALLOCATION_FAILED;
34+
GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(pContext->getDevice(0)->getHardwareInfo().platform.eRenderCoreFamily);
35+
if (gtpinHelper.canUseSharedAllocation(pContext->getDevice(0)->getHardwareInfo())) {
36+
void *unfiedMemorySharedAllocation = clSharedMemAllocINTEL(pContext, pContext->getDevice(0), 0, size, 0, &diag);
37+
auto allocationsManager = pContext->getSVMAllocsManager();
38+
auto graphicsAllocation = allocationsManager->getSVMAlloc(unfiedMemorySharedAllocation);
39+
*pResource = (resource_handle_t)graphicsAllocation;
40+
} else {
41+
void *hostPtr = pContext->getMemoryManager()->allocateSystemMemory(size, MemoryConstants::pageSize);
42+
if (hostPtr == nullptr) {
43+
return GTPIN_DI_ERROR_ALLOCATION_FAILED;
44+
}
45+
cl_mem buffer = Buffer::create(pContext, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE | CL_MEM_FORCE_HOST_MEMORY_INTEL, size, hostPtr, diag);
46+
*pResource = (resource_handle_t)buffer;
3347
}
34-
cl_mem buffer = Buffer::create(pContext, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE | CL_MEM_FORCE_HOST_MEMORY_INTEL, size, hostPtr, diag);
35-
*pResource = (resource_handle_t)buffer;
3648
return GTPIN_DI_SUCCESS;
3749
}
3850

3951
// Releases a GT-Pin resource handle.
//
// context  - GT-Pin context handle; must resolve to a valid Context.
// resource - handle to release; must be non-null. It is interpreted either as
//            an SvmAllocationData* (shared-allocation path) or as a cl_mem
//            buffer (host-memory path), mirroring what gtpinCreateBuffer
//            stores in *pResource.
//
// Returns GTPIN_DI_ERROR_INVALID_ARGUMENT when the context or resource is
// invalid (or, on the host-memory path, when the handle is not a MemObj),
// and GTPIN_DI_SUCCESS otherwise.
GTPIN_DI_STATUS GTPIN_DRIVER_CALLCONV gtpinFreeBuffer(context_handle_t context, resource_handle_t resource) {
    Context *pContext = castToObject<Context>((cl_context)context);
    if ((pContext == nullptr) || (resource == nullptr)) {
        return GTPIN_DI_ERROR_INVALID_ARGUMENT;
    }
    // Select the release path with the same predicate that gtpinCreateBuffer,
    // gtpinMapBuffer and gtpinUnmapBuffer use to decide what the handle is.
    // Using a different condition here (e.g. local-memory support) could
    // reinterpret the handle as the wrong type.
    GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(pContext->getDevice(0)->getHardwareInfo().platform.eRenderCoreFamily);
    if (gtpinHelper.canUseSharedAllocation(pContext->getDevice(0)->getHardwareInfo())) {
        // Shared-allocation path: the handle is the SVM allocation record;
        // free it through the unified-memory API using its CPU-side pointer.
        auto allocData = reinterpret_cast<SvmAllocationData *>(resource);
        clMemFreeINTEL(pContext, allocData->cpuAllocation->getUnderlyingBuffer());
    } else {
        // Host-memory path: the handle is a buffer wrapping a host pointer.
        auto pMemObj = castToObject<MemObj>(resource);
        if (pMemObj == nullptr) {
            return GTPIN_DI_ERROR_INVALID_ARGUMENT;
        }
        // Free the backing host memory first, then drop the buffer object.
        alignedFree(pMemObj->getHostPtr());
        pMemObj->release();
    }
    return GTPIN_DI_SUCCESS;
}
5369

@@ -57,23 +73,31 @@ GTPIN_DI_STATUS GTPIN_DRIVER_CALLCONV gtpinMapBuffer(context_handle_t context, r
5773
if ((pContext == nullptr) || (buffer == nullptr) || (pAddress == nullptr)) {
5874
return GTPIN_DI_ERROR_INVALID_ARGUMENT;
5975
}
60-
auto pMemObj = castToObject<MemObj>(buffer);
61-
if (pMemObj == nullptr) {
62-
return GTPIN_DI_ERROR_INVALID_ARGUMENT;
76+
GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(pContext->getDevice(0)->getHardwareInfo().platform.eRenderCoreFamily);
77+
if (gtpinHelper.canUseSharedAllocation(pContext->getDevice(0)->getHardwareInfo())) {
78+
auto allocData = reinterpret_cast<SvmAllocationData *>(resource);
79+
*pAddress = reinterpret_cast<uint8_t *>(allocData->cpuAllocation->getUnderlyingBuffer());
80+
} else {
81+
auto pMemObj = castToObject<MemObj>(buffer);
82+
if (pMemObj == nullptr) {
83+
return GTPIN_DI_ERROR_INVALID_ARGUMENT;
84+
}
85+
*pAddress = reinterpret_cast<uint8_t *>(pMemObj->getHostPtr());
6386
}
64-
*pAddress = (uint8_t *)pMemObj->getHostPtr();
6587
return GTPIN_DI_SUCCESS;
6688
}
6789

6890
// Validates an unmap request for a GT-Pin resource. No unmapping work is done
// here; the function only checks its arguments.
//
// context  - GT-Pin context handle; must resolve to a valid Context.
// resource - resource handle; must be non-null.
//
// Returns GTPIN_DI_ERROR_INVALID_ARGUMENT for an invalid context/resource or,
// when shared allocations are not in use, for a handle that is not a MemObj;
// GTPIN_DI_SUCCESS otherwise.
GTPIN_DI_STATUS GTPIN_DRIVER_CALLCONV gtpinUnmapBuffer(context_handle_t context, resource_handle_t resource) {
    Context *pContext = castToObject<Context>((cl_context)context);
    if ((pContext == nullptr) || (resource == nullptr)) {
        return GTPIN_DI_ERROR_INVALID_ARGUMENT;
    }

    const auto &hwInfo = pContext->getDevice(0)->getHardwareInfo();
    GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(hwInfo.platform.eRenderCoreFamily);

    // On the shared-allocation path the handle is not a MemObj, so there is
    // nothing further to validate.
    if (gtpinHelper.canUseSharedAllocation(hwInfo)) {
        return GTPIN_DI_SUCCESS;
    }

    // Otherwise the handle must refer to a valid memory object.
    const bool isValidMemObj = (castToObject<MemObj>(resource) != nullptr);
    return isValidMemObj ? GTPIN_DI_SUCCESS : GTPIN_DI_ERROR_INVALID_ARGUMENT;
}

opencl/source/gtpin/gtpin_hw_helper.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ class GTPinHwHelper {
1717
virtual uint32_t getGenVersion() = 0;
1818
virtual bool addSurfaceState(Kernel *pKernel) = 0;
1919
virtual void *getSurfaceState(Kernel *pKernel, size_t bti) = 0;
20+
virtual bool canUseSharedAllocation(const HardwareInfo &hwInfo) const = 0;
2021

2122
protected:
2223
GTPinHwHelper(){};
@@ -32,8 +33,9 @@ class GTPinHwHelperHw : public GTPinHwHelper {
3233
uint32_t getGenVersion() override;
3334
bool addSurfaceState(Kernel *pKernel) override;
3435
void *getSurfaceState(Kernel *pKernel, size_t bti) override;
36+
bool canUseSharedAllocation(const HardwareInfo &hwInfo) const override;
3537

36-
private:
38+
protected:
3739
GTPinHwHelperHw(){};
3840
};
3941
} // namespace NEO

opencl/source/gtpin/gtpin_hw_helper.inl

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,4 +56,14 @@ void *GTPinHwHelperHw<GfxFamily>::getSurfaceState(Kernel *pKernel, size_t bti) {
5656
return pSurfaceState;
5757
}
5858

59+
template <typename GfxFamily>
60+
bool GTPinHwHelperHw<GfxFamily>::canUseSharedAllocation(const HardwareInfo &hwInfo) const {
61+
bool canUseSharedAllocation = false;
62+
if (DebugManager.flags.GTPinAllocateBufferInSharedMemory.get() != -1) {
63+
canUseSharedAllocation = !!DebugManager.flags.GTPinAllocateBufferInSharedMemory.get();
64+
}
65+
canUseSharedAllocation &= hwInfo.capabilityTable.ftrSvm;
66+
return canUseSharedAllocation;
67+
}
68+
5969
} // namespace NEO

0 commit comments

Comments
 (0)