Skip to content

Commit e151bc6

Browse files
[OCL] Flag for allocating small buffers from pool
Improves performance in workloads that create small OpenCL buffers. To enable, set the environment variable ExperimentalSmallBufferPoolAllocator=1. Known issues (will be addressed in further commits): (1) a subBuffer cannot be created from such a buffer; (2) the pool buffer allocation should be reused. Related-To: NEO-7332. Signed-off-by: Dominik Dabek <[email protected]>
1 parent 4faf1ee commit e151bc6

File tree

12 files changed

+598
-48
lines changed

12 files changed

+598
-48
lines changed

opencl/source/context/context.cpp

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@ Context::Context(
4545
Context::~Context() {
4646
gtpinNotifyContextDestroy((cl_context)this);
4747

48+
if (smallBufferPoolAllocator.isAggregatedSmallBuffersEnabled()) {
49+
smallBufferPoolAllocator.releaseSmallBufferPool();
50+
}
51+
4852
delete[] properties;
4953

5054
for (auto rootDeviceIndex = 0u; rootDeviceIndex < specialQueues.size(); rootDeviceIndex++) {
@@ -467,4 +471,65 @@ Platform *Context::getPlatformFromProperties(const cl_context_properties *proper
467471
bool Context::isSingleDeviceContext() {
468472
return devices[0]->getNumGenericSubDevices() == 0 && getNumDevices() == 1;
469473
}
474+
475+
// Creates the pool's backing buffer (aggregatedSmallBuffersPoolSize bytes, default
// flags, no host pointer) and, if that succeeds, the chunk allocator that hands out
// regions of it.
//
// context - context the backing buffer is created in.
//
// When Buffer::create returns nullptr the allocator is left without storage and
// allocateBufferFromPool will decline every request.
void Context::BufferPoolAllocator::initAggregatedSmallBuffers(Context *context) {
    static constexpr cl_mem_flags flags{};
    // The error code is intentionally not inspected; success is judged by the returned pointer.
    [[maybe_unused]] cl_int errcodeRet{};
    this->mainStorage = Buffer::create(context,
                                       flags,
                                       BufferPoolAllocator::aggregatedSmallBuffersPoolSize,
                                       nullptr,
                                       errcodeRet);
    if (!this->mainStorage) {
        return;
    }

    // Chunk addresses start at startingOffset rather than 0 because
    // allocateBufferFromPool treats a returned address of 0 as "out of space".
    this->chunkAllocator = std::make_unique<HeapAllocator>(BufferPoolAllocator::startingOffset,
                                                           BufferPoolAllocator::aggregatedSmallBuffersPoolSize,
                                                           BufferPoolAllocator::chunkAlignment);

    // MemObj::~MemObj skips its own context->decRefInternal() for the pool buffer,
    // so the matching decrement happens here.
    // NOTE(review): presumably this avoids the pool buffer keeping its owning context alive - confirm.
    context->decRefInternal();
}
490+
491+
// Tries to serve a buffer request from the small-buffer pool.
//
// memoryProperties - not used by this function.
// flags, flagsIntel - forwarded to createSubBuffer on the pool's backing buffer.
// size             - requested size in bytes; must be within smallBufferThreshold.
// hostPtr          - not used by this function.
// errcodeRet       - set to CL_MEM_OBJECT_ALLOCATION_FAILURE unless a sub-buffer
//                    is created, in which case it holds what createSubBuffer reported.
//
// Returns the sub-buffer carved out of the pool, or nullptr when pooling is
// disabled, the size is above the threshold, the pool has no backing storage,
// the pool is out of space, or sub-buffer creation fails.
Buffer *Context::BufferPoolAllocator::allocateBufferFromPool(const MemoryProperties &memoryProperties,
                                                             cl_mem_flags flags,
                                                             cl_mem_flags_intel flagsIntel,
                                                             size_t size,
                                                             void *hostPtr,
                                                             cl_int &errcodeRet) {
    errcodeRet = CL_MEM_OBJECT_ALLOCATION_FAILURE;
    if (!this->isAggregatedSmallBuffersEnabled() ||
        !this->isSizeWithinThreshold(size) ||
        !this->mainStorage) {
        return nullptr;
    }

    auto lock = std::unique_lock<std::mutex>(this->mutex);

    cl_buffer_region bufferRegion{};
    // `size` itself is handed to the allocator and reused below.
    // NOTE(review): if allocate() takes the size by reference and rounds it up,
    // the adjusted value is what ends up in the region and in free() - confirm.
    bufferRegion.origin = static_cast<size_t>(this->chunkAllocator->allocate(size));
    if (bufferRegion.origin == 0) {
        // An address of 0 means the allocator could not find a chunk.
        return nullptr;
    }
    const size_t chunkAddress = bufferRegion.origin;

    // Allocator addresses are biased by startingOffset; the sub-buffer origin is not.
    bufferRegion.origin -= BufferPoolAllocator::startingOffset;
    bufferRegion.size = size;

    auto bufferFromPool = this->mainStorage->createSubBuffer(flags, flagsIntel, &bufferRegion, errcodeRet);
    if (bufferFromPool == nullptr) {
        // Give the chunk back and make sure the caller never sees success paired with nullptr.
        this->chunkAllocator->free(chunkAddress, size);
        if (errcodeRet == CL_SUCCESS) {
            errcodeRet = CL_MEM_OBJECT_ALLOCATION_FAILURE;
        }
        return nullptr;
    }

    bufferFromPool->createFunction = this->mainStorage->createFunction;
    return bufferFromPool;
}
515+
516+
bool Context::BufferPoolAllocator::isPoolBuffer(const MemObj *buffer) const {
517+
return this->mainStorage == buffer;
518+
}
519+
520+
// Returns a region to the chunk allocator, but only when `possiblePoolBuffer`
// is the pool's backing buffer; any other object is ignored.
//
// possiblePoolBuffer - parent object of the buffer being destroyed.
// offset             - origin of the region inside the pool buffer.
// size               - size of the region in bytes.
void Context::BufferPoolAllocator::tryFreeFromPoolBuffer(MemObj *possiblePoolBuffer, size_t offset, size_t size) {
    if (!this->isPoolBuffer(possiblePoolBuffer)) {
        return;
    }

    std::unique_lock<std::mutex> guard(this->mutex);
    DEBUG_BREAK_IF(!this->mainStorage);
    // Allocator addresses are biased by startingOffset relative to sub-buffer origins.
    this->chunkAllocator->free(offset + BufferPoolAllocator::startingOffset, size);
}
528+
529+
void Context::BufferPoolAllocator::releaseSmallBufferPool() {
530+
DEBUG_BREAK_IF(!this->mainStorage);
531+
delete this->mainStorage;
532+
this->mainStorage = nullptr;
533+
}
534+
470535
} // namespace NEO

opencl/source/context/context.h

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,15 @@
1010
#include "shared/source/helpers/common_types.h"
1111
#include "shared/source/helpers/string.h"
1212
#include "shared/source/unified_memory/unified_memory.h"
13+
#include "shared/source/utilities/heap_allocator.h"
1314

1415
#include "opencl/source/cl_device/cl_device_vector.h"
1516
#include "opencl/source/context/context_type.h"
1617
#include "opencl/source/context/driver_diagnostics.h"
1718
#include "opencl/source/gtpin/gtpin_notify.h"
1819
#include "opencl/source/helpers/base_object.h"
1920
#include "opencl/source/helpers/destructor_callbacks.h"
21+
#include "opencl/source/mem_obj/buffer.h"
2022
#include "opencl/source/mem_obj/map_operations_handler.h"
2123

2224
#include <map>
@@ -40,6 +42,42 @@ struct OpenCLObjectMapper<_cl_context> {
4042

4143
class Context : public BaseObject<_cl_context> {
4244
public:
45+
class BufferPoolAllocator {
46+
public:
47+
static constexpr auto aggregatedSmallBuffersPoolSize = 64 * KB;
48+
static constexpr auto smallBufferThreshold = 4 * KB;
49+
static constexpr auto chunkAlignment = 256u;
50+
static constexpr auto startingOffset = chunkAlignment;
51+
52+
static_assert(aggregatedSmallBuffersPoolSize > smallBufferThreshold, "Largest allowed buffer needs to fit in pool");
53+
Buffer *allocateBufferFromPool(const MemoryProperties &memoryProperties,
54+
cl_mem_flags flags,
55+
cl_mem_flags_intel flagsIntel,
56+
size_t size,
57+
void *hostPtr,
58+
cl_int &errcodeRet);
59+
void tryFreeFromPoolBuffer(MemObj *possiblePoolBuffer, size_t offset, size_t size);
60+
void releaseSmallBufferPool();
61+
62+
inline bool isAggregatedSmallBuffersEnabled() const {
63+
constexpr bool enable = false;
64+
if (DebugManager.flags.ExperimentalSmallBufferPoolAllocator.get() != -1) {
65+
return !!DebugManager.flags.ExperimentalSmallBufferPoolAllocator.get();
66+
}
67+
return enable;
68+
}
69+
void initAggregatedSmallBuffers(Context *context);
70+
71+
bool isPoolBuffer(const MemObj *buffer) const;
72+
73+
protected:
74+
inline bool isSizeWithinThreshold(size_t size) const {
75+
return BufferPoolAllocator::smallBufferThreshold >= size;
76+
}
77+
Buffer *mainStorage{nullptr};
78+
std::unique_ptr<HeapAllocator> chunkAllocator;
79+
std::mutex mutex;
80+
};
4381
static const cl_ulong objectMagic = 0xA4234321DC002130LL;
4482

4583
bool createImpl(const cl_context_properties *properties,
@@ -58,6 +96,11 @@ class Context : public BaseObject<_cl_context> {
5896
if (!pContext->createImpl(properties, devices, funcNotify, data, errcodeRet)) {
5997
delete pContext;
6098
pContext = nullptr;
99+
} else {
100+
auto &bufferPoolAllocator = pContext->getBufferPoolAllocator();
101+
if (bufferPoolAllocator.isAggregatedSmallBuffersEnabled()) {
102+
bufferPoolAllocator.initAggregatedSmallBuffers(pContext);
103+
}
61104
}
62105
gtpinNotifyContextCreate(pContext);
63106
return pContext;
@@ -176,6 +219,9 @@ class Context : public BaseObject<_cl_context> {
176219
const std::map<uint32_t, DeviceBitfield> &getDeviceBitfields() const { return deviceBitfields; };
177220

178221
static Platform *getPlatformFromProperties(const cl_context_properties *properties, cl_int &errcode);
222+
// Gives access to this context's small-buffer pool allocator.
BufferPoolAllocator &getBufferPoolAllocator() {
    BufferPoolAllocator &poolAllocator = smallBufferPoolAllocator;
    return poolAllocator;
}
179225

180226
protected:
181227
struct BuiltInKernel {
@@ -211,6 +257,7 @@ class Context : public BaseObject<_cl_context> {
211257
MapOperationsStorage mapOperationsStorage = {};
212258
StackVec<CommandQueue *, 1> specialQueues;
213259
DriverDiagnostics *driverDiagnostics = nullptr;
260+
BufferPoolAllocator smallBufferPoolAllocator;
214261

215262
uint32_t maxRootDeviceIndex = std::numeric_limits<uint32_t>::max();
216263
cl_bool preferD3dSharedResources = 0u;

opencl/source/mem_obj/buffer.cpp

Lines changed: 102 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,58 @@ Buffer *Buffer::create(Context *context,
175175
flags, 0, size, hostPtr, errcodeRet);
176176
}
177177

178+
// Copies `size` bytes from hostPtr into `memory` at the buffer's offset.
//
// buffer                 - destination buffer; supplies the offset and the context.
// size                   - number of bytes to copy.
// hostPtr                - source host memory.
// memory                 - graphics allocation backing the buffer.
// mapAllocation          - forwarded to enqueueWriteBuffer on the command-queue path.
// rootDeviceIndex        - selects the special queue used on the command-queue path.
// isCompressionEnabled   - forces a GPU-side copy and disallows the CPU lock path.
// implicitScalingEnabled - disallows the CPU lock path.
// errcodeRet             - set to CL_OUT_OF_RESOURCES when enqueueWriteBuffer fails;
//                          left untouched otherwise.
//
// Returns true when the data was copied, false when the enqueueWriteBuffer fallback failed.
inline bool copyHostPointer(Buffer *buffer,
                            size_t size,
                            void *hostPtr,
                            GraphicsAllocation *memory,
                            GraphicsAllocation *mapAllocation,
                            uint32_t rootDeviceIndex,
                            bool isCompressionEnabled,
                            bool implicitScalingEnabled,
                            cl_int &errcodeRet) {
    const bool isLocalMemory = !MemoryPoolHelper::isSystemMemoryPool(memory->getMemoryPool());
    const bool gpuCopyRequired = isCompressionEnabled || isLocalMemory;

    // Uncompressed system memory: write straight through the CPU pointer.
    if (!gpuCopyRequired) {
        memcpy_s(ptrOffset(memory->getUnderlyingBuffer(), buffer->getOffset()), size, hostPtr, size);
        return true;
    }

    auto context = buffer->getContext();
    auto &device = context->getDevice(0u)->getDevice();
    auto &hwInfo = device.getHardwareInfo();
    auto hwInfoConfig = HwInfoConfig::get(hwInfo.platform.eProductFamily);

    bool copyOnCpuAllowed = implicitScalingEnabled == false &&
                            size <= Buffer::maxBufferSizeForCopyOnCpu &&
                            isCompressionEnabled == false &&
                            hwInfoConfig->getLocalMemoryAccessMode(hwInfo) != LocalMemoryAccessMode::CpuAccessDisallowed &&
                            memory->storageInfo.isLockable;
    // The debug flag overrides the heuristic whenever it is set (anything but -1).
    if (DebugManager.flags.CopyHostPtrOnCpu.get() != -1) {
        copyOnCpuAllowed = DebugManager.flags.CopyHostPtrOnCpu.get() == 1;
    }

    // First choice: lock the allocation and copy on the CPU.
    if (auto lockedPointer = copyOnCpuAllowed ? device.getMemoryManager()->lockResource(memory) : nullptr) {
        memcpy_s(ptrOffset(lockedPointer, buffer->getOffset()), size, hostPtr, size);
        memory->setAubWritable(true, GraphicsAllocation::defaultBank);
        memory->setTbxWritable(true, GraphicsAllocation::defaultBank);
        return true;
    }

    // Second choice: blit, when fully supported and the destination is local memory.
    auto blitMemoryToAllocationResult = BlitOperationResult::Unsupported;
    if (hwInfoConfig->isBlitterFullySupported(hwInfo) && isLocalMemory) {
        blitMemoryToAllocationResult = BlitHelperFunctions::blitMemoryToAllocation(device, memory, buffer->getOffset(), hostPtr, {size, 1, 1});
    }
    if (blitMemoryToAllocationResult == BlitOperationResult::Success) {
        return true;
    }

    // Last resort: blocking write through the context's special queue.
    auto cmdQ = context->getSpecialQueue(rootDeviceIndex);
    if (CL_SUCCESS != cmdQ->enqueueWriteBuffer(buffer, CL_TRUE, buffer->getOffset(), size, hostPtr, mapAllocation, 0, nullptr, nullptr)) {
        errcodeRet = CL_OUT_OF_RESOURCES;
        return false;
    }
    return true;
}
229+
178230
Buffer *Buffer::create(Context *context,
179231
const MemoryProperties &memoryProperties,
180232
cl_mem_flags flags,
@@ -184,6 +236,47 @@ Buffer *Buffer::create(Context *context,
184236
cl_int &errcodeRet) {
185237

186238
errcodeRet = CL_SUCCESS;
239+
Context::BufferPoolAllocator &bufferPoolAllocator = context->getBufferPoolAllocator();
240+
const bool implicitScalingEnabled = ImplicitScalingHelper::isImplicitScalingEnabled(context->getDevice(0u)->getDeviceBitfield(), true);
241+
const bool useHostPtr = memoryProperties.flags.useHostPtr;
242+
const bool copyHostPtr = memoryProperties.flags.copyHostPtr;
243+
if (implicitScalingEnabled == false &&
244+
useHostPtr == false &&
245+
memoryProperties.flags.forceHostMemory == false) {
246+
cl_int poolAllocRet = CL_SUCCESS;
247+
auto bufferFromPool = bufferPoolAllocator.allocateBufferFromPool(memoryProperties,
248+
flags,
249+
flagsIntel,
250+
size,
251+
hostPtr,
252+
poolAllocRet);
253+
if (CL_SUCCESS == poolAllocRet) {
254+
const bool needsCopy = copyHostPtr;
255+
if (needsCopy) {
256+
for (auto &rootDeviceIndex : context->getRootDeviceIndices()) {
257+
auto graphicsAllocation = bufferFromPool->getGraphicsAllocation(rootDeviceIndex);
258+
auto mapAllocation = bufferFromPool->getMapAllocation(rootDeviceIndex);
259+
bool isCompressionEnabled = graphicsAllocation->isCompressionEnabled();
260+
if (copyHostPointer(bufferFromPool,
261+
size,
262+
hostPtr,
263+
graphicsAllocation,
264+
mapAllocation,
265+
rootDeviceIndex,
266+
isCompressionEnabled,
267+
implicitScalingEnabled,
268+
poolAllocRet)) {
269+
break;
270+
}
271+
}
272+
}
273+
if (!needsCopy || poolAllocRet == CL_SUCCESS) {
274+
return bufferFromPool;
275+
} else {
276+
clReleaseMemObject(bufferFromPool);
277+
}
278+
}
279+
}
187280

188281
MemoryManager *memoryManager = context->getMemoryManager();
189282
UNRECOVERABLE_IF(!memoryManager);
@@ -194,9 +287,6 @@ Buffer *Buffer::create(Context *context,
194287
AllocationInfoType allocationInfos;
195288
allocationInfos.resize(maxRootDeviceIndex + 1ull);
196289

197-
const bool useHostPtr = memoryProperties.flags.useHostPtr;
198-
const bool copyHostPtr = memoryProperties.flags.copyHostPtr;
199-
200290
void *allocationCpuPtr = nullptr;
201291
bool forceCopyHostPtr = false;
202292

@@ -404,45 +494,15 @@ Buffer *Buffer::create(Context *context,
404494
pBuffer->setHostPtrMinSize(size);
405495

406496
if (allocationInfo.copyMemoryFromHostPtr && !copyExecuted) {
407-
auto isLocalMemory = !MemoryPoolHelper::isSystemMemoryPool(allocationInfo.memory->getMemoryPool());
408-
bool gpuCopyRequired = isCompressionEnabled || isLocalMemory;
409-
410-
if (gpuCopyRequired) {
411-
auto &device = pBuffer->getContext()->getDevice(0u)->getDevice();
412-
auto &hwInfo = device.getHardwareInfo();
413-
auto hwInfoConfig = HwInfoConfig::get(hwInfo.platform.eProductFamily);
414-
bool copyOnCpuAllowed = false == ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true) &&
415-
size <= Buffer::maxBufferSizeForCopyOnCpu &&
416-
!isCompressionEnabled &&
417-
hwInfoConfig->getLocalMemoryAccessMode(hwInfo) != LocalMemoryAccessMode::CpuAccessDisallowed &&
418-
allocationInfo.memory->storageInfo.isLockable;
419-
if (DebugManager.flags.CopyHostPtrOnCpu.get() != -1) {
420-
copyOnCpuAllowed = DebugManager.flags.CopyHostPtrOnCpu.get() == 1;
421-
}
422-
if (auto lockedPointer = copyOnCpuAllowed ? device.getMemoryManager()->lockResource(allocationInfo.memory) : nullptr) {
423-
memcpy_s(ptrOffset(lockedPointer, pBuffer->getOffset()), size, hostPtr, size);
424-
allocationInfo.memory->setAubWritable(true, GraphicsAllocation::defaultBank);
425-
allocationInfo.memory->setTbxWritable(true, GraphicsAllocation::defaultBank);
426-
copyExecuted = true;
427-
} else {
428-
auto blitMemoryToAllocationResult = BlitOperationResult::Unsupported;
429-
430-
if (hwInfoConfig->isBlitterFullySupported(hwInfo) && isLocalMemory) {
431-
blitMemoryToAllocationResult = BlitHelperFunctions::blitMemoryToAllocation(device, allocationInfo.memory, pBuffer->getOffset(), hostPtr, {size, 1, 1});
432-
}
433-
434-
if (blitMemoryToAllocationResult != BlitOperationResult::Success) {
435-
auto cmdQ = context->getSpecialQueue(rootDeviceIndex);
436-
if (CL_SUCCESS != cmdQ->enqueueWriteBuffer(pBuffer, CL_TRUE, 0, size, hostPtr, allocationInfo.mapAllocation, 0, nullptr, nullptr)) {
437-
errcodeRet = CL_OUT_OF_RESOURCES;
438-
}
439-
}
440-
copyExecuted = true;
441-
}
442-
} else {
443-
memcpy_s(allocationInfo.memory->getUnderlyingBuffer(), size, hostPtr, size);
444-
copyExecuted = true;
445-
}
497+
copyExecuted = copyHostPointer(pBuffer,
498+
size,
499+
hostPtr,
500+
allocationInfo.memory,
501+
allocationInfo.mapAllocation,
502+
rootDeviceIndex,
503+
isCompressionEnabled,
504+
implicitScalingEnabled,
505+
errcodeRet);
446506
}
447507
}
448508

opencl/source/mem_obj/mem_obj.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ MemObj::~MemObj() {
104104
}
105105
if (associatedMemObject) {
106106
associatedMemObject->decRefInternal();
107+
context->getBufferPoolAllocator().tryFreeFromPoolBuffer(associatedMemObject, this->offset, this->size);
107108
}
108109
if (!associatedMemObject) {
109110
releaseAllocatedMapPtr();
@@ -112,7 +113,10 @@ MemObj::~MemObj() {
112113

113114
destructorCallbacks.invoke(this);
114115

115-
context->decRefInternal();
116+
const bool needDecrementContextRefCount = !context->getBufferPoolAllocator().isPoolBuffer(this);
117+
if (needDecrementContextRefCount) {
118+
context->decRefInternal();
119+
}
116120
}
117121

118122
cl_int MemObj::getMemObjectInfo(cl_mem_info paramName,

opencl/test/unit_test/mem_obj/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
#
2-
# Copyright (C) 2018-2021 Intel Corporation
2+
# Copyright (C) 2018-2022 Intel Corporation
33
#
44
# SPDX-License-Identifier: MIT
55
#
66

77
set(IGDRCL_SRCS_tests_mem_obj
88
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
99
${CMAKE_CURRENT_SOURCE_DIR}/buffer_pin_tests.cpp
10+
${CMAKE_CURRENT_SOURCE_DIR}/buffer_pool_alloc_tests.cpp
1011
${CMAKE_CURRENT_SOURCE_DIR}/buffer_set_arg_tests.cpp
1112
${CMAKE_CURRENT_SOURCE_DIR}/buffer_tests.cpp
1213
${CMAKE_CURRENT_SOURCE_DIR}/buffer_bcs_tests.cpp

0 commit comments

Comments
 (0)