Skip to content

Commit dacbce7

Browse files
performance: introduce staging read for cl_buffer
Related-To: NEO-14026 Signed-off-by: Szymon Morek <[email protected]>
1 parent b2b3b55 commit dacbce7

12 files changed

+272
-29
lines changed

opencl/source/api/api.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2542,7 +2542,8 @@ cl_int CL_API_CALL clEnqueueWriteBuffer(cl_command_queue commandQueue,
25422542
}
25432543

25442544
if (pCommandQueue->isValidForStagingTransfer(pBuffer, ptr, cb, CL_COMMAND_WRITE_BUFFER, blockingWrite, numEventsInWaitList > 0)) {
2545-
retVal = pCommandQueue->enqueueStagingWriteBuffer(
2545+
retVal = pCommandQueue->enqueueStagingBufferTransfer(
2546+
CL_COMMAND_WRITE_BUFFER,
25462547
pBuffer,
25472548
blockingWrite,
25482549
offset,

opencl/source/command_queue/command_queue.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,9 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
140140
virtual cl_int enqueueReadBuffer(Buffer *buffer, cl_bool blockingRead, size_t offset, size_t size, void *ptr,
141141
GraphicsAllocation *mapAllocation, cl_uint numEventsInWaitList,
142142
const cl_event *eventWaitList, cl_event *event) = 0;
143+
// Pure-virtual variant of enqueueReadBuffer: it takes the same arguments plus
// the CommandStreamReceiver (`csr`) to use, which is supplied by the caller.
virtual cl_int enqueueReadBufferImpl(Buffer *buffer, cl_bool blockingRead, size_t offset, size_t size,
144+
void *ptr, GraphicsAllocation *mapAllocation, cl_uint numEventsInWaitList,
145+
const cl_event *eventWaitList, cl_event *event, CommandStreamReceiver &csr) = 0;
143146

144147
virtual cl_int enqueueReadImage(Image *srcImage, cl_bool blockingRead, const size_t *origin, const size_t *region,
145148
size_t rowPitch, size_t slicePitch, void *ptr, GraphicsAllocation *mapAllocation,
@@ -402,7 +405,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
402405
cl_int enqueueStagingBufferMemcpy(cl_bool blockingCopy, void *dstPtr, const void *srcPtr, size_t size, cl_event *event);
403406
cl_int enqueueStagingImageTransfer(cl_command_type commandType, Image *dstImage, cl_bool blockingCopy, const size_t *globalOrigin, const size_t *globalRegion,
404407
size_t inputRowPitch, size_t inputSlicePitch, const void *ptr, cl_event *event);
405-
cl_int enqueueStagingWriteBuffer(Buffer *buffer, cl_bool blockingCopy, size_t offset, size_t size, const void *ptr, cl_event *event);
408+
// Staged transfer of `size` bytes at `offset` of `buffer`, using `ptr` as the user pointer.
// `commandType` selects the direction: CL_COMMAND_READ_BUFFER takes the read path,
// any other value takes the write path.
cl_int enqueueStagingBufferTransfer(cl_command_type commandType, Buffer *buffer, cl_bool blockingCopy, size_t offset, size_t size, const void *ptr, cl_event *event);
406409

407410
bool isValidForStagingBufferCopy(Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies);
408411
bool isValidForStagingTransfer(MemObj *memObj, const void *ptr, size_t size, cl_command_type commandType, bool isBlocking, bool hasDependencies);

opencl/source/command_queue/command_queue_hw.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,16 @@ class CommandQueueHw : public CommandQueue {
243243
const cl_event *eventWaitList,
244244
cl_event *event) override;
245245

246+
// Override of CommandQueue::enqueueReadBufferImpl: same parameters as
// enqueueReadBuffer, plus the CommandStreamReceiver (`csr`) passed in by the caller.
cl_int enqueueReadBufferImpl(Buffer *buffer,
247+
cl_bool blockingRead,
248+
size_t offset,
249+
size_t size,
250+
void *ptr,
251+
GraphicsAllocation *mapAllocation,
252+
cl_uint numEventsInWaitList,
253+
const cl_event *eventWaitList,
254+
cl_event *event, CommandStreamReceiver &csr) override;
255+
246256
cl_int enqueueReadBufferRect(Buffer *buffer,
247257
cl_bool blockingRead,
248258
const size_t *bufferOrigin,

opencl/source/command_queue/command_queue_staging.cpp

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -83,8 +83,9 @@ cl_int CommandQueue::enqueueStagingImageTransfer(cl_command_type commandType, Im
8383
return postStagingTransferSync(ret, event, profilingEvent, isSingleTransfer, blockingCopy, commandType);
8484
}
8585

86-
cl_int CommandQueue::enqueueStagingWriteBuffer(Buffer *buffer, cl_bool blockingCopy, size_t offset, size_t size, const void *ptr, cl_event *event) {
87-
CsrSelectionArgs csrSelectionArgs{CL_COMMAND_WRITE_BUFFER, {}, buffer, this->getDevice().getRootDeviceIndex(), &size};
86+
cl_int CommandQueue::enqueueStagingBufferTransfer(cl_command_type commandType, Buffer *buffer, cl_bool blockingCopy, size_t offset, size_t size, const void *ptr, cl_event *event) {
87+
auto isRead = commandType == CL_COMMAND_READ_BUFFER;
88+
CsrSelectionArgs csrSelectionArgs{commandType, isRead ? buffer : nullptr, isRead ? nullptr : buffer, this->getDevice().getRootDeviceIndex(), &size};
8889
CommandStreamReceiver &csr = selectCsrForBuiltinOperation(csrSelectionArgs);
8990
cl_event profilingEvent = nullptr;
9091

@@ -94,14 +95,26 @@ cl_int CommandQueue::enqueueStagingWriteBuffer(Buffer *buffer, cl_bool blockingC
9495
auto isLastTransfer = (offset + size == chunkOffset + chunkSize);
9596
isSingleTransfer = isFirstTransfer && isLastTransfer;
9697
cl_event *outEvent = assignEventForStaging(event, &profilingEvent, isFirstTransfer, isLastTransfer);
97-
98-
auto ret = this->enqueueWriteBufferImpl(buffer, false, chunkOffset, chunkSize, stagingBuffer, nullptr, 0, nullptr, outEvent, csr);
98+
cl_int ret = 0;
99+
if (isRead) {
100+
ret = this->enqueueReadBufferImpl(buffer, false, chunkOffset, chunkSize, stagingBuffer, nullptr, 0, nullptr, outEvent, csr);
101+
} else {
102+
ret = this->enqueueWriteBufferImpl(buffer, false, chunkOffset, chunkSize, stagingBuffer, nullptr, 0, nullptr, outEvent, csr);
103+
}
99104
ret |= this->flush();
100105
return ret;
101106
};
102107
auto stagingBufferManager = this->context->getStagingBufferManager();
103-
auto ret = stagingBufferManager->performBufferTransfer(ptr, offset, size, chunkWrite, &csr, false);
104-
return postStagingTransferSync(ret, event, profilingEvent, isSingleTransfer, blockingCopy, CL_COMMAND_WRITE_BUFFER);
108+
auto ret = stagingBufferManager->performBufferTransfer(ptr, offset, size, chunkWrite, &csr, isRead);
109+
110+
if (isRead && context->isProvidingPerformanceHints()) {
111+
context->providePerformanceHintForMemoryTransfer(commandType, true, static_cast<cl_mem>(buffer), ptr);
112+
if (!isL3Capable(ptr, size)) {
113+
context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, CL_ENQUEUE_READ_BUFFER_DOESNT_MEET_ALIGNMENT_RESTRICTIONS, ptr, size, MemoryConstants::pageSize, MemoryConstants::pageSize);
114+
}
115+
}
116+
117+
return postStagingTransferSync(ret, event, profilingEvent, isSingleTransfer, blockingCopy, commandType);
105118
}
106119

107120
/*

opencl/source/command_queue/enqueue_read_buffer.h

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,28 @@ cl_int CommandQueueHw<GfxFamily>::enqueueReadBuffer(
3434
cl_uint numEventsInWaitList,
3535
const cl_event *eventWaitList,
3636
cl_event *event) {
37-
3837
const cl_command_type cmdType = CL_COMMAND_READ_BUFFER;
3938

4039
CsrSelectionArgs csrSelectionArgs{cmdType, buffer, {}, device->getRootDeviceIndex(), &size};
4140
CommandStreamReceiver &csr = selectCsrForBuiltinOperation(csrSelectionArgs);
41+
return enqueueReadBufferImpl(buffer, blockingRead, offset, size, ptr, mapAllocation, numEventsInWaitList, eventWaitList, event, csr);
42+
}
43+
44+
template <typename GfxFamily>
45+
cl_int CommandQueueHw<GfxFamily>::enqueueReadBufferImpl(
46+
Buffer *buffer,
47+
cl_bool blockingRead,
48+
size_t offset,
49+
size_t size,
50+
void *ptr,
51+
GraphicsAllocation *mapAllocation,
52+
cl_uint numEventsInWaitList,
53+
const cl_event *eventWaitList,
54+
cl_event *event, CommandStreamReceiver &csr) {
55+
56+
const cl_command_type cmdType = CL_COMMAND_READ_BUFFER;
57+
58+
CsrSelectionArgs csrSelectionArgs{cmdType, buffer, {}, device->getRootDeviceIndex(), &size};
4259

4360
if (nullptr == mapAllocation) {
4461
notifyEnqueueReadBuffer(buffer, !!blockingRead, EngineHelpers::isBcs(csr.getOsContext().getEngineType()));

opencl/test/unit_test/api/cl_enqueue_read_buffer_tests.inl

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,13 @@ using ClEnqueueReadBufferTests = ApiTests;
2121
namespace ULT {
2222

2323
TEST_F(ClEnqueueReadBufferTests, GivenCorrectArgumentsWhenReadingBufferThenSuccessIsReturned) {
24-
MockBuffer buffer{};
24+
MockContext context{};
25+
MockGraphicsAllocation allocation{};
26+
MockBuffer buffer{&context, allocation};
27+
MockCommandQueue commandQueue{context};
2528
auto data = 1;
2629
auto retVal = clEnqueueReadBuffer(
27-
pCommandQueue,
30+
&commandQueue,
2831
&buffer,
2932
false,
3033
0,

opencl/test/unit_test/command_queue/enqueue_read_buffer_tests.cpp

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -855,3 +855,160 @@ HWTEST_F(EnqueueReadBufferHw, givenHostPtrIsFromMappedBufferWhenReadBufferIsCall
855855
EXPECT_EQ(CL_SUCCESS, retVal);
856856
EXPECT_EQ(1u, csr.createAllocationForHostSurfaceCalled);
857857
}
858+
859+
// Fixture for the staging read-buffer tests; extends EnqueueReadBufferHw with a
// small host array, a mock buffer and default queue properties.
struct ReadBufferStagingBufferTest : public EnqueueReadBufferHw {
860+
void SetUp() override {
861+
// NOTE(review): presumably skips the test when SVM is not supported - confirm macro semantics.
REQUIRE_SVM_OR_SKIP(defaultHwInfo);
862+
EnqueueReadBufferHw::SetUp();
863+
}
864+
865+
void TearDown() override {
866+
// Base TearDown is not called when the SVM feature flag is off.
if (defaultHwInfo->capabilityTable.ftrSvm == false) {
867+
return;
868+
}
869+
EnqueueReadBufferHw::TearDown();
870+
}
871+
// 2 MB; the split-transfer test reads 4 * chunkSize bytes and expects 4 read enqueues.
constexpr static size_t chunkSize = MemoryConstants::megaByte * 2;
872+
873+
// Host destination array passed as `ptr` in most tests (one cache line in size).
unsigned char ptr[MemoryConstants::cacheLineSize];
874+
MockBuffer buffer;
875+
cl_queue_properties props = {};
876+
};
877+
878+
// A staged read of the whole mock buffer must succeed, flush the queue, issue
// exactly one read enqueue and create no host-surface allocation.
HWTEST_F(ReadBufferStagingBufferTest, whenEnqueueStagingReadBufferCalledThenReturnSuccess) {
879+
MockCommandQueueHw<FamilyType> mockCommandQueueHw(context.get(), device.get(), &props);
880+
auto res = mockCommandQueueHw.enqueueStagingBufferTransfer(CL_COMMAND_READ_BUFFER, &buffer, false, 0, buffer.getSize(), ptr, nullptr);
881+
EXPECT_TRUE(mockCommandQueueHw.flushCalled);
882+
EXPECT_EQ(res, CL_SUCCESS);
883+
EXPECT_EQ(1ul, mockCommandQueueHw.enqueueReadBufferCounter);
884+
auto &csr = device->getUltCommandStreamReceiver<FamilyType>();
885+
EXPECT_EQ(0u, csr.createAllocationForHostSurfaceCalled);
886+
}
887+
888+
// With staging copies enabled via the debug flag: after a non-blocking read,
// the first isValidForStagingTransfer query for `ptr` returns true and a repeated
// query returns false, until the event's execution status is updated.
HWTEST_F(ReadBufferStagingBufferTest, whenHostPtrRegisteredThenDontUseStagingUntilEventCompleted) {
889+
DebugManagerStateRestore restorer;
890+
debugManager.flags.EnableCopyWithStagingBuffers.set(1);
891+
MockCommandQueueHw<FamilyType> mockCommandQueueHw(context.get(), device.get(), &props);
892+
893+
// NOTE(review): `event` is uninitialised until enqueueReadBuffer writes it; consider `= nullptr`.
cl_event event;
894+
auto retVal = mockCommandQueueHw.enqueueReadBuffer(&buffer,
895+
CL_FALSE,
896+
0,
897+
MemoryConstants::cacheLineSize,
898+
ptr,
899+
nullptr,
900+
0,
901+
nullptr,
902+
&event);
903+
EXPECT_EQ(CL_SUCCESS, retVal);
904+
auto pEvent = castToObjectOrAbort<Event>(event);
905+
906+
// NOTE(review): presumably the first query registers `ptr` as in use, so the second is rejected - confirm.
EXPECT_TRUE(mockCommandQueueHw.isValidForStagingTransfer(&buffer, ptr, MemoryConstants::cacheLineSize, CL_COMMAND_READ_BUFFER, false, false));
907+
EXPECT_FALSE(mockCommandQueueHw.isValidForStagingTransfer(&buffer, ptr, MemoryConstants::cacheLineSize, CL_COMMAND_READ_BUFFER, false, false));
908+
909+
pEvent->updateExecutionStatus();
910+
EXPECT_TRUE(mockCommandQueueHw.isValidForStagingTransfer(&buffer, ptr, MemoryConstants::cacheLineSize, CL_COMMAND_READ_BUFFER, false, false));
911+
912+
pEvent->release();
913+
}
914+
915+
// With staging copies enabled via the debug flag: a repeated isValidForStagingTransfer
// query for the same `ptr` returns false until finish() is called on the queue,
// after which the query returns true again.
HWTEST_F(ReadBufferStagingBufferTest, whenHostPtrRegisteredThenDontUseStagingUntilFinishCalled) {
916+
DebugManagerStateRestore restorer;
917+
debugManager.flags.EnableCopyWithStagingBuffers.set(1);
918+
MockCommandQueueHw<FamilyType> mockCommandQueueHw(context.get(), device.get(), &props);
919+
920+
EXPECT_TRUE(mockCommandQueueHw.isValidForStagingTransfer(&buffer, ptr, MemoryConstants::cacheLineSize, CL_COMMAND_READ_BUFFER, false, false));
921+
EXPECT_FALSE(mockCommandQueueHw.isValidForStagingTransfer(&buffer, ptr, MemoryConstants::cacheLineSize, CL_COMMAND_READ_BUFFER, false, false));
922+
923+
mockCommandQueueHw.finish();
924+
EXPECT_TRUE(mockCommandQueueHw.isValidForStagingTransfer(&buffer, ptr, MemoryConstants::cacheLineSize, CL_COMMAND_READ_BUFFER, false, false));
925+
}
926+
927+
// A staged read of 4 * chunkSize bytes must succeed, flush the queue, be split
// into four read enqueues and create no host-surface allocation.
HWTEST_F(ReadBufferStagingBufferTest, whenEnqueueStagingReadBufferCalledWithLargeSizeThenSplitTransfer) {
928+
// NOTE(review): raw new[]/delete[] pair; std::unique_ptr<unsigned char[]> or std::vector
// would release the memory even if the test body exits early.
auto hostPtr = new unsigned char[chunkSize * 4];
929+
MockCommandQueueHw<FamilyType> mockCommandQueueHw(context.get(), device.get(), &props);
930+
auto retVal = CL_SUCCESS;
931+
// This local `buffer` shadows the fixture's MockBuffer member of the same name.
std::unique_ptr<Buffer> buffer = std::unique_ptr<Buffer>(Buffer::create(context.get(),
932+
0,
933+
chunkSize * 4,
934+
nullptr,
935+
retVal));
936+
auto res = mockCommandQueueHw.enqueueStagingBufferTransfer(CL_COMMAND_READ_BUFFER, buffer.get(), false, 0, chunkSize * 4, hostPtr, nullptr);
937+
EXPECT_TRUE(mockCommandQueueHw.flushCalled);
938+
// retVal is the status written by Buffer::create; res is the status of the staged read.
EXPECT_EQ(retVal, CL_SUCCESS);
939+
EXPECT_EQ(res, CL_SUCCESS);
940+
EXPECT_EQ(4ul, mockCommandQueueHw.enqueueReadBufferCounter);
941+
auto &csr = device->getUltCommandStreamReceiver<FamilyType>();
942+
EXPECT_EQ(0u, csr.createAllocationForHostSurfaceCalled);
943+
944+
delete[] hostPtr;
945+
}
946+
947+
// A staged read that requests an output event must succeed; both the queue's last
// command type and the returned event's command type must be CL_COMMAND_READ_BUFFER.
HWTEST_F(ReadBufferStagingBufferTest, whenEnqueueStagingReadBufferCalledWithEventThenReturnValidEvent) {
948+
constexpr cl_command_type expectedLastCmd = CL_COMMAND_READ_BUFFER;
949+
MockCommandQueueHw<FamilyType> mockCommandQueueHw(context.get(), device.get(), &props);
950+
cl_event event;
951+
auto res = mockCommandQueueHw.enqueueStagingBufferTransfer(CL_COMMAND_READ_BUFFER, &buffer, false, 0, MemoryConstants::cacheLineSize, ptr, &event);
952+
EXPECT_EQ(res, CL_SUCCESS);
953+
954+
// NOTE(review): C-style cast; another test in this file uses castToObjectOrAbort<Event>(event).
auto pEvent = (Event *)event;
955+
EXPECT_EQ(expectedLastCmd, mockCommandQueueHw.lastCommandType);
956+
EXPECT_EQ(expectedLastCmd, pEvent->getCommandType());
957+
958+
clReleaseEvent(event);
959+
}
960+
961+
// On a queue with out-of-order execution enabled, a single-chunk staged read must
// leave CL_COMMAND_READ_BUFFER as the queue's last command type and as the
// returned event's command type.
HWTEST_F(ReadBufferStagingBufferTest, givenOutOfOrderQueueWhenEnqueueStagingReadBufferCalledWithSingleTransferThenNoBarrierEnqueued) {
962+
constexpr cl_command_type expectedLastCmd = CL_COMMAND_READ_BUFFER;
963+
MockCommandQueueHw<FamilyType> mockCommandQueueHw(context.get(), device.get(), &props);
964+
mockCommandQueueHw.setOoqEnabled();
965+
cl_event event;
966+
auto res = mockCommandQueueHw.enqueueStagingBufferTransfer(CL_COMMAND_READ_BUFFER, &buffer, false, 0, MemoryConstants::cacheLineSize, ptr, &event);
967+
EXPECT_EQ(res, CL_SUCCESS);
968+
969+
auto pEvent = (Event *)event;
970+
// The last command being the read is what the "no barrier enqueued" check relies on.
EXPECT_EQ(expectedLastCmd, mockCommandQueueHw.lastCommandType);
971+
EXPECT_EQ(expectedLastCmd, pEvent->getCommandType());
972+
973+
clReleaseEvent(event);
974+
}
975+
976+
// On a queue with profiling enabled, the event returned by a staged read must
// have profiling enabled and must not be on the CPU profiling path.
HWTEST_F(ReadBufferStagingBufferTest, givenCmdQueueWithProfilingWhenEnqueueStagingReadBufferThenTimestampsSetCorrectly) {
977+
cl_event event;
978+
MockCommandQueueHw<FamilyType> mockCommandQueueHw(context.get(), device.get(), &props);
979+
mockCommandQueueHw.setProfilingEnabled();
980+
auto res = mockCommandQueueHw.enqueueStagingBufferTransfer(CL_COMMAND_READ_BUFFER, &buffer, false, 0, MemoryConstants::cacheLineSize, ptr, &event);
981+
EXPECT_EQ(res, CL_SUCCESS);
982+
983+
auto pEvent = (Event *)event;
984+
EXPECT_FALSE(pEvent->isCPUProfilingPath());
985+
EXPECT_TRUE(pEvent->isProfilingEnabled());
986+
987+
clReleaseEvent(event);
988+
}
989+
990+
// When the mock's read enqueue does not call the base implementation, the staged
// read must return CL_INVALID_OPERATION after exactly one read enqueue.
HWTEST_F(ReadBufferStagingBufferTest, whenEnqueueStagingReadBufferFailedThenPropagateErrorCode) {
991+
MockCommandQueueHw<FamilyType> mockCommandQueueHw(context.get(), device.get(), &props);
992+
// NOTE(review): presumably makes the mock return CL_INVALID_OPERATION instead of calling the base - confirm in the mock.
mockCommandQueueHw.enqueueReadBufferCallBase = false;
993+
auto res = mockCommandQueueHw.enqueueStagingBufferTransfer(CL_COMMAND_READ_BUFFER, &buffer, false, 0, MemoryConstants::cacheLineSize, ptr, nullptr);
994+
995+
EXPECT_EQ(res, CL_INVALID_OPERATION);
996+
EXPECT_EQ(1ul, mockCommandQueueHw.enqueueReadBufferCounter);
997+
}
998+
999+
// For a non-blocking read with no dependencies, isValidForStagingTransfer must
// return the same value as the product helper's isStagingBuffersEnabled().
HWTEST_F(ReadBufferStagingBufferTest, whenIsValidForStagingTransferCalledThenReturnCorrectValue) {
1000+
MockCommandQueueHw<FamilyType> mockCommandQueueHw(context.get(), device.get(), &props);
1001+
auto isStagingBuffersEnabled = device->getProductHelper().isStagingBuffersEnabled();
1002+
// This local array shadows the fixture's `ptr` member.
unsigned char ptr[16];
1003+
1004+
EXPECT_EQ(isStagingBuffersEnabled, mockCommandQueueHw.isValidForStagingTransfer(&buffer, ptr, 16, CL_COMMAND_READ_BUFFER, false, false));
1005+
}
1006+
1007+
// With the DoCpuCopyOnReadBuffer debug flag set to 1, a blocking read must not
// be reported as valid for a staging transfer.
HWTEST_F(ReadBufferStagingBufferTest, whenIsValidForStagingTransferCalledAndCpuCopyAllowedThenReturnCorrectValue) {
1008+
DebugManagerStateRestore dbgRestore;
1009+
debugManager.flags.DoCpuCopyOnReadBuffer.set(1);
1010+
MockCommandQueueHw<FamilyType> mockCommandQueueHw(context.get(), device.get(), &props);
1011+
// This local array shadows the fixture's `ptr` member.
unsigned char ptr[16];
1012+
1013+
// The fifth argument (isBlocking) is true here, unlike the other isValidForStagingTransfer tests.
EXPECT_FALSE(mockCommandQueueHw.isValidForStagingTransfer(&buffer, ptr, 16, CL_COMMAND_READ_BUFFER, true, false));
1014+
}

0 commit comments

Comments
 (0)