Skip to content

Commit 9ff1307

Browse files
Fix optimize timestamp packet dependencies
- program barrier after global fence allocation is programmed
- do not double barrier timestamp in blit enqueue
- flush GPGPU while submitting to BCS when barrier is requested

Signed-off-by: Lukasz Jobczyk <[email protected]>
1 parent 45d2386 commit 9ff1307

File tree

7 files changed

+112
-56
lines changed

7 files changed

+112
-56
lines changed

opencl/source/command_queue/command_queue_hw.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -521,7 +521,7 @@ class CommandQueueHw : public CommandQueue {
521521
KernelOperation *blockedCommandsData,
522522
TimestampPacketDependencies &timestampPacketDependencies);
523523

524-
bool isGpgpuSubmissionForBcsRequired(bool queueBlocked) const;
524+
bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies) const;
525525
void setupEvent(EventBuilder &eventBuilder, cl_event *outEvent, uint32_t cmdType);
526526

527527
bool isBlitAuxTranslationRequired(const MultiDispatchInfo &multiDispatchInfo);

opencl/source/command_queue/command_queue_hw_base.inl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2019-2021 Intel Corporation
2+
* Copyright (C) 2019-2022 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -193,8 +193,8 @@ bool CommandQueueHw<Family>::obtainTimestampPacketForCacheFlush(bool isCacheFlus
193193
}
194194

195195
template <typename Family>
196-
bool CommandQueueHw<Family>::isGpgpuSubmissionForBcsRequired(bool queueBlocked) const {
197-
if (queueBlocked) {
196+
bool CommandQueueHw<Family>::isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies) const {
197+
if (queueBlocked || timestampPacketDependencies.barrierNodes.peekNodes().size() > 0u) {
198198
return true;
199199
}
200200

opencl/source/command_queue/enqueue_common.h

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -202,10 +202,6 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
202202
timestampPacketDependencies, eventsRequest, blockQueue);
203203
}
204204

205-
if (!blockQueue && isOOQEnabled()) {
206-
setupBarrierTimestampForBcsEngines(computeCommandStreamReceiver.getOsContext().getEngineType(), timestampPacketDependencies);
207-
}
208-
209205
if (eventBuilder.getEvent() && computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) {
210206
eventBuilder.getEvent()->addTimestampPacketNodes(*timestampPacketContainer);
211207
eventBuilder.getEvent()->addTimestampPacketNodes(timestampPacketDependencies.nonAuxToAuxNodes);
@@ -257,6 +253,10 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
257253
const EnqueueProperties enqueueProperties(false, !multiDispatchInfo.empty(), isCacheFlushCommand(commandType),
258254
flushDependenciesForNonKernelCommand, isMarkerWithProfiling, &blitPropertiesContainer);
259255

256+
if (!blockQueue && isOOQEnabled()) {
257+
setupBarrierTimestampForBcsEngines(computeCommandStreamReceiver.getOsContext().getEngineType(), timestampPacketDependencies);
258+
}
259+
260260
bool migratedMemory = false;
261261

262262
this->wasNonKernelOperationSent |= enqueueProperties.operation != EnqueueProperties::Operation::GpuKernel;
@@ -948,7 +948,7 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueCommandWithoutKernel(
948948
CompletionStamp completionStamp = {this->taskCount, this->taskLevel, this->flushStamp->peekStamp()};
949949
bool flushGpgpuCsr = true;
950950

951-
if ((enqueueProperties.operation == EnqueueProperties::Operation::Blit) && !isGpgpuSubmissionForBcsRequired(false)) {
951+
if ((enqueueProperties.operation == EnqueueProperties::Operation::Blit) && !isGpgpuSubmissionForBcsRequired(false, timestampPacketDependencies)) {
952952
flushGpgpuCsr = false;
953953
} else {
954954
csrDeps.makeResident(getGpgpuCommandStreamReceiver());
@@ -1096,21 +1096,25 @@ void CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDispat
10961096
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, bcsCsr, CsrDependencies::DependenciesType::All);
10971097
auto allocator = bcsCsr.getTimestampPacketAllocator();
10981098

1099-
if (isCacheFlushForBcsRequired() && isGpgpuSubmissionForBcsRequired(blockQueue)) {
1100-
timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag());
1101-
}
1102-
11031099
if (!blockQueue) {
11041100
setupBarrierTimestampForBcsEngines(bcsCsr.getOsContext().getEngineType(), timestampPacketDependencies);
1101+
if (isOOQEnabled()) {
1102+
TimestampPacketContainer clearBarrierNodes;
1103+
timestampPacketDependencies.barrierNodes.swapNodes(clearBarrierNodes);
1104+
}
11051105
}
11061106
processBarrierTimestampForBcsEngine(bcsCsr.getOsContext().getEngineType(), timestampPacketDependencies);
11071107

1108+
if (isCacheFlushForBcsRequired() && isGpgpuSubmissionForBcsRequired(blockQueue, timestampPacketDependencies)) {
1109+
timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag());
1110+
}
1111+
11081112
obtainNewTimestampPacketNodes(1, timestampPacketDependencies.previousEnqueueNodes, clearAllDependencies, bcsCsr);
11091113
csrDeps.timestampPacketContainer.push_back(&timestampPacketDependencies.previousEnqueueNodes);
11101114

11111115
LinearStream *gpgpuCommandStream = {};
11121116
size_t gpgpuCommandStreamStart = {};
1113-
if (isGpgpuSubmissionForBcsRequired(blockQueue)) {
1117+
if (isGpgpuSubmissionForBcsRequired(blockQueue, timestampPacketDependencies)) {
11141118
gpgpuCommandStream = obtainCommandStream<cmdType>(csrDeps, true, blockQueue, multiDispatchInfo, eventsRequest, blockedCommandsData, nullptr, 0, false);
11151119
gpgpuCommandStreamStart = gpgpuCommandStream->getUsed();
11161120
}

opencl/test/unit_test/command_queue/command_queue_fixture.h

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,17 @@
11
/*
2-
* Copyright (C) 2018-2021 Intel Corporation
2+
* Copyright (C) 2018-2022 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
66
*/
77

88
#pragma once
9+
#include "shared/test/common/helpers/debug_manager_state_restore.h"
10+
#include "shared/test/common/test_macros/test_checks_shared.h"
11+
912
#include "opencl/source/command_queue/command_queue.h"
13+
#include "opencl/test/unit_test/fixtures/cl_device_fixture.h"
14+
#include "opencl/test/unit_test/fixtures/context_fixture.h"
1015
#include "opencl/test/unit_test/mocks/mock_context.h"
1116

1217
#include "CL/cl.h"
@@ -85,4 +90,37 @@ static const cl_command_queue_properties DefaultCommandQueueProperties[] = {
8590
0,
8691
CL_QUEUE_PROFILING_ENABLE,
8792
};
93+
94+
template <bool ooq>
95+
struct CommandQueueHwBlitTest : ClDeviceFixture, ContextFixture, CommandQueueHwFixture, ::testing::Test {
96+
using ContextFixture::SetUp;
97+
98+
void SetUp() override {
99+
hwInfo = *::defaultHwInfo;
100+
hwInfo.capabilityTable.blitterOperationsSupported = true;
101+
REQUIRE_FULL_BLITTER_OR_SKIP(&hwInfo);
102+
103+
DebugManager.flags.EnableBlitterOperationsSupport.set(1);
104+
DebugManager.flags.EnableTimestampPacket.set(1);
105+
DebugManager.flags.PreferCopyEngineForCopyBufferToBuffer.set(1);
106+
ClDeviceFixture::SetUpImpl(&hwInfo);
107+
cl_device_id device = pClDevice;
108+
ContextFixture::SetUp(1, &device);
109+
cl_command_queue_properties queueProperties = ooq ? CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE : 0;
110+
CommandQueueHwFixture::SetUp(pClDevice, queueProperties);
111+
}
112+
113+
void TearDown() override {
114+
CommandQueueHwFixture::TearDown();
115+
ContextFixture::TearDown();
116+
ClDeviceFixture::TearDown();
117+
}
118+
119+
HardwareInfo hwInfo{};
120+
DebugManagerStateRestore state{};
121+
};
122+
123+
using IoqCommandQueueHwBlitTest = CommandQueueHwBlitTest<false>;
124+
using OoqCommandQueueHwBlitTest = CommandQueueHwBlitTest<true>;
125+
88126
} // namespace NEO

opencl/test/unit_test/command_queue/command_queue_hw_tests.cpp

Lines changed: 5 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -15,16 +15,12 @@
1515
#include "shared/test/common/mocks/mock_os_library.h"
1616
#include "shared/test/common/mocks/mock_source_level_debugger.h"
1717
#include "shared/test/common/test_macros/matchers.h"
18-
#include "shared/test/common/test_macros/test.h"
19-
#include "shared/test/common/test_macros/test_checks_shared.h"
2018
#include "shared/test/unit_test/utilities/base_object_utils.h"
2119

2220
#include "opencl/source/built_ins/builtins_dispatch_builder.h"
2321
#include "opencl/source/helpers/dispatch_info_builder.h"
2422
#include "opencl/test/unit_test/command_queue/command_queue_fixture.h"
2523
#include "opencl/test/unit_test/fixtures/buffer_fixture.h"
26-
#include "opencl/test/unit_test/fixtures/cl_device_fixture.h"
27-
#include "opencl/test/unit_test/fixtures/context_fixture.h"
2824
#include "opencl/test/unit_test/fixtures/image_fixture.h"
2925
#include "opencl/test/unit_test/mocks/mock_buffer.h"
3026
#include "opencl/test/unit_test/mocks/mock_command_queue.h"
@@ -1501,38 +1497,6 @@ HWTEST_F(CommandQueueHwTest, givenFinishWhenFlushBatchedSubmissionsFailsThenErro
15011497
EXPECT_EQ(CL_OUT_OF_RESOURCES, errorCode);
15021498
}
15031499

1504-
template <bool ooq>
1505-
struct CommandQueueHwBlitTest : ClDeviceFixture, ContextFixture, CommandQueueHwFixture, ::testing::Test {
1506-
using ContextFixture::SetUp;
1507-
1508-
void SetUp() override {
1509-
hwInfo = *::defaultHwInfo;
1510-
hwInfo.capabilityTable.blitterOperationsSupported = true;
1511-
REQUIRE_FULL_BLITTER_OR_SKIP(&hwInfo);
1512-
1513-
DebugManager.flags.EnableBlitterOperationsSupport.set(1);
1514-
DebugManager.flags.EnableTimestampPacket.set(1);
1515-
DebugManager.flags.PreferCopyEngineForCopyBufferToBuffer.set(1);
1516-
ClDeviceFixture::SetUpImpl(&hwInfo);
1517-
cl_device_id device = pClDevice;
1518-
ContextFixture::SetUp(1, &device);
1519-
cl_command_queue_properties queueProperties = ooq ? CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE : 0;
1520-
CommandQueueHwFixture::SetUp(pClDevice, queueProperties);
1521-
}
1522-
1523-
void TearDown() override {
1524-
CommandQueueHwFixture::TearDown();
1525-
ContextFixture::TearDown();
1526-
ClDeviceFixture::TearDown();
1527-
}
1528-
1529-
HardwareInfo hwInfo{};
1530-
DebugManagerStateRestore state{};
1531-
};
1532-
1533-
using IoqCommandQueueHwBlitTest = CommandQueueHwBlitTest<false>;
1534-
using OoqCommandQueueHwBlitTest = CommandQueueHwBlitTest<true>;
1535-
15361500
HWTEST_F(IoqCommandQueueHwBlitTest, givenGpgpuCsrWhenEnqueueingSubsequentBlitsThenGpgpuCommandStreamIsNotObtained) {
15371501
auto &gpgpuCsr = pDevice->getUltCommandStreamReceiver<FamilyType>();
15381502
auto srcBuffer = std::unique_ptr<Buffer>{BufferHelper<>::create(pContext)};
@@ -1648,6 +1612,7 @@ HWTEST_F(OoqCommandQueueHwBlitTest, givenBlitAfterBarrierWhenEnqueueingCommandTh
16481612
HWTEST_F(OoqCommandQueueHwBlitTest, givenBlitBeforeBarrierWhenEnqueueingCommandThenWaitForBlitBeforeBarrier) {
16491613
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
16501614
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
1615+
using XY_COPY_BLT = typename FamilyType::XY_COPY_BLT;
16511616

16521617
if (pCmdQ->getTimestampPacketContainer() == nullptr) {
16531618
GTEST_SKIP();
@@ -1702,6 +1667,10 @@ HWTEST_F(OoqCommandQueueHwBlitTest, givenBlitBeforeBarrierWhenEnqueueingCommandT
17021667
const auto semaphore = genCmdCast<MI_SEMAPHORE_WAIT *>(*semaphoreItor);
17031668
EXPECT_EQ(barrierNodeAddress, semaphore->getSemaphoreGraphicsAddress());
17041669
EXPECT_EQ(bcsHwParser.cmdList.end(), find<PIPE_CONTROL *>(semaphoreItor, bcsHwParser.cmdList.end()));
1670+
1671+
// Only one barrier semaphore from first BCS enqueue
1672+
const auto blitItor = find<XY_COPY_BLT *>(bcsHwParser.cmdList.begin(), bcsHwParser.cmdList.end());
1673+
EXPECT_EQ(1u, findAll<MI_SEMAPHORE_WAIT *>(bcsHwParser.cmdList.begin(), blitItor).size());
17051674
}
17061675

17071676
EXPECT_EQ(CL_SUCCESS, pCmdQ->finish());

opencl/test/unit_test/command_queue/command_queue_tests_pvc_and_later.cpp

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,24 @@
11
/*
2-
* Copyright (C) 2021 Intel Corporation
2+
* Copyright (C) 2021-2022 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
66
*/
77

88
#include "shared/source/helpers/engine_node_helper.h"
9+
#include "shared/test/common/cmd_parse/hw_parse.h"
910
#include "shared/test/common/helpers/debug_manager_state_restore.h"
1011
#include "shared/test/common/mocks/mock_graphics_allocation.h"
1112
#include "shared/test/common/test_macros/test.h"
13+
#include "shared/test/unit_test/utilities/base_object_utils.h"
1214

15+
#include "opencl/test/unit_test/command_queue/command_queue_fixture.h"
16+
#include "opencl/test/unit_test/fixtures/buffer_fixture.h"
1317
#include "opencl/test/unit_test/mocks/mock_buffer.h"
1418
#include "opencl/test/unit_test/mocks/mock_cl_device.h"
1519
#include "opencl/test/unit_test/mocks/mock_command_queue.h"
1620
#include "opencl/test/unit_test/mocks/mock_context.h"
21+
#include "opencl/test/unit_test/mocks/mock_kernel.h"
1722

1823
using namespace NEO;
1924

@@ -460,3 +465,42 @@ HWTEST2_F(BcsCsrSelectionCommandQueueTests, givenMultipleEnginesInQueueWhenSelec
460465
EXPECT_EQ(queue->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS2), &queue->selectCsrForBuiltinOperation(args));
461466
}
462467
}
468+
469+
HWTEST2_F(OoqCommandQueueHwBlitTest, givenBarrierBeforeFirstKernelWhenEnqueueNDRangeThenProgramBarrierBeforeGlobalAllocation, IsPVC) {
470+
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
471+
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
472+
using STATE_SYSTEM_MEM_FENCE_ADDRESS = typename FamilyType::STATE_SYSTEM_MEM_FENCE_ADDRESS;
473+
using MI_MEM_FENCE = typename FamilyType::MI_MEM_FENCE;
474+
475+
if (pCmdQ->getTimestampPacketContainer() == nullptr) {
476+
GTEST_SKIP();
477+
}
478+
DebugManagerStateRestore restore{};
479+
DebugManager.flags.DoCpuCopyOnReadBuffer.set(0);
480+
DebugManager.flags.ForceCacheFlushForBcs.set(0);
481+
DebugManager.flags.UpdateTaskCountFromWait.set(1);
482+
DebugManager.flags.ProgramGlobalFenceAsMiMemFenceCommandInCommandStream.set(1);
483+
484+
MockKernelWithInternals mockKernelWithInternals(*pClDevice);
485+
MockKernel *kernel = mockKernelWithInternals.mockKernel;
486+
size_t offset = 0;
487+
size_t gws = 1;
488+
BufferDefaults::context = context;
489+
auto buffer = clUniquePtr(BufferHelper<>::create());
490+
char ptr[1] = {};
491+
492+
EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueReadBuffer(buffer.get(), CL_FALSE, 0, 1u, ptr, nullptr, 0, nullptr, nullptr));
493+
EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueReadBuffer(buffer.get(), CL_FALSE, 0, 1u, ptr, nullptr, 0, nullptr, nullptr));
494+
EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueBarrierWithWaitList(0, nullptr, nullptr));
495+
auto ccsStart = pCmdQ->getGpgpuCommandStreamReceiver().getCS().getUsed();
496+
497+
EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueKernel(kernel, 1, &offset, &gws, nullptr, 0, nullptr, nullptr));
498+
499+
HardwareParse ccsHwParser;
500+
ccsHwParser.parseCommands<FamilyType>(pCmdQ->getGpgpuCommandStreamReceiver().getCS(0), ccsStart);
501+
502+
const auto memFenceStateItor = find<STATE_SYSTEM_MEM_FENCE_ADDRESS *>(ccsHwParser.cmdList.begin(), ccsHwParser.cmdList.end());
503+
const auto memFenceItor = find<MI_MEM_FENCE *>(memFenceStateItor, ccsHwParser.cmdList.end());
504+
EXPECT_NE(ccsHwParser.cmdList.end(), memFenceItor);
505+
EXPECT_NE(ccsHwParser.cmdList.end(), memFenceStateItor);
506+
}

shared/source/command_stream/command_stream_receiver_hw_base.inl

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -330,11 +330,8 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
330330
TimestampPacketHelper::programCsrDependenciesForForTaskCountContainer<GfxFamily>(commandStreamCSR, dispatchFlags.csrDependencies);
331331

332332
programActivePartitionConfigFlushTask(commandStreamCSR);
333-
if (stallingCommandsOnNextFlushRequired) {
334-
programStallingCommandsForBarrier(commandStreamCSR, dispatchFlags);
335-
}
336-
337333
programEngineModeCommands(commandStreamCSR, dispatchFlags);
334+
338335
if (pageTableManager.get() && !pageTableManagerInitialized) {
339336
pageTableManagerInitialized = pageTableManager->initPageTableManagerRegisters(this);
340337
}
@@ -359,6 +356,10 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
359356

360357
programPreemption(commandStreamCSR, dispatchFlags);
361358

359+
if (stallingCommandsOnNextFlushRequired) {
360+
programStallingCommandsForBarrier(commandStreamCSR, dispatchFlags);
361+
}
362+
362363
bool dshDirty = dshState.updateAndCheck(&dsh);
363364
bool iohDirty = iohState.updateAndCheck(&ioh);
364365
bool sshDirty = sshState.updateAndCheck(&ssh);

0 commit comments

Comments
 (0)