Skip to content

Commit 49d4b8f

Browse files
Add implicit scaling capability to L0 barriers
Related-To: NEO-6262
Signed-off-by: Zbigniew Zdanowicz <[email protected]>
1 parent 870b324 commit 49d4b8f

File tree

7 files changed

+244
-54
lines changed

7 files changed

+244
-54
lines changed

level_zero/core/source/cmdlist/cmdlist_hw.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@
1313
#include "level_zero/core/source/cmdlist/cmdlist_imp.h"
1414

1515
#include "igfxfmid.h"
16+
#include "pipe_control_args.h"
1617

1718
namespace NEO {
1819
enum class ImageType;
@@ -240,6 +241,8 @@ struct CommandListCoreFamily : CommandListImp {
240241
void appendSignalEventPostWalker(ze_event_handle_t hEvent);
241242
void programStateBaseAddress(NEO::CommandContainer &container, bool genericMediaStateClearRequired);
242243
void programThreadArbitrationPolicy(Device *device);
244+
void appendComputeBarrierCommand();
245+
NEO::PipeControlArgs createBarrierFlags();
243246

244247
uint64_t getInputBufferSize(NEO::ImageType imageType, uint64_t bytesPerPixel, const ze_image_region_t *region);
245248
MOCKABLE_VIRTUAL AlignedAllocationData getAlignedAllocation(Device *device, const void *buffer, uint64_t bufferSize);

level_zero/core/source/cmdlist/cmdlist_hw.inl

Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2288,4 +2288,32 @@ void CommandListCoreFamily<gfxCoreFamily>::programStateBaseAddress(NEO::CommandC
22882288
template <GFXCORE_FAMILY gfxCoreFamily>
22892289
void CommandListCoreFamily<gfxCoreFamily>::adjustWriteKernelTimestamp(uint64_t globalAddress, uint64_t contextAddress, bool maskLsb, uint32_t mask) {}
22902290

2291+
template <GFXCORE_FAMILY gfxCoreFamily>
2292+
ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendBarrier(ze_event_handle_t hSignalEvent,
2293+
uint32_t numWaitEvents,
2294+
ze_event_handle_t *phWaitEvents) {
2295+
2296+
ze_result_t ret = addEventsToCmdList(numWaitEvents, phWaitEvents);
2297+
if (ret) {
2298+
return ret;
2299+
}
2300+
appendEventForProfiling(hSignalEvent, true);
2301+
2302+
if (!hSignalEvent) {
2303+
if (isCopyOnly()) {
2304+
size_t estimatedSizeRequired = NEO::EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite();
2305+
increaseCommandStreamSpace(estimatedSizeRequired);
2306+
2307+
NEO::MiFlushArgs args;
2308+
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(*commandContainer.getCommandStream(), 0, 0, args);
2309+
} else {
2310+
appendComputeBarrierCommand();
2311+
}
2312+
} else {
2313+
appendSignalEventPostWalker(hSignalEvent);
2314+
}
2315+
2316+
return ZE_RESULT_SUCCESS;
2317+
}
2318+
22912319
} // namespace L0

level_zero/core/source/cmdlist/cmdlist_hw_base.inl

Lines changed: 15 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -33,32 +33,6 @@ size_t CommandListCoreFamily<gfxCoreFamily>::getReserveSshSize() {
3333
return helper.getRenderSurfaceStateSize();
3434
}
3535

36-
// Deleted side of this hunk ("-" gutter markers): the appendBarrier definition
// that this diff takes out of cmdlist_hw_base.inl.
// Flow: add the wait events (returning their result if non-zero), append profiling
// for hSignalEvent, then either signal the event post-walker or, when no event was
// given, program MI_FLUSH_DW (copy-only list) or a pipe control built from
// default-constructed PipeControlArgs (compute list).
template <GFXCORE_FAMILY gfxCoreFamily>
37-
ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendBarrier(ze_event_handle_t hSignalEvent,
38-
uint32_t numWaitEvents,
39-
ze_event_handle_t *phWaitEvents) {
40-
41-
ze_result_t ret = addEventsToCmdList(numWaitEvents, phWaitEvents);
42-
if (ret) {
43-
return ret;
44-
}
45-
appendEventForProfiling(hSignalEvent, true);
46-
47-
if (!hSignalEvent) {
48-
if (isCopyOnly()) {
49-
NEO::MiFlushArgs args;
50-
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(*commandContainer.getCommandStream(), 0, 0, args);
51-
} else {
52-
// Compute path: args are left at their defaults in this variant.
NEO::PipeControlArgs args;
53-
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControl(*commandContainer.getCommandStream(), args);
54-
}
55-
} else {
56-
appendSignalEventPostWalker(hSignalEvent);
57-
}
58-
59-
return ZE_RESULT_SUCCESS;
60-
}
61-
6236
template <GFXCORE_FAMILY gfxCoreFamily>
6337
ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(ze_kernel_handle_t hKernel,
6438
const ze_group_count_t *pThreadGroupDimensions,
@@ -203,4 +177,19 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
203177
template <GFXCORE_FAMILY gfxCoreFamily>
204178
void CommandListCoreFamily<gfxCoreFamily>::appendMultiPartitionPrologue(uint32_t partitionDataSize) {}
205179

180+
template <GFXCORE_FAMILY gfxCoreFamily>
181+
void CommandListCoreFamily<gfxCoreFamily>::appendComputeBarrierCommand() {
182+
size_t estimatedSizeRequired = NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSinglePipeControl();
183+
increaseCommandStreamSpace(estimatedSizeRequired);
184+
185+
NEO::PipeControlArgs args = createBarrierFlags();
186+
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControl(*commandContainer.getCommandStream(), args);
187+
}
188+
189+
template <GFXCORE_FAMILY gfxCoreFamily>
190+
NEO::PipeControlArgs CommandListCoreFamily<gfxCoreFamily>::createBarrierFlags() {
191+
NEO::PipeControlArgs args;
192+
return args;
193+
}
194+
206195
} // namespace L0

level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl

Lines changed: 26 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -36,33 +36,6 @@ size_t CommandListCoreFamily<gfxCoreFamily>::getReserveSshSize() {
3636
return 4 * MemoryConstants::pageSize;
3737
}
3838

39-
// Deleted side of this hunk ("-" gutter markers): the appendBarrier definition
// that this diff takes out of cmdlist_hw_xehp_and_later.inl.
// Flow: add the wait events (returning their result if non-zero), append profiling
// for hSignalEvent, then either signal the event post-walker or, when no event was
// given, program MI_FLUSH_DW (copy-only list) or a pipe control with
// hdcPipelineFlush enabled (compute list).
template <GFXCORE_FAMILY gfxCoreFamily>
40-
ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendBarrier(ze_event_handle_t hSignalEvent,
41-
uint32_t numWaitEvents,
42-
ze_event_handle_t *phWaitEvents) {
43-
44-
ze_result_t ret = addEventsToCmdList(numWaitEvents, phWaitEvents);
45-
if (ret) {
46-
return ret;
47-
}
48-
appendEventForProfiling(hSignalEvent, true);
49-
50-
if (!hSignalEvent) {
51-
if (isCopyOnly()) {
52-
NEO::MiFlushArgs args;
53-
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(*commandContainer.getCommandStream(), 0, 0, args);
54-
} else {
55-
NEO::PipeControlArgs args;
56-
// The only non-default pipe-control argument set in this variant.
args.hdcPipelineFlush = true;
57-
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControl(*commandContainer.getCommandStream(), args);
58-
}
59-
} else {
60-
appendSignalEventPostWalker(hSignalEvent);
61-
}
62-
63-
return ZE_RESULT_SUCCESS;
64-
}
65-
6639
template <GFXCORE_FAMILY gfxCoreFamily>
6740
void CommandListCoreFamily<gfxCoreFamily>::applyMemoryRangesBarrier(uint32_t numRanges,
6841
const size_t *pRangeSizes,
@@ -345,4 +318,30 @@ void CommandListCoreFamily<gfxCoreFamily>::appendMultiPartitionPrologue(uint32_t
345318
true);
346319
}
347320

321+
template <GFXCORE_FAMILY gfxCoreFamily>
322+
void CommandListCoreFamily<gfxCoreFamily>::appendComputeBarrierCommand() {
323+
NEO::PipeControlArgs args = createBarrierFlags();
324+
if (this->partitionCount > 1) {
325+
size_t estimatedSizeRequired = NEO::ImplicitScalingDispatch<GfxFamily>::getBarrierSize(true);
326+
increaseCommandStreamSpace(estimatedSizeRequired);
327+
328+
NEO::ImplicitScalingDispatch<GfxFamily>::dispatchBarrierCommands(*commandContainer.getCommandStream(),
329+
device->getNEODevice()->getDeviceBitfield(),
330+
args,
331+
true,
332+
true);
333+
} else {
334+
size_t estimatedSizeRequired = NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSinglePipeControl();
335+
increaseCommandStreamSpace(estimatedSizeRequired);
336+
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControl(*commandContainer.getCommandStream(), args);
337+
}
338+
}
339+
340+
template <GFXCORE_FAMILY gfxCoreFamily>
341+
NEO::PipeControlArgs CommandListCoreFamily<gfxCoreFamily>::createBarrierFlags() {
342+
NEO::PipeControlArgs args;
343+
args.hdcPipelineFlush = true;
344+
return args;
345+
}
346+
348347
} // namespace L0

level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.h

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -44,5 +44,35 @@ class CommandListFixture : public DeviceFixture {
4444
std::unique_ptr<Event> event;
4545
};
4646

47+
struct MultiTileCommandListFixture : public SingleRootMultiSubDeviceFixture {
48+
void SetUp() {
49+
SingleRootMultiSubDeviceFixture::SetUp();
50+
ze_result_t returnValue;
51+
commandList.reset(whitebox_cast(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue)));
52+
53+
commandList->partitionCount = 2;
54+
55+
ze_event_pool_desc_t eventPoolDesc = {};
56+
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
57+
eventPoolDesc.count = 2;
58+
59+
ze_event_desc_t eventDesc = {};
60+
eventDesc.index = 0;
61+
eventDesc.wait = 0;
62+
eventDesc.signal = 0;
63+
64+
eventPool = std::unique_ptr<EventPool>(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc));
65+
event = std::unique_ptr<Event>(Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
66+
}
67+
68+
void TearDown() {
69+
SingleRootMultiSubDeviceFixture::TearDown();
70+
}
71+
72+
std::unique_ptr<L0::ult::CommandList> commandList;
73+
std::unique_ptr<EventPool> eventPool;
74+
std::unique_ptr<Event> event;
75+
};
76+
4777
} // namespace ult
4878
} // namespace L0

level_zero/core/test/unit_tests/mocks/mock_cmdlist.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -51,6 +51,7 @@ struct WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>
5151
using BaseClass::hostPtrMap;
5252
using BaseClass::indirectAllocationsAllowed;
5353
using BaseClass::initialize;
54+
using BaseClass::partitionCount;
5455
using BaseClass::patternAllocations;
5556
using BaseClass::requiredStreamState;
5657
using BaseClass::unifiedMemoryControls;
@@ -70,6 +71,7 @@ struct WhiteBox<L0::CommandListCoreFamilyImmediate<gfxCoreFamily>>
7071
using BaseClass::clearCommandsToPatch;
7172
using BaseClass::commandsToPatch;
7273
using BaseClass::finalStreamState;
74+
using BaseClass::partitionCount;
7375
using BaseClass::requiredStreamState;
7476

7577
WhiteBox() : BaseClass(BaseClass::defaultNumIddsPerBlock) {}
@@ -82,6 +84,7 @@ struct WhiteBox<::L0::CommandList> : public ::L0::CommandListImp {
8284
using BaseClass::commandContainer;
8385
using BaseClass::commandListPreemptionMode;
8486
using BaseClass::initialize;
87+
using BaseClass::partitionCount;
8588

8689
WhiteBox(Device *device);
8790
~WhiteBox() override;

level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_barrier.cpp

Lines changed: 139 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77

88
#include "shared/source/command_container/command_encoder.h"
99
#include "shared/test/common/cmd_parse/gen_cmd_parse.h"
10+
#include "shared/test/common/helpers/unit_test_helper.h"
1011

1112
#include "test.h"
1213

@@ -79,5 +80,142 @@ HWTEST_F(CommandListAppendBarrier, GivenEventVsNoEventWhenAppendingBarrierThenCo
7980

8081
ASSERT_LE(sizeWithoutEvent, sizeWithEvent);
8182
}
83+
84+
using MultiTileCommandListAppendBarrier = Test<MultiTileCommandListFixture>;
85+
86+
// Checks the exact command sequence that appendBarrier(nullptr, 0, nullptr) writes on
// the MultiTileCommandListFixture command list (its SetUp sets partitionCount to 2).
// The test first computes the expected size and GPU addresses from command sizes, then
// walks the emitted buffer command by command, advancing parsedOffset after each one.
HWTEST2_F(MultiTileCommandListAppendBarrier, WhenAppendingBarrierThenPipeControlIsGenerated, IsWithinXeGfxFamily) {
87+
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
88+
using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
89+
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
90+
using MI_ATOMIC = typename FamilyType::MI_ATOMIC;
91+
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
92+
// Bytes occupied by the five commands expected ahead of the two uint32_t fields.
93+
size_t beforeControlSectionOffset = sizeof(MI_STORE_DATA_IMM) +
94+
sizeof(PIPE_CONTROL) +
95+
sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) +
96+
sizeof(MI_BATCH_BUFFER_START);
97+
// Offset just past the two uint32_t fields; this is the expected batch-buffer-start target.
98+
size_t startOffset = beforeControlSectionOffset +
99+
(2 * sizeof(uint32_t));
100+
// Total expected size: everything up to startOffset plus the five trailing commands.
101+
size_t expectedUseBuffer = startOffset +
102+
sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) +
103+
sizeof(MI_STORE_DATA_IMM) +
104+
sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT);
105+
// Expected GPU addresses are derived from the stream position before the call.
106+
auto usedSpaceBefore = commandList->commandContainer.getCommandStream()->getUsed();
107+
auto gpuBaseAddress = commandList->commandContainer.getCommandStream()->getGraphicsAllocation()->getGpuAddress() +
108+
usedSpaceBefore;
109+
// First uint32_t field (directly after the leading commands).
110+
auto gpuCrossTileSyncAddress = gpuBaseAddress +
111+
beforeControlSectionOffset;
112+
// Second uint32_t field, immediately after the first.
113+
auto gpuFinalSyncAddress = gpuCrossTileSyncAddress +
114+
sizeof(uint32_t);
115+
116+
auto gpuStartAddress = gpuBaseAddress +
117+
startOffset;
118+
119+
auto result = commandList->appendBarrier(nullptr, 0, nullptr);
120+
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
121+
// The stream must have grown by exactly the precomputed size.
122+
auto usedSpaceAfter = commandList->commandContainer.getCommandStream()->getUsed();
123+
ASSERT_GT(usedSpaceAfter, usedSpaceBefore);
124+
size_t usedBuffer = usedSpaceAfter - usedSpaceBefore;
125+
EXPECT_EQ(expectedUseBuffer, usedBuffer);
126+
127+
void *cmdBuffer = ptrOffset(commandList->commandContainer.getCommandStream()->getCpuBase(), usedSpaceBefore);
128+
size_t parsedOffset = 0;
129+
// 1) MI_STORE_DATA_IMM writing 0 to the final-sync field.
130+
{
131+
auto storeDataImm = genCmdCast<MI_STORE_DATA_IMM *>(ptrOffset(cmdBuffer, parsedOffset));
132+
ASSERT_NE(nullptr, storeDataImm);
133+
EXPECT_EQ(gpuFinalSyncAddress, storeDataImm->getAddress());
134+
EXPECT_EQ(0u, storeDataImm->getDataDword0());
135+
parsedOffset += sizeof(MI_STORE_DATA_IMM);
136+
}
// 2) PIPE_CONTROL with command-streamer stall enabled and DC flush disabled.
137+
{
138+
auto pipeControl = genCmdCast<PIPE_CONTROL *>(ptrOffset(cmdBuffer, parsedOffset));
139+
ASSERT_NE(nullptr, pipeControl);
140+
EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable());
141+
EXPECT_FALSE(pipeControl->getDcFlushEnable());
142+
parsedOffset += sizeof(PIPE_CONTROL);
143+
}
// 3) MI_ATOMIC 4-byte increment on the cross-tile field, no return data.
144+
{
145+
auto miAtomic = genCmdCast<MI_ATOMIC *>(ptrOffset(cmdBuffer, parsedOffset));
146+
ASSERT_NE(nullptr, miAtomic);
147+
auto miAtomicProgrammedAddress = NEO::UnitTestHelper<FamilyType>::getAtomicMemoryAddress(*miAtomic);
148+
EXPECT_EQ(gpuCrossTileSyncAddress, miAtomicProgrammedAddress);
149+
EXPECT_FALSE(miAtomic->getReturnDataControl());
150+
EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode());
151+
parsedOffset += sizeof(MI_ATOMIC);
152+
}
// 4) MI_SEMAPHORE_WAIT on the cross-tile field: SAD >= SDD with data dword 2.
153+
{
154+
auto miSemaphore = genCmdCast<MI_SEMAPHORE_WAIT *>(ptrOffset(cmdBuffer, parsedOffset));
155+
ASSERT_NE(nullptr, miSemaphore);
156+
EXPECT_EQ(gpuCrossTileSyncAddress, miSemaphore->getSemaphoreGraphicsAddress());
157+
EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphore->getCompareOperation());
158+
EXPECT_EQ(2u, miSemaphore->getSemaphoreDataDword());
159+
parsedOffset += sizeof(MI_SEMAPHORE_WAIT);
160+
}
// 5) Second-level MI_BATCH_BUFFER_START targeting the address just past the two fields.
161+
{
162+
auto bbStart = genCmdCast<MI_BATCH_BUFFER_START *>(ptrOffset(cmdBuffer, parsedOffset));
163+
ASSERT_NE(nullptr, bbStart);
164+
EXPECT_EQ(gpuStartAddress, bbStart->getBatchBufferStartAddress());
165+
EXPECT_EQ(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH, bbStart->getSecondLevelBatchBuffer());
166+
parsedOffset += sizeof(MI_BATCH_BUFFER_START);
167+
}
// 6) The two uint32_t fields embedded in the stream; both must read 0.
168+
{
169+
auto crossField = reinterpret_cast<uint32_t *>(ptrOffset(cmdBuffer, parsedOffset));
170+
EXPECT_EQ(0u, *crossField);
171+
parsedOffset += sizeof(uint32_t);
172+
auto finalField = reinterpret_cast<uint32_t *>(ptrOffset(cmdBuffer, parsedOffset));
173+
EXPECT_EQ(0u, *finalField);
174+
parsedOffset += sizeof(uint32_t);
175+
}
// 7) MI_ATOMIC 4-byte increment on the final-sync field, no return data.
176+
{
177+
auto miAtomic = genCmdCast<MI_ATOMIC *>(ptrOffset(cmdBuffer, parsedOffset));
178+
ASSERT_NE(nullptr, miAtomic);
179+
auto miAtomicProgrammedAddress = NEO::UnitTestHelper<FamilyType>::getAtomicMemoryAddress(*miAtomic);
180+
EXPECT_EQ(gpuFinalSyncAddress, miAtomicProgrammedAddress);
181+
EXPECT_FALSE(miAtomic->getReturnDataControl());
182+
EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode());
183+
parsedOffset += sizeof(MI_ATOMIC);
184+
}
// 8) MI_SEMAPHORE_WAIT on the final-sync field: SAD >= SDD with data dword 2.
185+
{
186+
auto miSemaphore = genCmdCast<MI_SEMAPHORE_WAIT *>(ptrOffset(cmdBuffer, parsedOffset));
187+
ASSERT_NE(nullptr, miSemaphore);
188+
EXPECT_EQ(gpuFinalSyncAddress, miSemaphore->getSemaphoreGraphicsAddress());
189+
EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphore->getCompareOperation());
190+
EXPECT_EQ(2u, miSemaphore->getSemaphoreDataDword());
191+
parsedOffset += sizeof(MI_SEMAPHORE_WAIT);
192+
}
// 9) MI_STORE_DATA_IMM writing 0 back to the cross-tile field.
193+
{
194+
auto storeDataImm = genCmdCast<MI_STORE_DATA_IMM *>(ptrOffset(cmdBuffer, parsedOffset));
195+
ASSERT_NE(nullptr, storeDataImm);
196+
EXPECT_EQ(gpuCrossTileSyncAddress, storeDataImm->getAddress());
197+
EXPECT_EQ(0u, storeDataImm->getDataDword0());
198+
parsedOffset += sizeof(MI_STORE_DATA_IMM);
199+
}
// 10) Another MI_ATOMIC 4-byte increment on the final-sync field.
200+
{
201+
auto miAtomic = genCmdCast<MI_ATOMIC *>(ptrOffset(cmdBuffer, parsedOffset));
202+
ASSERT_NE(nullptr, miAtomic);
203+
auto miAtomicProgrammedAddress = NEO::UnitTestHelper<FamilyType>::getAtomicMemoryAddress(*miAtomic);
204+
EXPECT_EQ(gpuFinalSyncAddress, miAtomicProgrammedAddress);
205+
EXPECT_FALSE(miAtomic->getReturnDataControl());
206+
EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode());
207+
parsedOffset += sizeof(MI_ATOMIC);
208+
}
// 11) Last MI_SEMAPHORE_WAIT on the final-sync field: SAD >= SDD with data dword 4.
209+
{
210+
auto miSemaphore = genCmdCast<MI_SEMAPHORE_WAIT *>(ptrOffset(cmdBuffer, parsedOffset));
211+
ASSERT_NE(nullptr, miSemaphore);
212+
EXPECT_EQ(gpuFinalSyncAddress, miSemaphore->getSemaphoreGraphicsAddress());
213+
EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphore->getCompareOperation());
214+
EXPECT_EQ(4u, miSemaphore->getSemaphoreDataDword());
215+
parsedOffset += sizeof(MI_SEMAPHORE_WAIT);
216+
}
// Every byte of the emitted sequence must have been accounted for by the walk above.
217+
EXPECT_EQ(expectedUseBuffer, parsedOffset);
218+
}
219+
82220
} // namespace ult
83-
} // namespace L0
221+
} // namespace L0

0 commit comments

Comments
 (0)