Skip to content

Commit e8cfb38

Browse files
performance: improve relaxed ordering task count tracking
Signed-off-by: Bartosz Dunajski <[email protected]>
1 parent f357ada commit e8cfb38

File tree

8 files changed

+46
-16
lines changed

8 files changed

+46
-16
lines changed

level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1448,16 +1448,13 @@ bool CommandListCoreFamilyImmediate<gfxCoreFamily>::isRelaxedOrderingDispatchAll
14481448
auto queueTaskCount = getCmdQImmediate(copyOffload)->getTaskCount();
14491449
auto csrTaskCount = csr->peekTaskCount();
14501450

1451-
if ((this->device->getNEODevice()->isInitDeviceWithFirstSubmissionSupported(csr->getType()) || this->heaplessStateInitEnabled) && csr->peekTaskCount() == 1) {
1452-
DEBUG_BREAK_IF(queueTaskCount != 0);
1453-
queueTaskCount = 1;
1454-
}
1451+
bool skipTaskCountCheck = (csrTaskCount - queueTaskCount == 1) && csr->isLatestFlushIsTaskCountUpdateOnly();
14551452

14561453
if (NEO::debugManager.flags.DirectSubmissionRelaxedOrderingCounterHeuristicTreshold.get() != -1) {
14571454
relaxedOrderingCounterThreshold = static_cast<uint32_t>(NEO::debugManager.flags.DirectSubmissionRelaxedOrderingCounterHeuristicTreshold.get());
14581455
}
14591456

1460-
if (queueTaskCount == csrTaskCount) {
1457+
if (queueTaskCount == csrTaskCount || skipTaskCountCheck) {
14611458
relaxedOrderingCounter++;
14621459
} else {
14631460
// Submission from another queue. Reset counter and keep relaxed ordering allowed

level_zero/core/source/cmdqueue/cmdqueue.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ NEO::SubmissionStatus CommandQueueImp::submitBatchBuffer(size_t offset, NEO::Res
127127

128128
NEO::BatchBuffer batchBuffer(this->startingCmdBuffer->getGraphicsAllocation(), offset, 0, 0, nullptr, false,
129129
NEO::getThrottleFromPowerSavingUint(csr->getUmdPowerHintValue()), NEO::QueueSliceCount::defaultSliceCount,
130-
this->startingCmdBuffer->getUsed(), this->startingCmdBuffer, endingCmdPtr, csr->getNumClients(), true, false, true);
130+
this->startingCmdBuffer->getUsed(), this->startingCmdBuffer, endingCmdPtr, csr->getNumClients(), true, false, true, false);
131131
batchBuffer.disableFlatRingBuffer = true;
132132

133133
if (this->startingCmdBuffer != &this->commandStream) {

level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_1.cpp

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2060,6 +2060,33 @@ HWTEST2_F(InOrderCmdListTests, givenRelaxedOrderingEnabledWhenSignalEventCalledT
20602060
verifyFlags(false, true); // relaxed ordering disabled == stalling semaphore
20612061
}
20622062

2063+
// Verifies that the relaxed-ordering counter of an immediate command list keeps
// incrementing across appends even when the CSR task count moved ahead of the
// queue task count because of a CSR-side tag-update flush in between.
HWTEST2_F(InOrderCmdListTests, givenCounterHeuristicForRelaxedOrderingEnabledWhenSmallTaskIsFlushedThenIncrementCounter, IsAtLeastXeHpcCore) {
2064+
debugManager.flags.DirectSubmissionRelaxedOrdering.set(1);
2065+
2066+
auto ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(device->getNEODevice()->getDefaultEngine().commandStreamReceiver);
2067+
2068+
// Install a mock direct submission on the default engine's CSR; the CSR takes ownership via reset().
auto directSubmission = new MockDirectSubmissionHw<FamilyType, RenderDispatcher<FamilyType>>(*ultCsr);
2069+
ultCsr->directSubmission.reset(directSubmission);
2070+
2071+
auto immCmdList = createImmCmdList<gfxCoreFamily>();
2072+
auto queue = immCmdList->getCmdQImmediate(false);
2073+
// Baseline: nothing submitted through this queue yet and the heuristic counter starts at zero.
EXPECT_EQ(0u, queue->getTaskCount());
2074+
EXPECT_EQ(0u, immCmdList->relaxedOrderingCounter);
2075+
2076+
// Each append is expected to bump the counter by one.
immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
2077+
EXPECT_EQ(1u, immCmdList->relaxedOrderingCounter);
2078+
2079+
immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
2080+
EXPECT_EQ(2u, immCmdList->relaxedOrderingCounter);
2081+
2082+
// Flush a tag update directly on the CSR, bypassing the queue; afterwards the CSR
// task count no longer matches the queue task count.
// NOTE(review): presumably this flush is the "small task" from the test name — confirm.
ultCsr->flushTagUpdate();
2083+
EXPECT_NE(ultCsr->taskCount, queue->getTaskCount());
2084+
2085+
// Despite the task count mismatch, the next append must still increment the counter
// (to 3), and the queue task count is back in sync with the CSR afterwards.
immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
2086+
EXPECT_EQ(3u, immCmdList->relaxedOrderingCounter);
2087+
EXPECT_EQ(ultCsr->taskCount, queue->getTaskCount());
2088+
}
2089+
20632090
HWTEST2_F(InOrderCmdListTests, givenCounterHeuristicForRelaxedOrderingEnabledWhenAppendingThenEnableRelaxedOrderingCorrectly, IsAtLeastXeHpcCore) {
20642091
debugManager.flags.DirectSubmissionRelaxedOrdering.set(1);
20652092

shared/source/command_stream/command_stream_receiver.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -566,6 +566,7 @@ class CommandStreamReceiver {
566566
bool enqueueWaitForPagingFence(uint64_t pagingFenceValue);
567567
virtual void unblockPagingFenceSemaphore(uint64_t pagingFenceValue) {}
568568
MOCKABLE_VIRTUAL void drainPagingFenceQueue();
569+
// Returns the latestFlushIsTaskCountUpdateOnly flag, which flushHandler() overwrites on every call with the taskCountUpdateOnly value of the batch buffer it is given.
bool isLatestFlushIsTaskCountUpdateOnly() const { return latestFlushIsTaskCountUpdateOnly; }
569570

570571
protected:
571572
void cleanupResources();
@@ -707,6 +708,7 @@ class CommandStreamReceiver {
707708
bool heaplessModeEnabled = false;
708709
bool use4GbHeaps = true;
709710
bool csrSurfaceProgrammingDone = false;
711+
bool latestFlushIsTaskCountUpdateOnly = false;
710712
};
711713

712714
typedef CommandStreamReceiver *(*CommandStreamReceiverCreateFunc)(bool withAubDump,

shared/source/command_stream/command_stream_receiver_hw_base.inl

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -266,7 +266,7 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushBcsTask(LinearStream &c
266266
BatchBuffer batchBuffer{streamToSubmit.getGraphicsAllocation(), startOffset, 0, taskStartAddress, nullptr,
267267
false, getThrottleFromPowerSavingUint(this->getUmdPowerHintValue()), NEO::QueueSliceCount::defaultSliceCount,
268268
streamToSubmit.getUsed(), &streamToSubmit, bbEndLocation, this->getNumClients(), (submitCSR || dispatchBcsFlags.hasStallingCmds),
269-
dispatchBcsFlags.hasRelaxedOrderingDependencies, dispatchBcsFlags.flushTaskCount};
269+
dispatchBcsFlags.hasRelaxedOrderingDependencies, dispatchBcsFlags.flushTaskCount, false};
270270

271271
updateStreamTaskCount(streamToSubmit, taskCount + 1);
272272
this->latestSentTaskCount = taskCount + 1;
@@ -1124,7 +1124,7 @@ TaskCountType CommandStreamReceiverHw<GfxFamily>::flushBcsTask(const BlitPropert
11241124
uint64_t taskStartAddress = commandStream.getGpuBase() + commandStreamStart;
11251125

11261126
BatchBuffer batchBuffer{commandStream.getGraphicsAllocation(), commandStreamStart, 0, taskStartAddress, nullptr, false, getThrottleFromPowerSavingUint(this->getUmdPowerHintValue()), QueueSliceCount::defaultSliceCount,
1127-
commandStream.getUsed(), &commandStream, endingCmdPtr, this->getNumClients(), hasStallingCmds, isRelaxedOrderingDispatch, blocking};
1127+
commandStream.getUsed(), &commandStream, endingCmdPtr, this->getNumClients(), hasStallingCmds, isRelaxedOrderingDispatch, blocking, false};
11281128

11291129
updateStreamTaskCount(commandStream, newTaskCount);
11301130

@@ -1258,7 +1258,7 @@ SubmissionStatus CommandStreamReceiverHw<GfxFamily>::flushSmallTask(LinearStream
12581258

12591259
BatchBuffer batchBuffer{commandStreamTask.getGraphicsAllocation(), commandStreamStartTask, 0, taskStartAddress,
12601260
nullptr, false, getThrottleFromPowerSavingUint(this->getUmdPowerHintValue()), QueueSliceCount::defaultSliceCount,
1261-
commandStreamTask.getUsed(), &commandStreamTask, endingCmdPtr, this->getNumClients(), true, false, true};
1261+
commandStreamTask.getUsed(), &commandStreamTask, endingCmdPtr, this->getNumClients(), true, false, true, true};
12621262

12631263
this->latestSentTaskCount = taskCount + 1;
12641264
auto submissionStatus = flushHandler(batchBuffer, getResidencyAllocations());
@@ -1275,6 +1275,7 @@ SubmissionStatus CommandStreamReceiverHw<GfxFamily>::sendRenderStateCacheFlush()
12751275

12761276
template <typename GfxFamily>
12771277
inline SubmissionStatus CommandStreamReceiverHw<GfxFamily>::flushHandler(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency) {
1278+
this->latestFlushIsTaskCountUpdateOnly = batchBuffer.taskCountUpdateOnly;
12781279
auto status = flush(batchBuffer, allocationsForResidency);
12791280
makeSurfacePackNonResident(allocationsForResidency, true);
12801281
return status;
@@ -2219,7 +2220,7 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::handleImmediateFlushSendBatc
22192220
BatchBuffer batchBuffer{streamToSubmit.getGraphicsAllocation(), startOffset, chainedBatchBufferStartOffset, taskStartAddress, chainedBatchBuffer,
22202221
immediateLowPriority, immediateThrottle, immediateSliceCount,
22212222
streamToSubmit.getUsed(), &streamToSubmit, flushData.endPtr, this->getNumClients(), hasStallingCmds,
2222-
dispatchFlags.hasRelaxedOrderingDependencies, dispatchFlags.blockingAppend};
2223+
dispatchFlags.hasRelaxedOrderingDependencies, dispatchFlags.blockingAppend, false};
22232224
updateStreamTaskCount(streamToSubmit, taskCount + 1);
22242225

22252226
auto submissionStatus = flushHandler(batchBuffer, this->getResidencyAllocations());
@@ -2322,7 +2323,7 @@ inline BatchBuffer CommandStreamReceiverHw<GfxFamily>::prepareBatchBufferForSubm
23222323
BatchBuffer batchBuffer{streamToSubmit.getGraphicsAllocation(), startOffset, chainedBatchBufferStartOffset, taskStartAddress, chainedBatchBuffer,
23232324
dispatchFlags.lowPriority, dispatchFlags.throttle, dispatchFlags.sliceCount,
23242325
streamToSubmit.getUsed(), &streamToSubmit, bbEndLocation, this->getNumClients(), (submitCSR || dispatchFlags.hasStallingCmds || hasStallingCmdsOnTaskStream),
2325-
dispatchFlags.hasRelaxedOrderingDependencies, hasStallingCmdsOnTaskStream};
2326+
dispatchFlags.hasRelaxedOrderingDependencies, hasStallingCmdsOnTaskStream, false};
23262327

23272328
updateStreamTaskCount(streamToSubmit, taskCount + 1);
23282329

shared/source/command_stream/submissions_aggregator.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,13 +97,13 @@ NEO::BatchBuffer::BatchBuffer(GraphicsAllocation *commandBufferAllocation, size_
9797
size_t chainedBatchBufferStartOffset, uint64_t taskStartAddress, GraphicsAllocation *chainedBatchBuffer,
9898
bool lowPriority, QueueThrottle throttle, uint64_t sliceCount,
9999
size_t usedSize, LinearStream *stream, void *endCmdPtr, uint32_t numCsrClients, bool hasStallingCmds,
100-
bool hasRelaxedOrderingDependencies, bool dispatchMonitorFence)
100+
bool hasRelaxedOrderingDependencies, bool dispatchMonitorFence, bool taskCountUpdateOnly)
101101
: commandBufferAllocation(commandBufferAllocation), startOffset(startOffset),
102102
chainedBatchBufferStartOffset(chainedBatchBufferStartOffset), taskStartAddress(taskStartAddress), chainedBatchBuffer(chainedBatchBuffer),
103103
lowPriority(lowPriority),
104104
throttle(throttle), sliceCount(sliceCount),
105105
usedSize(usedSize), stream(stream), endCmdPtr(endCmdPtr), numCsrClients(numCsrClients), hasStallingCmds(hasStallingCmds),
106-
hasRelaxedOrderingDependencies(hasRelaxedOrderingDependencies), dispatchMonitorFence(dispatchMonitorFence) {}
106+
hasRelaxedOrderingDependencies(hasRelaxedOrderingDependencies), dispatchMonitorFence(dispatchMonitorFence), taskCountUpdateOnly(taskCountUpdateOnly) {}
107107

108108
NEO::CommandBuffer::CommandBuffer(Device &device) : device(device) {
109109
flushStamp.reset(new FlushStampTracker(false));

shared/source/command_stream/submissions_aggregator.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@ struct BatchBuffer {
4343
uint32_t numCsrClients,
4444
bool hasStallingCmds,
4545
bool hasRelaxedOrderingDependencies,
46-
bool dispatchMonitorFence);
46+
bool dispatchMonitorFence,
47+
bool taskCountUpdateOnly);
4748
BatchBuffer() {}
4849
GraphicsAllocation *commandBufferAllocation = nullptr;
4950
ResidencyContainer *allocationsForResidency = nullptr;
@@ -68,6 +69,7 @@ struct BatchBuffer {
6869
bool hasRelaxedOrderingDependencies = false;
6970
bool disableFlatRingBuffer = false;
7071
bool dispatchMonitorFence = false;
72+
bool taskCountUpdateOnly = false;
7173
};
7274

7375
struct CommandBuffer : public IDNode<CommandBuffer> {

shared/test/common/helpers/batch_buffer_helper.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2022-2023 Intel Corporation
2+
* Copyright (C) 2022-2024 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -27,7 +27,8 @@ struct BatchBufferHelper {
2727
0, // numCsrClients
2828
false, // hasStallingCmds
2929
false, // hasRelaxedOrderingDependencies
30-
false // dispatchMonitorFence
30+
false, // dispatchMonitorFence
31+
false // taskCountUpdateOnly
3132
);
3233
}
3334

0 commit comments

Comments
 (0)