Skip to content

Commit 1293574

Browse files
Update task count while waiting
Signed-off-by: Lukasz Jobczyk <[email protected]>
1 parent 8df7128 commit 1293574

File tree

10 files changed

+234
-12
lines changed

10 files changed

+234
-12
lines changed

opencl/test/unit_test/command_queue/blit_enqueue_tests.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -423,6 +423,17 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructing
423423
verifySemaphore<FamilyType>(semaphore, barrierGpuAddress);
424424
}
425425

426+
// Checks that calling flushTagUpdate() on the blitter (BCS) command stream
// receiver leaves an MI_FLUSH_DW command in that receiver's command stream.
HWTEST_TEMPLATED_F(BlitAuxTranslationTests, whenFlushTagUpdateThenMiFlushDwIsFlushed) {
427+
using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW;
428+
429+
bcsCsr->flushTagUpdate();
430+
431+
// Parse the BCS command stream from offset 0 so the whole stream is searched.
auto cmdListBcs = getCmdList<FamilyType>(bcsCsr->getCS(0), 0);
432+
433+
auto cmdFound = expectCommand<MI_FLUSH_DW>(cmdListBcs.begin(), cmdListBcs.end());
434+
EXPECT_NE(cmdFound, cmdListBcs.end());
435+
}
436+
426437
HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructingCommandBufferThenSynchronizeBcsOutput) {
427438
using XY_COPY_BLT = typename FamilyType::XY_COPY_BLT;
428439
using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW;

opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -986,6 +986,89 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, givenCsrInBatchingModeWithOutOfOrd
986986
EXPECT_TRUE(pipeControl->getDcFlushEnable());
987987
}
988988

989+
// With the UpdateTaskCountFromWait debug flag set to 1, a flushTask() call in
// batched dispatch mode that asks for guardCommandBufferWithPipeControl is
// expected to leave no PIPE_CONTROL in the task command stream.
// NOTE(review): dispatchFlags.blocking keeps whatever value
// createDefaultDispatchFlags() assigns (presumably false) — confirm, since a
// blocking flush would still emit the pipe control.
HWTEST_F(CommandStreamReceiverFlushTaskTests, givenUpdateTaskCountFromWaitSetWhenFlushTaskThenThereIsNoPipeControlForUpdateTaskCount) {
990+
// Restores debug flags when the test scope ends.
DebugManagerStateRestore restorer;
991+
DebugManager.flags.UpdateTaskCountFromWait.set(1);
992+
993+
CommandQueueHw<FamilyType> commandQueue(nullptr, pClDevice, 0, false);
994+
auto &commandStream = commandQueue.getCS(4096u);
995+
996+
// NOTE(review): the raw pointer is handed to resetCommandStreamReceiver(),
// which presumably takes ownership — confirm.
auto mockCsr = new MockCsrHw2<FamilyType>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
997+
pDevice->resetCommandStreamReceiver(mockCsr);
998+
mockCsr->useNewResourceImplicitFlush = false;
999+
mockCsr->useGpuIdleImplicitFlush = false;
1000+
mockCsr->overrideDispatchPolicy(DispatchMode::BatchedDispatch);
1001+
1002+
DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags();
1003+
dispatchFlags.preemptionMode = PreemptionHelper::getDefaultPreemptionMode(pDevice->getHardwareInfo());
1004+
dispatchFlags.guardCommandBufferWithPipeControl = true;
1005+
1006+
mockCsr->flushTask(commandStream,
1007+
0,
1008+
dsh,
1009+
ioh,
1010+
ssh,
1011+
taskLevel,
1012+
dispatchFlags,
1013+
*pDevice);
1014+
1015+
// Inspect the task stream passed to flushTask(), not the CSR's own stream.
parseCommands<FamilyType>(commandStream);
1016+
auto itorPipeControl = find<typename FamilyType::PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
1017+
1018+
EXPECT_EQ(itorPipeControl, cmdList.end());
1019+
}
1020+
1021+
// With the UpdateTaskCountFromWait debug flag set to 1, waiting on the command
// queue is expected to put a PIPE_CONTROL into the CSR's command stream and to
// call flush exactly once on the mock CSR.
HWTEST_F(CommandStreamReceiverFlushTaskTests, givenUpdateTaskCountFromWaitSetWhenFlushTaskThenPipeControlIsFlushed) {
1022+
// Restores debug flags when the test scope ends.
DebugManagerStateRestore restorer;
1023+
DebugManager.flags.UpdateTaskCountFromWait.set(1);
1024+
1025+
CommandQueueHw<FamilyType> commandQueue(nullptr, pClDevice, 0, false);
1026+
1027+
// NOTE(review): the raw pointer is handed to resetCommandStreamReceiver(),
// which presumably takes ownership — confirm.
auto mockCsr = new MockCsrHw2<FamilyType>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
1028+
pDevice->resetCommandStreamReceiver(mockCsr);
1029+
mockCsr->useNewResourceImplicitFlush = false;
1030+
mockCsr->useGpuIdleImplicitFlush = false;
1031+
mockCsr->overrideDispatchPolicy(DispatchMode::BatchedDispatch);
1032+
1033+
// No task is enqueued beforehand; the wait itself is the action under test.
commandQueue.waitUntilComplete(false, nullptr);
1034+
1035+
// The tag update is looked for in the CSR's own command stream.
parseCommands<FamilyType>(mockCsr->getCS(4096u));
1036+
auto itorPipeControl = find<typename FamilyType::PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
1037+
1038+
EXPECT_NE(itorPipeControl, cmdList.end());
1039+
EXPECT_EQ(mockCsr->flushCalledCount, 1);
1040+
}
1041+
1042+
// Same scenario as the plain wait test, but with a mock CSR that reports direct
// submission as enabled: after waiting on the queue, the CSR's command stream
// is expected to hold both a PIPE_CONTROL and an MI_BATCH_BUFFER_START, and
// flush must have been called exactly once.
HWTEST_F(CommandStreamReceiverFlushTaskTests, givenEnabledDirectSubmissionUpdateTaskCountFromWaitSetWhenFlushTaskThenPipeControlAndBBSIsFlushed) {
1043+
// Restores debug flags when the test scope ends.
DebugManagerStateRestore restorer;
1044+
DebugManager.flags.UpdateTaskCountFromWait.set(1);
1045+
1046+
// Local mock that forces the direct-submission branch of the code under test.
struct MockCsrHwDirectSubmission : public MockCsrHw2<FamilyType> {
1047+
using MockCsrHw2<FamilyType>::MockCsrHw2;
1048+
bool isDirectSubmissionEnabled() const override {
1049+
return true;
1050+
}
1051+
};
1052+
1053+
CommandQueueHw<FamilyType> commandQueue(nullptr, pClDevice, 0, false);
1054+
1055+
// NOTE(review): the raw pointer is handed to resetCommandStreamReceiver(),
// which presumably takes ownership — confirm.
auto mockCsr = new MockCsrHwDirectSubmission(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
1056+
pDevice->resetCommandStreamReceiver(mockCsr);
1057+
mockCsr->useNewResourceImplicitFlush = false;
1058+
mockCsr->useGpuIdleImplicitFlush = false;
1059+
mockCsr->overrideDispatchPolicy(DispatchMode::BatchedDispatch);
1060+
1061+
commandQueue.waitUntilComplete(false, nullptr);
1062+
1063+
parseCommands<FamilyType>(mockCsr->getCS(4096u));
1064+
auto itorPipeControl = find<typename FamilyType::PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
1065+
auto itorBBS = find<typename FamilyType::MI_BATCH_BUFFER_START *>(cmdList.begin(), cmdList.end());
1066+
1067+
EXPECT_NE(itorPipeControl, cmdList.end());
1068+
EXPECT_NE(itorBBS, cmdList.end());
1069+
EXPECT_EQ(mockCsr->flushCalledCount, 1);
1070+
}
1071+
9891072
HWTEST_F(CommandStreamReceiverFlushTaskTests, givenCsrInBatchingModeWhenDcFlushIsRequiredThenPipeControlIsNotRegistredForNooping) {
9901073
CommandQueueHw<FamilyType> commandQueue(nullptr, pClDevice, 0, false);
9911074
auto &commandStream = commandQueue.getCS(4096u);

opencl/test/unit_test/kernel/kernel_tests.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -452,6 +452,9 @@ class CommandStreamReceiverMock : public CommandStreamReceiver {
452452

453453
using BaseClass::CommandStreamReceiver;
454454

455+
void flushTagUpdate() override{};
456+
void updateTagFromWait() override{};
457+
455458
bool isMultiOsContextCapable() const override { return false; }
456459

457460
MemoryCompressionState getMemoryCompressionState(bool auxTranslationRequired) const override {

opencl/test/unit_test/test_files/igdrcl.config

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,7 @@ EnableHostPointerImport = -1
215215
EnableHostUsmSupport = -1
216216
ForceBtpPrefetchMode = -1
217217
OverrideProfilingTimerResolution = -1
218+
UpdateTaskCountFromWait = -1
218219
PreferCopyEngineForCopyBufferToBuffer = -1
219220
EnableStaticPartitioning = -1
220221
DisableDeepBind = 0

shared/source/command_stream/command_stream_receiver.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,9 @@ class CommandStreamReceiver {
208208

209209
virtual uint32_t blitBuffer(const BlitPropertiesContainer &blitPropertiesContainer, bool blocking, bool profilingEnabled) = 0;
210210

211+
virtual void flushTagUpdate() = 0;
212+
virtual void updateTagFromWait() = 0;
213+
211214
ScratchSpaceController *getScratchSpaceController() const {
212215
return scratchSpaceController.get();
213216
}

shared/source/command_stream/command_stream_receiver_hw.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,16 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
9393

9494
uint32_t blitBuffer(const BlitPropertiesContainer &blitPropertiesContainer, bool blocking, bool profilingEnabled) override;
9595

96+
void flushTagUpdate() override;
97+
void flushMiFlushDW();
98+
void flushPipeControl();
99+
void flushSmallTask(LinearStream &commandStreamTask,
100+
size_t commandStreamStartTask);
101+
void flushHandler(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency);
102+
103+
bool isUpdateTagFromWaitEnabled();
104+
void updateTagFromWait() override;
105+
96106
bool isMultiOsContextCapable() const override;
97107

98108
MemoryCompressionState getMemoryCompressionState(bool auxTranslationRequired) const override;

shared/source/command_stream/command_stream_receiver_hw_base.inl

Lines changed: 117 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,7 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
204204
initProgrammingFlags();
205205
}
206206

207+
bool updateTag = false;
207208
if (dispatchFlags.blocking || dispatchFlags.dcFlush || dispatchFlags.guardCommandBufferWithPipeControl) {
208209
if (this->dispatchMode == DispatchMode::ImmediateDispatch) {
209210
//for ImmediateDispatch we will send this right away, therefore this pipe control will close the level
@@ -224,14 +225,21 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
224225

225226
auto address = getTagAllocation()->getGpuAddress();
226227

227-
PipeControlArgs args(dispatchFlags.dcFlush);
228-
MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
229-
commandStreamTask,
230-
PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
231-
address,
232-
taskCount + 1,
233-
peekHwInfo(),
234-
args);
228+
updateTag = !isUpdateTagFromWaitEnabled();
229+
updateTag |= dispatchFlags.blocking;
230+
231+
if (updateTag) {
232+
PipeControlArgs args(dispatchFlags.dcFlush);
233+
MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
234+
commandStreamTask,
235+
PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
236+
address,
237+
taskCount + 1,
238+
peekHwInfo(),
239+
args);
240+
} else {
241+
currentPipeControlForNooping = nullptr;
242+
}
235243

236244
this->latestSentTaskCount = taskCount + 1;
237245
DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "taskCount", peekTaskCount());
@@ -571,9 +579,10 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
571579

572580
if (submitCSR | submitTask) {
573581
if (this->dispatchMode == DispatchMode::ImmediateDispatch) {
574-
this->flush(batchBuffer, this->getResidencyAllocations());
575-
this->latestFlushedTaskCount = this->taskCount + 1;
576-
this->makeSurfacePackNonResident(this->getResidencyAllocations());
582+
if (updateTag) {
583+
this->latestFlushedTaskCount = this->taskCount + 1;
584+
}
585+
flushHandler(batchBuffer, this->getResidencyAllocations());
577586
} else {
578587
auto commandBuffer = new CommandBuffer(device);
579588
commandBuffer->batchBuffer = batchBuffer;
@@ -838,6 +847,8 @@ inline void CommandStreamReceiverHw<GfxFamily>::emitNoop(LinearStream &commandSt
838847

839848
template <typename GfxFamily>
840849
inline void CommandStreamReceiverHw<GfxFamily>::waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) {
850+
updateTagFromWait();
851+
841852
int64_t waitTimeout = 0;
842853
bool enableTimeout = false;
843854

@@ -1079,6 +1090,101 @@ uint32_t CommandStreamReceiverHw<GfxFamily>::blitBuffer(const BlitPropertiesCont
10791090
return newTaskCount;
10801091
}
10811092

1093+
// Submits a standalone tag update, choosing the command by engine type:
// flushMiFlushDW() when the OS context's engine is aub_stream::ENGINE_BCS,
// flushPipeControl() for every other engine.
template <typename GfxFamily>
1094+
inline void CommandStreamReceiverHw<GfxFamily>::flushTagUpdate() {
1095+
if (this->osContext->getEngineType() == aub_stream::ENGINE_BCS) {
1096+
this->flushMiFlushDW();
1097+
} else {
1098+
this->flushPipeControl();
1099+
}
1100+
}
1101+
1102+
// Programs an MI_FLUSH_DW that targets the tag allocation's GPU address with
// the current taskCount, then submits it through flushSmallTask().
// Holds the lock returned by obtainUniqueOwnership() for the whole call.
template <typename GfxFamily>
1103+
inline void CommandStreamReceiverHw<GfxFamily>::flushMiFlushDW() {
1104+
auto lock = obtainUniqueOwnership();
1105+
1106+
// Request room for one MI_FLUSH_DW data write; remember where this task begins.
auto &commandStream = getCS(EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite());
1107+
auto commandStreamStart = commandStream.getUsed();
1108+
1109+
// NOTE(review): meaning of the trailing (false, true) arguments is not
// visible here — check EncodeMiFlushDW::programMiFlushDw.
EncodeMiFlushDW<GfxFamily>::programMiFlushDw(commandStream, tagAllocation->getGpuAddress(), taskCount, false, true);
1110+
1111+
// Both the tag allocation and the stream's backing allocation are made resident for the submission.
makeResident(*tagAllocation);
1112+
makeResident(*commandStream.getGraphicsAllocation());
1113+
1114+
this->flushSmallTask(commandStream, commandStreamStart);
1115+
}
1116+
1117+
// Programs a PIPE_CONTROL with a write-immediate-data post-sync operation that
// targets the tag allocation's GPU address with the current taskCount, then
// submits it through flushSmallTask().
// Holds the lock returned by obtainUniqueOwnership() for the whole call.
template <typename GfxFamily>
1118+
void CommandStreamReceiverHw<GfxFamily>::flushPipeControl() {
1119+
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
1120+
1121+
auto lock = obtainUniqueOwnership();
1122+
1123+
// Request room for the pipe control with post-sync; remember where this task begins.
auto &commandStream = getCS(MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(peekHwInfo()));
1124+
auto commandStreamStart = commandStream.getUsed();
1125+
1126+
// NOTE(review): flushTask() passes dispatchFlags.dcFlush to this constructor,
// so `true` presumably forces a DC flush — confirm against PipeControlArgs.
PipeControlArgs args(true);
1127+
MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(commandStream,
1128+
PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
1129+
getTagAllocation()->getGpuAddress(),
1130+
taskCount,
1131+
peekHwInfo(),
1132+
args);
1133+
1134+
// Both the tag allocation and the stream's backing allocation are made resident for the submission.
makeResident(*tagAllocation);
1135+
makeResident(*commandStream.getGraphicsAllocation());
1136+
1137+
this->flushSmallTask(commandStream, commandStreamStart);
1138+
}
1139+
1140+
// Terminates an already-programmed command sequence and submits it.
//   commandStreamTask      - stream that holds the commands to submit
//   commandStreamStartTask - offset in that stream where the commands begin
// The sequence is closed with MI_BATCH_BUFFER_START when direct submission is
// enabled, otherwise with MI_BATCH_BUFFER_END; the stream is then aligned to a
// cache line, wrapped in a BatchBuffer and passed to flushHandler().
template <typename GfxFamily>
1141+
void CommandStreamReceiverHw<GfxFamily>::flushSmallTask(LinearStream &commandStreamTask, size_t commandStreamStartTask) {
1142+
using MI_BATCH_BUFFER_END = typename GfxFamily::MI_BATCH_BUFFER_END;
1143+
1144+
void *endingCmdPtr = nullptr;
1145+
1146+
if (isDirectSubmissionEnabled()) {
1147+
// getSpace(0) reserves nothing; it only yields the current write position,
// i.e. the address of the batch buffer start programmed next.
endingCmdPtr = commandStreamTask.getSpace(0);
1148+
// NOTE(review): the 0 address is presumably patched later by the direct
// submission path via endingCmdPtr — confirm.
EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(&commandStreamTask,
1149+
0ull,
1150+
false);
1151+
} else {
1152+
auto batchBufferEnd = reinterpret_cast<MI_BATCH_BUFFER_END *>(commandStreamTask.getSpace(sizeof(MI_BATCH_BUFFER_END)));
1153+
*batchBufferEnd = GfxFamily::cmdInitBatchBufferEnd;
1154+
}
1155+
1156+
alignToCacheLine(commandStreamTask);
1157+
1158+
BatchBuffer batchBuffer{commandStreamTask.getGraphicsAllocation(), commandStreamStartTask, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount,
1159+
commandStreamTask.getUsed(), &commandStreamTask, endingCmdPtr, false};
1160+
1161+
flushHandler(batchBuffer, getResidencyAllocations());
1162+
}
1163+
1164+
// Submits batchBuffer via flush() and then calls makeSurfacePackNonResident()
// on the same residency container.
// The return value of flush() is not inspected here.
template <typename GfxFamily>
1165+
inline void CommandStreamReceiverHw<GfxFamily>::flushHandler(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency) {
1166+
flush(batchBuffer, allocationsForResidency);
1167+
makeSurfacePackNonResident(allocationsForResidency);
1168+
}
1169+
1170+
// Reports whether tag updates are deferred to wait time.
// Returns false while the UpdateTaskCountFromWait debug flag holds its -1
// default; otherwise returns the flag value converted to bool.
template <typename GfxFamily>
1171+
inline bool CommandStreamReceiverHw<GfxFamily>::isUpdateTagFromWaitEnabled() {
1172+
bool enabled = false;
1173+
1174+
// -1 means "not overridden", so the built-in default (disabled) stays.
if (DebugManager.flags.UpdateTaskCountFromWait.get() != -1) {
1175+
enabled = DebugManager.flags.UpdateTaskCountFromWait.get();
1176+
}
1177+
1178+
return enabled;
1179+
}
1180+
1181+
// Calls flushTagUpdate() when isUpdateTagFromWaitEnabled() returns true;
// otherwise does nothing.
template <typename GfxFamily>
1182+
inline void CommandStreamReceiverHw<GfxFamily>::updateTagFromWait() {
1183+
if (isUpdateTagFromWaitEnabled()) {
1184+
flushTagUpdate();
1185+
}
1186+
}
1187+
10821188
template <typename GfxFamily>
10831189
inline void CommandStreamReceiverHw<GfxFamily>::programAdditionalPipelineSelect(LinearStream &csr, PipelineSelectArgs &pipelineSelectArgs, bool is3DPipeline) {
10841190
auto &hwHelper = HwHelper::get(peekHwInfo().platform.eRenderCoreFamily);

shared/source/debug_settings/debug_variables_base.inl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,7 @@ DECLARE_DEBUG_VARIABLE(int64_t, DisableIndirectAccess, -1, "0: default, 0: Use
233233
DECLARE_DEBUG_VARIABLE(int32_t, UseVmBind, 0, "Use new residency model on Linux (requires kernel support), -1: default, 0: disabled, 1: enabled")
234234
DECLARE_DEBUG_VARIABLE(int32_t, PassBoundBOToExec, -1, "Pass bound BOs to exec call to keep dependencies")
235235
DECLARE_DEBUG_VARIABLE(int32_t, EnableStaticPartitioning, -1, "Divide workload into partitions during dispatch, -1: default, 0: disabled, 1: enabled")
236+
DECLARE_DEBUG_VARIABLE(int32_t, UpdateTaskCountFromWait, -1, " Do not update task count after each enqueue, but send update request while wait, -1: default(disabled), 0: disabled, 1: enabled")
236237
DECLARE_DEBUG_VARIABLE(bool, UseMaxSimdSizeToDeduceMaxWorkgroupSize, false, "With this flag on, max workgroup size is deduced using SIMD32 instead of SIMD8, this causes the max wkg size to be 4 times bigger")
237238
DECLARE_DEBUG_VARIABLE(bool, ReturnRawGpuTimestamps, false, "Driver returns raw GPU tiemstamps instead of calculated ones.")
238239
DECLARE_DEBUG_VARIABLE(bool, ForcePerDssBackedBufferProgramming, false, "Always program per-DSS memory backed buffer in preamble")

shared/source/memory_manager/deferrable_allocation_deletion.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2018-2020 Intel Corporation
2+
* Copyright (C) 2018-2021 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -28,6 +28,7 @@ bool DeferrableAllocationDeletion::apply() {
2828
} else {
2929
isStillUsed = true;
3030
engine.commandStreamReceiver->flushBatchedSubmissions();
31+
engine.commandStreamReceiver->updateTagFromWait();
3132
}
3233
}
3334
}

shared/test/common/mocks/mock_command_stream_receiver.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,9 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {
4646
}
4747
bool flush(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency) override;
4848

49+
void flushTagUpdate() override{};
50+
void updateTagFromWait() override{};
51+
4952
bool isMultiOsContextCapable() const override { return multiOsContextCapable; }
5053

5154
MemoryCompressionState getMemoryCompressionState(bool auxTranslationRequired) const override {

0 commit comments

Comments
 (0)