Skip to content

Commit a22ca8c

Browse files
fix: stall RelaxedOrdering scheduler when programming Semaphores
Source: 2383896 Related-To: NEO-7458 Signed-off-by: Dunajski, Bartosz <[email protected]>
1 parent e64450a commit a22ca8c

File tree

3 files changed

+140
-9
lines changed

3 files changed

+140
-9
lines changed

level_zero/core/source/cmdlist/cmdlist_hw_immediate.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -182,6 +182,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
182182

183183
void printKernelsPrintfOutput(bool hangDetected);
184184
ze_result_t synchronizeInOrderExecution(uint64_t timeout) const;
185+
bool hasStallingCmdsForRelaxedOrdering(uint32_t numWaitEvents, bool relaxedOrderingDispatch);
185186

186187
MOCKABLE_VIRTUAL void checkAssert();
187188
ComputeFlushMethodType computeFlushMethod = nullptr;

level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl

Lines changed: 27 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -327,6 +327,11 @@ bool CommandListCoreFamilyImmediate<gfxCoreFamily>::waitForEventsFromHost() {
327327
return true;
328328
}
329329

330+
// Decides the value the append* wrappers in this file pass as the third argument of flushImmediate().
// Returns true only when the dispatch is NOT a relaxed-ordering dispatch and, in addition, either
// at least one wait event was supplied or isInOrderExecutionEnabled() reports true.
// A relaxed-ordering dispatch always yields false.
// NOTE(review): presumably this ends up as the hasStallingCmds dispatch flag that the unit test checks — confirm against flushImmediate().
template <GFXCORE_FAMILY gfxCoreFamily>
331+
bool CommandListCoreFamilyImmediate<gfxCoreFamily>::hasStallingCmdsForRelaxedOrdering(uint32_t numWaitEvents, bool relaxedOrderingDispatch) {
332+
return (!relaxedOrderingDispatch && (numWaitEvents > 0 || isInOrderExecutionEnabled()));
333+
}
334+
330335
template <GFXCORE_FAMILY gfxCoreFamily>
331336
ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendLaunchKernel(
332337
ze_kernel_handle_t kernelHandle, const ze_group_count_t *threadGroupDimensions,
@@ -350,7 +355,8 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendLaunchKernel(
350355
auto ret = CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernel(kernelHandle, threadGroupDimensions,
351356
hSignalEvent, numWaitEvents, phWaitEvents,
352357
launchParams, relaxedOrderingDispatch);
353-
return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent);
358+
359+
return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, hSignalEvent);
354360
}
355361

356362
template <GFXCORE_FAMILY gfxCoreFamily>
@@ -366,7 +372,8 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendLaunchKernelInd
366372

367373
auto ret = CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelIndirect(kernelHandle, pDispatchArgumentsBuffer,
368374
hSignalEvent, numWaitEvents, phWaitEvents, relaxedOrderingDispatch);
369-
return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent);
375+
376+
return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, hSignalEvent);
370377
}
371378

372379
template <GFXCORE_FAMILY gfxCoreFamily>
@@ -401,6 +408,8 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
401408
checkWaitEventsState(numWaitEvents, phWaitEvents);
402409
}
403410

411+
bool hasStallindCmds = hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch);
412+
404413
ze_result_t ret;
405414
CpuMemCopyInfo cpuMemCopyInfo(dstptr, srcptr, size);
406415
this->device->getDriverHandle()->findAllocationDataForRange(const_cast<void *>(srcptr), size, &cpuMemCopyInfo.srcAllocData);
@@ -416,14 +425,17 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
416425
auto isSplitNeeded = this->isAppendSplitNeeded(dstptr, srcptr, size, direction);
417426
if (isSplitNeeded) {
418427
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(1); // split generates more than 1 event
428+
hasStallindCmds = !relaxedOrderingDispatch;
429+
419430
ret = static_cast<DeviceImp *>(this->device)->bcsSplit.appendSplitCall<gfxCoreFamily, void *, const void *>(this, dstptr, srcptr, size, hSignalEvent, numWaitEvents, phWaitEvents, true, relaxedOrderingDispatch, direction, [&](void *dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
420431
return CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(dstptrParam, srcptrParam, sizeParam, hSignalEventParam, 0u, nullptr, relaxedOrderingDispatch);
421432
});
422433
} else {
423434
ret = CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(dstptr, srcptr, size, hSignalEvent,
424435
numWaitEvents, phWaitEvents, relaxedOrderingDispatch);
425436
}
426-
return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent);
437+
438+
return flushImmediate(ret, true, hasStallindCmds, relaxedOrderingDispatch, hSignalEvent);
427439
}
428440

429441
template <GFXCORE_FAMILY gfxCoreFamily>
@@ -446,12 +458,16 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopyRegio
446458
checkWaitEventsState(numWaitEvents, phWaitEvents);
447459
}
448460

461+
bool hasStallindCmds = hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch);
462+
449463
ze_result_t ret;
450464

451465
NEO::TransferDirection direction;
452466
auto isSplitNeeded = this->isAppendSplitNeeded(dstPtr, srcPtr, this->getTotalSizeForCopyRegion(dstRegion, dstPitch, dstSlicePitch), direction);
453467
if (isSplitNeeded) {
454468
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(1); // split generates more than 1 event
469+
hasStallindCmds = !relaxedOrderingDispatch;
470+
455471
ret = static_cast<DeviceImp *>(this->device)->bcsSplit.appendSplitCall<gfxCoreFamily, uint32_t, uint32_t>(this, dstRegion->originX, srcRegion->originX, dstRegion->width, hSignalEvent, numWaitEvents, phWaitEvents, true, relaxedOrderingDispatch, direction, [&](uint32_t dstOriginXParam, uint32_t srcOriginXParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
456472
ze_copy_region_t dstRegionLocal = {};
457473
ze_copy_region_t srcRegionLocal = {};
@@ -471,7 +487,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopyRegio
471487
hSignalEvent, numWaitEvents, phWaitEvents, relaxedOrderingDispatch);
472488
}
473489

474-
return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent);
490+
return flushImmediate(ret, true, hasStallindCmds, relaxedOrderingDispatch, hSignalEvent);
475491
}
476492

477493
template <GFXCORE_FAMILY gfxCoreFamily>
@@ -489,7 +505,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryFill(void
489505

490506
auto ret = CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(ptr, pattern, patternSize, size, hSignalEvent, numWaitEvents, phWaitEvents, relaxedOrderingDispatch);
491507

492-
return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent);
508+
return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, hSignalEvent);
493509
}
494510

495511
template <GFXCORE_FAMILY gfxCoreFamily>
@@ -616,7 +632,8 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendImageCopyRegion
616632

617633
auto ret = CommandListCoreFamily<gfxCoreFamily>::appendImageCopyRegion(hDstImage, hSrcImage, pDstRegion, pSrcRegion, hSignalEvent,
618634
numWaitEvents, phWaitEvents, relaxedOrderingDispatch);
619-
return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent);
635+
636+
return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, hSignalEvent);
620637
}
621638

622639
template <GFXCORE_FAMILY gfxCoreFamily>
@@ -637,7 +654,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendImageCopyFromMe
637654
auto ret = CommandListCoreFamily<gfxCoreFamily>::appendImageCopyFromMemory(hDstImage, srcPtr, pDstRegion, hSignalEvent,
638655
numWaitEvents, phWaitEvents, relaxedOrderingDispatch);
639656

640-
return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent);
657+
return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, hSignalEvent);
641658
}
642659

643660
template <GFXCORE_FAMILY gfxCoreFamily>
@@ -658,7 +675,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendImageCopyToMemo
658675
auto ret = CommandListCoreFamily<gfxCoreFamily>::appendImageCopyToMemory(dstPtr, hSrcImage, pSrcRegion, hSignalEvent,
659676
numWaitEvents, phWaitEvents, relaxedOrderingDispatch);
660677

661-
return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent);
678+
return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, hSignalEvent);
662679
}
663680

664681
template <GFXCORE_FAMILY gfxCoreFamily>
@@ -690,7 +707,8 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendLaunchCooperati
690707
}
691708

692709
auto ret = CommandListCoreFamily<gfxCoreFamily>::appendLaunchCooperativeKernel(kernelHandle, launchKernelArgs, hSignalEvent, numWaitEvents, waitEventHandles, relaxedOrderingDispatch);
693-
return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent);
710+
711+
return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, hSignalEvent);
694712
}
695713

696714
template <GFXCORE_FAMILY gfxCoreFamily>

level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp

Lines changed: 112 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66
*/
77

88
#include "shared/source/command_stream/wait_status.h"
9+
#include "shared/source/direct_submission/relaxed_ordering_helper.h"
910
#include "shared/source/gmm_helper/gmm_helper.h"
1011
#include "shared/source/indirect_heap/indirect_heap.h"
1112
#include "shared/source/memory_manager/internal_allocation_storage.h"
@@ -1085,6 +1086,117 @@ HWTEST2_F(CommandListCreate, givenDirectSubmissionAndImmCmdListWhenDispatchingTh
10851086
driverHandle->releaseImportedPointer(dstPtr);
10861087
}
10871088

1089+
// Checks that, while relaxed ordering dispatch is not allowed for the CSR, each immediate append call
// below leaves hasStallingCmds set in both the recorded dispatch flags and the latest flushed batch buffer.
// The append sequence runs twice: first with one wait event and in-order execution off,
// then with in-order execution enabled and an empty wait list.
HWTEST2_F(CommandListCreate, givenDirectSubmissionAndImmCmdListWhenDispatchingDisabledRelaxedOrderingThenPassStallingCmdsInfo, IsAtLeastXeHpcCore) {
1090+
ze_command_queue_desc_t desc = {};
1091+
desc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
1092+
ze_result_t returnValue;
1093+
auto commandList = zeUniquePtr(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::RenderCompute, returnValue));
1094+
ASSERT_NE(nullptr, commandList);
1095+
auto whiteBoxCmdList = static_cast<CommandList *>(commandList.get());
1096+
1097+
ze_event_pool_desc_t eventPoolDesc = {};
1098+
eventPoolDesc.count = 1;
1099+
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE | ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
1100+
1101+
ze_event_desc_t eventDesc = {};
1102+
eventDesc.wait = ZE_EVENT_SCOPE_FLAG_HOST;
1103+
1104+
ze_event_handle_t event = nullptr;
1105+
1106+
std::unique_ptr<L0::EventPool> eventPool(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
1107+
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
1108+
1109+
ASSERT_EQ(ZE_RESULT_SUCCESS, eventPool->createEvent(&eventDesc, &event));
1110+
std::unique_ptr<L0::Event> eventObject(L0::Event::fromHandle(event));
1111+
1112+
Mock<::L0::Kernel> kernel;
1113+
ze_group_count_t groupCount{1, 1, 1};
1114+
CmdListKernelLaunchParams launchParams = {};
1115+
1116+
uint8_t srcPtr[64] = {};
1117+
uint8_t dstPtr[64] = {};
1118+
const ze_copy_region_t region = {0U, 0U, 0U, 1, 1, 0U};
1119+
1120+
// Imported here and released again at the end of the test.
driverHandle->importExternalPointer(dstPtr, MemoryConstants::pageSize);
1121+
1122+
auto ultCsr = static_cast<NEO::UltCommandStreamReceiver<FamilyType> *>(whiteBoxCmdList->csr);
1123+
ultCsr->recordFlusheBatchBuffer = true;
1124+
// NOTE(review): presumably dropping the client registration is what makes relaxed ordering disallowed; the expectation below pins that precondition.
ultCsr->unregisterClient();
1125+
1126+
EXPECT_FALSE(NEO::RelaxedOrderingHelper::isRelaxedOrderingDispatchAllowed(*ultCsr, 1));
1127+
1128+
// Expects the append call to succeed and both CSR-side hasStallingCmds records to be set.
auto verifyFlags = [&ultCsr](ze_result_t result) {
1129+
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
1130+
EXPECT_TRUE(ultCsr->recordedDispatchFlags.hasStallingCmds);
1131+
EXPECT_TRUE(ultCsr->latestFlushedBatchBuffer.hasStallingCmds);
1132+
};
1133+
1134+
// Clears both records so a stale true from an earlier append cannot satisfy the next verifyFlags.
auto resetFlags = [&ultCsr]() {
1135+
ultCsr->recordedDispatchFlags.hasStallingCmds = false;
1136+
ultCsr->latestFlushedBatchBuffer.hasStallingCmds = false;
1137+
};
1138+
1139+
bool inOrderExecAlreadyEnabled = false;
1140+
1141+
for (bool inOrderExecution : {false, true}) {
1142+
// In-order execution is switched on once, at the start of the second iteration.
if (inOrderExecution && !inOrderExecAlreadyEnabled) {
1143+
whiteBoxCmdList->enableInOrderExecution();
1144+
inOrderExecAlreadyEnabled = true;
1145+
}
1146+
1147+
EXPECT_EQ(inOrderExecAlreadyEnabled, inOrderExecution);
1148+
1149+
// With in-order execution the wait list is empty; otherwise a single event is waited on.
uint32_t numWaitEvents = inOrderExecution ? 0 : 1;
1150+
ze_event_handle_t *waitlist = inOrderExecution ? nullptr : &event;
1151+
1152+
// non-pipelined state or first in-order exec
// (this first launch always passes one wait event, in both iterations)
1153+
resetFlags();
1154+
verifyFlags(commandList->appendLaunchKernel(kernel.toHandle(), &groupCount, nullptr, 1, &event, launchParams, false));
1155+
1156+
// non-pipelined state already programmed
1157+
resetFlags();
1158+
verifyFlags(commandList->appendLaunchKernel(kernel.toHandle(), &groupCount, nullptr, numWaitEvents, waitlist, launchParams, false));
1159+
1160+
resetFlags();
1161+
verifyFlags(commandList->appendLaunchKernelIndirect(kernel.toHandle(), &groupCount, nullptr, numWaitEvents, waitlist, false));
1162+
1163+
resetFlags();
1164+
verifyFlags(commandList->appendMemoryCopy(dstPtr, srcPtr, 8, nullptr, numWaitEvents, waitlist, false));
1165+
1166+
resetFlags();
1167+
verifyFlags(commandList->appendMemoryCopyRegion(dstPtr, &region, 0, 0, srcPtr, &region, 0, 0, nullptr, numWaitEvents, waitlist, false));
1168+
1169+
resetFlags();
1170+
verifyFlags(commandList->appendMemoryFill(dstPtr, srcPtr, 8, 1, nullptr, numWaitEvents, waitlist, false));
1171+
1172+
// Image copy variants are exercised only when FamilyType::supportsSampler is true.
if constexpr (FamilyType::supportsSampler) {
1173+
auto kernel = device->getBuiltinFunctionsLib()->getImageFunction(ImageBuiltin::CopyImageRegion);
1174+
auto mockBuiltinKernel = static_cast<Mock<::L0::Kernel> *>(kernel);
1175+
mockBuiltinKernel->setArgRedescribedImageCallBase = false;
1176+
1177+
auto image = std::make_unique<WhiteBox<::L0::ImageCoreFamily<gfxCoreFamily>>>();
1178+
ze_image_region_t imgRegion = {1, 1, 1, 1, 1, 1};
1179+
ze_image_desc_t zeDesc = {};
1180+
zeDesc.stype = ZE_STRUCTURE_TYPE_IMAGE_DESC;
1181+
image->initialize(device, &zeDesc);
1182+
1183+
resetFlags();
1184+
verifyFlags(commandList->appendImageCopyRegion(image->toHandle(), image->toHandle(), &imgRegion, &imgRegion, nullptr, numWaitEvents, waitlist, false));
1185+
1186+
resetFlags();
1187+
verifyFlags(commandList->appendImageCopyFromMemory(image->toHandle(), dstPtr, &imgRegion, nullptr, numWaitEvents, waitlist, false));
1188+
1189+
resetFlags();
1190+
verifyFlags(commandList->appendImageCopyToMemory(dstPtr, image->toHandle(), &imgRegion, nullptr, numWaitEvents, waitlist, false));
1191+
}
1192+
1193+
resetFlags();
1194+
verifyFlags(commandList->appendLaunchCooperativeKernel(kernel.toHandle(), &groupCount, nullptr, numWaitEvents, waitlist, false));
1195+
}
1196+
1197+
driverHandle->releaseImportedPointer(dstPtr);
1198+
}
1199+
10881200
HWTEST2_F(CommandListCreate, whenDispatchingThenPassNumCsrClients, IsAtLeastXeHpcCore) {
10891201
ze_command_queue_desc_t desc = {};
10901202
desc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;

0 commit comments

Comments
 (0)