Skip to content

Commit 0b64ecb

Browse files
Pass active partitions from dispatched kernel to context
Related-To: NEO-6244
Signed-off-by: Zbigniew Zdanowicz <[email protected]>
1 parent 85a52b7 commit 0b64ecb

File tree

6 files changed

+198
-11
lines changed

6 files changed

+198
-11
lines changed

opencl/source/command_queue/hardware_interface_xehp_and_later.inl

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -138,6 +138,9 @@ inline void HardwareInterface<GfxFamily>::programWalker(
138138
false,
139139
kernel.usesImages(),
140140
workPartitionAllocationGpuVa);
141+
if (queueCsr.isStaticWorkPartitioningEnabled()) {
142+
queueCsr.setActivePartitions(std::max(queueCsr.getActivePartitions(), partitionCount));
143+
}
141144
auto timestampPacket = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex);
142145
timestampPacket->setPacketsUsed(partitionCount);
143146
} else {

opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp

Lines changed: 46 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1459,7 +1459,7 @@ struct XeHPAndLaterDispatchWalkerBasicTestStaticPartition : public XeHPAndLaterD
14591459
}
14601460
};
14611461

1462-
HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTestStaticPartition, givenStaticPartitioningWhenEnqueueingKernelThenNoMultipleActivePartitionsAreSetInCsr) {
1462+
HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTestStaticPartition, givenStaticPartitioningWhenEnqueueingKernelThenMultipleActivePartitionsAreSetInCsr) {
14631463
if (!OSInterface::osEnableLocalMemory) {
14641464
GTEST_SKIP();
14651465
}
@@ -1472,7 +1472,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTestStaticPartition,
14721472
}
14731473
EXPECT_EQ(1u, commandStreamReceiver.activePartitions);
14741474
cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr);
1475-
EXPECT_EQ(1u, commandStreamReceiver.activePartitions);
1475+
EXPECT_EQ(2u, commandStreamReceiver.activePartitions);
14761476

14771477
HardwareParse hwParser;
14781478
hwParser.parseCommands<FamilyType>(*cmdQ);
@@ -1482,6 +1482,50 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTestStaticPartition,
14821482
EXPECT_EQ(8u, computeWalker->getPartitionSize());
14831483
}
14841484

1485+
HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTestStaticPartition,
1486+
givenStaticPartitioningWhenEnqueueingNonUnifromKernelThenMultipleActivePartitionsAreSetInCsrAndWparidRegisterIsReconfiguredToStatic) {
1487+
using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;
1488+
using MI_LOAD_REGISTER_MEM = typename FamilyType::MI_LOAD_REGISTER_MEM;
1489+
if (!OSInterface::osEnableLocalMemory) {
1490+
GTEST_SKIP();
1491+
}
1492+
auto cmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(context.get(), device.get(), nullptr);
1493+
size_t gws[] = {129, 1, 1};
1494+
size_t lws[] = {8, 1, 1};
1495+
auto &commandStreamReceiver = cmdQ->getUltCommandStreamReceiver();
1496+
if (device->getPreemptionMode() == PreemptionMode::MidThread || device->isDebuggerActive()) {
1497+
commandStreamReceiver.createPreemptionAllocation();
1498+
}
1499+
EXPECT_EQ(1u, commandStreamReceiver.activePartitions);
1500+
kernel->mockProgram->allowNonUniform = true;
1501+
cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr);
1502+
EXPECT_EQ(2u, commandStreamReceiver.activePartitions);
1503+
1504+
HardwareParse hwParser;
1505+
hwParser.parseCommands<FamilyType>(*cmdQ->commandStream);
1506+
1507+
auto firstComputeWalkerItor = find<COMPUTE_WALKER *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
1508+
ASSERT_NE(hwParser.cmdList.end(), firstComputeWalkerItor);
1509+
auto computeWalker = reinterpret_cast<COMPUTE_WALKER *>(*firstComputeWalkerItor);
1510+
EXPECT_EQ(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, computeWalker->getPartitionType());
1511+
EXPECT_EQ(8u, computeWalker->getPartitionSize());
1512+
1513+
auto nextCmdItor = firstComputeWalkerItor;
1514+
++nextCmdItor;
1515+
1516+
auto secondComputeWalkerItor = find<COMPUTE_WALKER *>(nextCmdItor, hwParser.cmdList.end());
1517+
ASSERT_NE(hwParser.cmdList.end(), secondComputeWalkerItor);
1518+
computeWalker = reinterpret_cast<COMPUTE_WALKER *>(*secondComputeWalkerItor);
1519+
EXPECT_EQ(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED, computeWalker->getPartitionType());
1520+
1521+
auto workPartitionAllocationGpuVa = commandStreamReceiver.getWorkPartitionAllocationGpuAddress();
1522+
auto expectedRegister = 0x221Cu;
1523+
auto loadRegisterMem = hwParser.getCommand<MI_LOAD_REGISTER_MEM>(firstComputeWalkerItor, secondComputeWalkerItor);
1524+
ASSERT_NE(nullptr, loadRegisterMem);
1525+
EXPECT_EQ(workPartitionAllocationGpuVa, loadRegisterMem->getMemoryAddress());
1526+
EXPECT_EQ(expectedRegister, loadRegisterMem->getRegisterAddress());
1527+
}
1528+
14851529
using NonDefaultPlatformGpuWalkerTest = XeHPAndLaterDispatchWalkerBasicTest;
14861530

14871531
HWCMDTEST_F(IGFX_XE_HP_CORE, NonDefaultPlatformGpuWalkerTest, givenNonDefaultPlatformWhenSetupTimestampPacketThenGmmHelperIsTakenFromNonDefaultPlatform) {

opencl/test/unit_test/command_queue/walker_partition_tests_xehp_and_later_1.cpp

Lines changed: 132 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1276,3 +1276,135 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticWalkerPartitionWhe
12761276
}
12771277
EXPECT_EQ(parsedOffset, totalBytesProgrammed);
12781278
}
1279+
1280+
HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenStaticPartitionIsPreferredAndWalkerWithNonUniformStartWhenDynamicPartitionSelectedThenExpectReconfigureWparidToStatic) {
1281+
WalkerPartition::COMPUTE_WALKER<FamilyType> walker;
1282+
walker = FamilyType::cmdInitGpgpuWalker;
1283+
walker.setThreadGroupIdStartingX(1u);
1284+
1285+
checkForProperCmdBufferAddressOffset = false;
1286+
bool preferredStaticPartitioning = true;
1287+
bool staticPartitioning = false;
1288+
auto partitionCount = computePartitionCountAndSetPartitionType<FamilyType>(&walker, 4u, preferredStaticPartitioning, false, &staticPartitioning);
1289+
EXPECT_FALSE(staticPartitioning);
1290+
EXPECT_EQ(1u, partitionCount);
1291+
EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED, walker.getPartitionType());
1292+
1293+
testArgs.partitionCount = partitionCount;
1294+
testArgs.staticPartitioning = staticPartitioning;
1295+
testArgs.preferredStaticPartitioning = preferredStaticPartitioning;
1296+
testArgs.workPartitionAllocationGpuVa = 0x800BADA55000;
1297+
1298+
auto expectedCommandUsedSize = sizeof(WalkerPartition::LOAD_REGISTER_IMM<FamilyType>) +
1299+
sizeof(WalkerPartition::MI_ATOMIC<FamilyType>) * 2 +
1300+
sizeof(WalkerPartition::LOAD_REGISTER_REG<FamilyType>) +
1301+
sizeof(WalkerPartition::MI_SET_PREDICATE<FamilyType>) * 2 +
1302+
sizeof(WalkerPartition::BATCH_BUFFER_START<FamilyType>) * 3 +
1303+
sizeof(WalkerPartition::PIPE_CONTROL<FamilyType>) +
1304+
sizeof(WalkerPartition::COMPUTE_WALKER<FamilyType>) +
1305+
sizeof(WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType>) +
1306+
sizeof(WalkerPartition::LOAD_REGISTER_MEM<FamilyType>);
1307+
1308+
EXPECT_EQ(expectedCommandUsedSize, computeControlSectionOffset<FamilyType>(testArgs));
1309+
1310+
auto walkerSectionCommands = sizeof(WalkerPartition::BATCH_BUFFER_START<FamilyType>) +
1311+
sizeof(WalkerPartition::COMPUTE_WALKER<FamilyType>);
1312+
auto totalProgrammedSize = expectedCommandUsedSize + sizeof(BatchBufferControlData);
1313+
1314+
testArgs.tileCount = 2;
1315+
uint64_t gpuVirtualAddress = 0x8000123000;
1316+
WalkerPartition::constructDynamicallyPartitionedCommandBuffer<FamilyType>(cmdBuffer,
1317+
gpuVirtualAddress,
1318+
&walker,
1319+
totalBytesProgrammed,
1320+
testArgs);
1321+
1322+
EXPECT_EQ(totalProgrammedSize, totalBytesProgrammed);
1323+
1324+
auto expectedMask = 0xFFFFu;
1325+
auto expectedRegister = 0x21FCu;
1326+
auto loadRegisterImmediate = genCmdCast<WalkerPartition::LOAD_REGISTER_IMM<FamilyType> *>(cmdBufferAddress);
1327+
ASSERT_NE(nullptr, loadRegisterImmediate);
1328+
EXPECT_EQ(expectedRegister, loadRegisterImmediate->getRegisterOffset());
1329+
EXPECT_EQ(expectedMask, loadRegisterImmediate->getDataDword());
1330+
auto parsedOffset = sizeof(WalkerPartition::LOAD_REGISTER_IMM<FamilyType>);
1331+
1332+
auto miAtomic = genCmdCast<WalkerPartition::MI_ATOMIC<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
1333+
ASSERT_NE(nullptr, miAtomic);
1334+
auto miAtomicAddress = gpuVirtualAddress + expectedCommandUsedSize;
1335+
auto miAtomicProgrammedAddress = UnitTestHelper<FamilyType>::getAtomicMemoryAddress(*miAtomic);
1336+
EXPECT_EQ(miAtomicAddress, miAtomicProgrammedAddress);
1337+
EXPECT_TRUE(miAtomic->getReturnDataControl());
1338+
EXPECT_EQ(MI_ATOMIC<FamilyType>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode());
1339+
parsedOffset += sizeof(WalkerPartition::MI_ATOMIC<FamilyType>);
1340+
1341+
auto loadRegisterReg = genCmdCast<WalkerPartition::LOAD_REGISTER_REG<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
1342+
ASSERT_NE(nullptr, loadRegisterReg);
1343+
EXPECT_TRUE(loadRegisterReg->getMmioRemapEnableDestination());
1344+
EXPECT_TRUE(loadRegisterReg->getMmioRemapEnableSource());
1345+
EXPECT_EQ(wparidCCSOffset, loadRegisterReg->getDestinationRegisterAddress());
1346+
EXPECT_EQ(generalPurposeRegister4, loadRegisterReg->getSourceRegisterAddress());
1347+
parsedOffset += sizeof(WalkerPartition::LOAD_REGISTER_REG<FamilyType>);
1348+
1349+
auto miSetPredicate = genCmdCast<WalkerPartition::MI_SET_PREDICATE<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
1350+
ASSERT_NE(nullptr, miSetPredicate);
1351+
EXPECT_EQ(MI_SET_PREDICATE<FamilyType>::PREDICATE_ENABLE_WPARID::PREDICATE_ENABLE_WPARID_NOOP_ON_NON_ZERO_VALUE, miSetPredicate->getPredicateEnableWparid());
1352+
parsedOffset += sizeof(WalkerPartition::MI_SET_PREDICATE<FamilyType>);
1353+
1354+
auto batchBufferStart = genCmdCast<WalkerPartition::BATCH_BUFFER_START<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
1355+
ASSERT_NE(nullptr, batchBufferStart);
1356+
EXPECT_TRUE(batchBufferStart->getPredicationEnable());
1357+
//address routes to WALKER section which is before control section
1358+
auto address = batchBufferStart->getBatchBufferStartAddress();
1359+
EXPECT_EQ(address, gpuVirtualAddress + expectedCommandUsedSize - walkerSectionCommands);
1360+
parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START<FamilyType>);
1361+
1362+
miSetPredicate = genCmdCast<WalkerPartition::MI_SET_PREDICATE<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
1363+
ASSERT_NE(nullptr, miSetPredicate);
1364+
EXPECT_EQ(MI_SET_PREDICATE<FamilyType>::PREDICATE_ENABLE_WPARID::PREDICATE_ENABLE_WPARID_NOOP_NEVER, miSetPredicate->getPredicateEnableWparid());
1365+
EXPECT_EQ(MI_SET_PREDICATE<FamilyType>::PREDICATE_ENABLE::PREDICATE_ENABLE_PREDICATE_DISABLE, miSetPredicate->getPredicateEnable());
1366+
parsedOffset += sizeof(WalkerPartition::MI_SET_PREDICATE<FamilyType>);
1367+
1368+
auto pipeControl = genCmdCast<WalkerPartition::PIPE_CONTROL<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
1369+
EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable());
1370+
EXPECT_EQ(MemorySynchronizationCommands<FamilyType>::isDcFlushAllowed(), pipeControl->getDcFlushEnable());
1371+
parsedOffset += sizeof(WalkerPartition::PIPE_CONTROL<FamilyType>);
1372+
1373+
miAtomic = genCmdCast<WalkerPartition::MI_ATOMIC<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
1374+
ASSERT_NE(nullptr, miAtomic);
1375+
auto miAtomicTileAddress = gpuVirtualAddress + expectedCommandUsedSize + sizeof(uint32_t);
1376+
auto miAtomicTileProgrammedAddress = UnitTestHelper<FamilyType>::getAtomicMemoryAddress(*miAtomic);
1377+
EXPECT_EQ(miAtomicTileAddress, miAtomicTileProgrammedAddress);
1378+
EXPECT_FALSE(miAtomic->getReturnDataControl());
1379+
EXPECT_EQ(MI_ATOMIC<FamilyType>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode());
1380+
parsedOffset += sizeof(WalkerPartition::MI_ATOMIC<FamilyType>);
1381+
1382+
auto miSemaphoreWait = genCmdCast<WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
1383+
ASSERT_NE(nullptr, miSemaphoreWait);
1384+
EXPECT_EQ(miAtomicTileAddress, miSemaphoreWait->getSemaphoreGraphicsAddress());
1385+
EXPECT_EQ(MI_SEMAPHORE_WAIT<FamilyType>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphoreWait->getCompareOperation());
1386+
EXPECT_EQ(2u, miSemaphoreWait->getSemaphoreDataDword());
1387+
parsedOffset += sizeof(WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType>);
1388+
1389+
auto loadRegisterMem = genCmdCast<WalkerPartition::LOAD_REGISTER_MEM<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
1390+
ASSERT_NE(nullptr, loadRegisterMem);
1391+
EXPECT_EQ(testArgs.workPartitionAllocationGpuVa, loadRegisterMem->getMemoryAddress());
1392+
EXPECT_EQ(wparidCCSOffset, loadRegisterMem->getRegisterAddress());
1393+
parsedOffset += sizeof(WalkerPartition::LOAD_REGISTER_MEM<FamilyType>);
1394+
1395+
//final batch buffer start that routes at the end of the batch buffer
1396+
auto batchBufferStartFinal = genCmdCast<WalkerPartition::BATCH_BUFFER_START<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
1397+
EXPECT_NE(nullptr, batchBufferStartFinal);
1398+
EXPECT_EQ(batchBufferStartFinal->getBatchBufferStartAddress(), gpuVirtualAddress + totalProgrammedSize);
1399+
parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START<FamilyType>);
1400+
1401+
auto computeWalker = genCmdCast<WalkerPartition::COMPUTE_WALKER<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
1402+
ASSERT_NE(nullptr, computeWalker);
1403+
parsedOffset += sizeof(WalkerPartition::COMPUTE_WALKER<FamilyType>);
1404+
1405+
batchBufferStart = genCmdCast<WalkerPartition::BATCH_BUFFER_START<FamilyType> *>(ptrOffset(cmdBuffer, parsedOffset));
1406+
ASSERT_NE(nullptr, batchBufferStart);
1407+
EXPECT_FALSE(batchBufferStart->getPredicationEnable());
1408+
EXPECT_EQ(gpuVirtualAddress, batchBufferStart->getBatchBufferStartAddress());
1409+
parsedOffset += sizeof(WalkerPartition::BATCH_BUFFER_START<FamilyType>);
1410+
}

opencl/test/unit_test/mocks/mock_program.h

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -28,35 +28,35 @@ ClDeviceVector toClDeviceVector(ClDevice &clDevice);
2828
////////////////////////////////////////////////////////////////////////////////
2929
class MockProgram : public Program {
3030
public:
31-
using Program::createProgramFromBinary;
32-
using Program::deviceBuildInfos;
33-
using Program::internalOptionsToExtract;
34-
using Program::kernelDebugEnabled;
35-
using Program::linkBinary;
36-
using Program::separateBlockKernels;
37-
using Program::setBuildStatus;
38-
using Program::updateNonUniformFlag;
39-
31+
using Program::allowNonUniform;
4032
using Program::applyAdditionalOptions;
4133
using Program::areSpecializationConstantsInitialized;
4234
using Program::blockKernelManager;
4335
using Program::buildInfos;
4436
using Program::context;
4537
using Program::createdFrom;
38+
using Program::createProgramFromBinary;
4639
using Program::debugData;
4740
using Program::debugDataSize;
41+
using Program::deviceBuildInfos;
4842
using Program::extractInternalOptions;
4943
using Program::getKernelInfo;
44+
using Program::internalOptionsToExtract;
5045
using Program::irBinary;
5146
using Program::irBinarySize;
5247
using Program::isSpirV;
48+
using Program::kernelDebugEnabled;
49+
using Program::linkBinary;
5350
using Program::options;
5451
using Program::packDeviceBinary;
5552
using Program::Program;
53+
using Program::separateBlockKernels;
54+
using Program::setBuildStatus;
5655
using Program::sourceCode;
5756
using Program::specConstantsIds;
5857
using Program::specConstantsSizes;
5958
using Program::specConstantsValues;
59+
using Program::updateNonUniformFlag;
6060

6161
MockProgram(const ClDeviceVector &deviceVector) : Program(nullptr, false, deviceVector) {
6262
}

shared/source/command_container/implicit_scaling_xehp_and_later.inl

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,7 @@ size_t ImplicitScalingDispatch<GfxFamily>::getSize(bool emitSelfCleanup,
4242
args.emitPipeControlStall = ImplicitScalingHelper::isPipeControlStallRequired();
4343
args.emitBatchBufferEnd = false;
4444
args.staticPartitioning = staticPartitioning;
45+
args.preferredStaticPartitioning = preferStaticPartitioning;
4546

4647
return static_cast<size_t>(WalkerPartition::estimateSpaceRequiredInCommandBuffer<GfxFamily>(args));
4748
}
@@ -76,6 +77,7 @@ void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandS
7677
args.emitBatchBufferEnd = false;
7778
args.secondaryBatchBuffer = useSecondaryBatchBuffer;
7879
args.staticPartitioning = staticPartitioning;
80+
args.preferredStaticPartitioning = preferStaticPartitioning;
7981

8082
if (staticPartitioning) {
8183
UNRECOVERABLE_IF(tileCount != partitionCount);

shared/source/command_container/walker_partition_xehp_and_later.h

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@ struct WalkerPartitionArgs {
3232
bool useAtomicsForSelfCleanup = false;
3333
bool initializeWparidRegister = false;
3434
bool emitPipeControlStall = false;
35+
bool preferredStaticPartitioning = false;
3536
};
3637

3738
template <typename GfxFamily>
@@ -457,6 +458,7 @@ uint64_t computeControlSectionOffset(WalkerPartitionArgs &args) {
457458
if (args.emitSelfCleanup) {
458459
size += computeSelfCleanupSectionSize<GfxFamily>(args.useAtomicsForSelfCleanup);
459460
}
461+
size += args.preferredStaticPartitioning ? sizeof(LOAD_REGISTER_MEM<GfxFamily>) : 0u;
460462
return size;
461463
}
462464

@@ -587,6 +589,10 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
587589
programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, args.tileCount);
588590
}
589591

592+
if (args.preferredStaticPartitioning) {
593+
programMiLoadRegisterMem<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, args.workPartitionAllocationGpuVa, wparidCCSOffset);
594+
}
595+
590596
//this bb start goes to the end of partitioned command buffer
591597
programMiBatchBufferStart<GfxFamily>(
592598
currentBatchBufferPointer,

0 commit comments

Comments
 (0)