Skip to content

Commit ed0c361

Browse files
Apply heuristics when setting TG dispatch size on XE_HPC_CORE
The default thread group (TG) dispatch size can be changed to a better value based on the number of threads in the TG or the number of threads currently available on the GPU. The decision on what the TG dispatch size should be is based on the implemented heuristics. Signed-off-by: Rafal Maziejuk <[email protected]> Related-To: NEO-6989
1 parent 52133e6 commit ed0c361

21 files changed

+188
-19
lines changed

level_zero/core/test/unit_tests/mocks/mock_kernel.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ struct Mock<::L0::Kernel> : public WhiteBox<::L0::Kernel> {
111111
kernelTokens.header = &kernelHeader;
112112

113113
iOpenCL::SPatchExecutionEnvironment execEnv = {};
114+
execEnv.NumGRFRequired = 128;
114115
execEnv.LargestCompiledSIMDSize = 8;
115116
kernelTokens.tokens.executionEnvironment = &execEnv;
116117

opencl/source/command_queue/hardware_interface_bdw_and_later.inl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
6565
size_t globalOffsets[3] = {dispatchInfo.getOffset().x, dispatchInfo.getOffset().y, dispatchInfo.getOffset().z};
6666
size_t startWorkGroups[3] = {walkerArgs.startOfWorkgroups->x, walkerArgs.startOfWorkgroups->y, walkerArgs.startOfWorkgroups->z};
6767
size_t numWorkGroups[3] = {walkerArgs.numberOfWorkgroups->x, walkerArgs.numberOfWorkgroups->y, walkerArgs.numberOfWorkgroups->z};
68+
auto threadGroupCount = static_cast<uint32_t>(walkerArgs.numberOfWorkgroups->x * walkerArgs.numberOfWorkgroups->y * walkerArgs.numberOfWorkgroups->z);
6869

6970
if (walkerArgs.currentTimestampPacketNodes && commandQueue.getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
7071
auto timestampPacketNode = walkerArgs.currentTimestampPacketNodes->peekNodes().at(walkerArgs.currentDispatchIndex);
@@ -83,6 +84,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
8384
kernel.getKernelStartAddress(true, kernelUsesLocalIds, isCcsUsed, false),
8485
simd,
8586
walkerArgs.localWorkSizes,
87+
threadGroupCount,
8688
walkerArgs.offsetInterfaceDescriptorTable,
8789
walkerArgs.interfaceDescriptorIndex,
8890
walkerArgs.preemptionMode,

opencl/source/command_queue/hardware_interface_xehp_and_later.inl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
6060
size_t globalOffsets[3] = {dispatchInfo.getOffset().x, dispatchInfo.getOffset().y, dispatchInfo.getOffset().z};
6161
size_t startWorkGroups[3] = {walkerArgs.startOfWorkgroups->x, walkerArgs.startOfWorkgroups->y, walkerArgs.startOfWorkgroups->z};
6262
size_t numWorkGroups[3] = {walkerArgs.numberOfWorkgroups->x, walkerArgs.numberOfWorkgroups->y, walkerArgs.numberOfWorkgroups->z};
63+
auto threadGroupCount = static_cast<uint32_t>(walkerArgs.numberOfWorkgroups->x * walkerArgs.numberOfWorkgroups->y * walkerArgs.numberOfWorkgroups->z);
6364
uint32_t requiredWalkOrder = 0u;
6465

6566
bool localIdsGenerationByRuntime = EncodeDispatchKernel<GfxFamily>::isRuntimeLocalIdsGenerationRequired(
@@ -98,6 +99,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
9899
kernel.getKernelStartAddress(localIdsGenerationByRuntime, kernelUsesLocalIds, isCcsUsed, false),
99100
simd,
100101
walkerArgs.localWorkSizes,
102+
threadGroupCount,
101103
walkerArgs.offsetInterfaceDescriptorTable,
102104
walkerArgs.interfaceDescriptorIndex,
103105
walkerArgs.preemptionMode,

opencl/source/helpers/hardware_commands_helper.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
4747
size_t bindingTablePointer,
4848
[[maybe_unused]] size_t offsetSamplerState,
4949
uint32_t numSamplers,
50+
const uint32_t threadGroupCount,
5051
uint32_t numThreadsPerThreadGroup,
5152
const Kernel &kernel,
5253
uint32_t bindingTablePrefetchSize,
@@ -79,6 +80,7 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
7980
uint64_t kernelStartOffset,
8081
uint32_t simd,
8182
const size_t localWorkSize[3],
83+
const uint32_t threadGroupCount,
8284
const uint64_t offsetInterfaceDescriptorTable,
8385
uint32_t &interfaceDescriptorIndex,
8486
PreemptionMode preemptionMode,

opencl/source/helpers/hardware_commands_helper_base.inl

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
113113
size_t bindingTablePointer,
114114
[[maybe_unused]] size_t offsetSamplerState,
115115
uint32_t numSamplers,
116+
const uint32_t threadGroupCount,
116117
uint32_t threadsPerThreadGroup,
117118
const Kernel &kernel,
118119
uint32_t bindingTablePrefetchSize,
@@ -169,7 +170,8 @@ size_t HardwareCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
169170
hardwareInfo);
170171

171172
PreemptionHelper::programInterfaceDescriptorDataPreemption<GfxFamily>(&interfaceDescriptor, preemptionMode);
172-
EncodeDispatchKernel<GfxFamily>::adjustInterfaceDescriptorData(interfaceDescriptor, hardwareInfo);
173+
174+
EncodeDispatchKernel<GfxFamily>::adjustInterfaceDescriptorData(interfaceDescriptor, hardwareInfo, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired);
173175

174176
*pInterfaceDescriptor = interfaceDescriptor;
175177
return (size_t)offsetInterfaceDescriptor;
@@ -185,6 +187,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
185187
uint64_t kernelStartOffset,
186188
uint32_t simd,
187189
const size_t localWorkSize[3],
190+
const uint32_t threadGroupCount,
188191
const uint64_t offsetInterfaceDescriptorTable,
189192
uint32_t &interfaceDescriptorIndex,
190193
PreemptionMode preemptionMode,
@@ -263,6 +266,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
263266
dstBindingTablePointer,
264267
samplerStateOffset,
265268
samplerCount,
269+
threadGroupCount,
266270
threadsPerThreadGroup,
267271
kernel,
268272
bindingTablePrefetchSize,

opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,9 +88,10 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenProgramInterfaceDescriptor
8888
auto usedIndirectHeapBefore = indirectHeap.getUsed();
8989
indirectHeap.getSpace(sizeof(INTERFACE_DESCRIPTOR_DATA));
9090

91+
const uint32_t threadGroupCount = 1u;
9192
size_t crossThreadDataSize = kernel->getCrossThreadDataSize();
9293
HardwareCommandsHelper<FamilyType>::sendInterfaceDescriptorData(
93-
indirectHeap, 0, 0, crossThreadDataSize, 64, 0, 0, 0, 1, *kernel, 0, pDevice->getPreemptionMode(), nullptr, *pDevice);
94+
indirectHeap, 0, 0, crossThreadDataSize, 64, 0, 0, 0, threadGroupCount, 1, *kernel, 0, pDevice->getPreemptionMode(), nullptr, *pDevice);
9495

9596
auto usedIndirectHeapAfter = indirectHeap.getUsed();
9697
EXPECT_EQ(sizeof(INTERFACE_DESCRIPTOR_DATA), usedIndirectHeapAfter - usedIndirectHeapBefore);
@@ -309,6 +310,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenAllocatingIndirectStateRes
309310

310311
const size_t localWorkSize = 256;
311312
const size_t localWorkSizes[3]{localWorkSize, 1, 1};
313+
const uint32_t threadGroupCount = 1u;
312314

313315
auto &commandStream = cmdQ.getCS(1024);
314316
auto pWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
@@ -343,6 +345,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenAllocatingIndirectStateRes
343345
kernel->getKernelStartAddress(true, kernelUsesLocalIds, isCcsUsed, false),
344346
kernel->getKernelInfo().getMaxSimdSize(),
345347
localWorkSizes,
348+
threadGroupCount,
346349
idToffset,
347350
interfaceDescriptorIndex,
348351
pDevice->getPreemptionMode(),
@@ -385,6 +388,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWithFourBindingTabl
385388
auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::Type::SURFACE_STATE, 8192);
386389
const size_t localWorkSize = 256;
387390
const size_t localWorkSizes[3]{localWorkSize, 1, 1};
391+
const uint32_t threadGroupCount = 1u;
388392
uint32_t interfaceDescriptorIndex = 0;
389393
auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
390394
auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel);
@@ -398,6 +402,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWithFourBindingTabl
398402
mockKernelWithInternal->mockKernel->getKernelStartAddress(true, kernelUsesLocalIds, isCcsUsed, false),
399403
mockKernelWithInternal->mockKernel->getKernelInfo().getMaxSimdSize(),
400404
localWorkSizes,
405+
threadGroupCount,
401406
0,
402407
interfaceDescriptorIndex,
403408
pDevice->getPreemptionMode(),
@@ -431,6 +436,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWith100BindingTable
431436
auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::Type::SURFACE_STATE, 8192);
432437
const size_t localWorkSize = 256;
433438
const size_t localWorkSizes[3]{localWorkSize, 1, 1};
439+
const uint32_t threadGroupCount = 1u;
434440
uint32_t interfaceDescriptorIndex = 0;
435441
auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
436442
auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel);
@@ -444,6 +450,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWith100BindingTable
444450
mockKernelWithInternal->mockKernel->getKernelStartAddress(true, kernelUsesLocalIds, isCcsUsed, false),
445451
mockKernelWithInternal->mockKernel->getKernelInfo().getMaxSimdSize(),
446452
localWorkSizes,
453+
threadGroupCount,
447454
0,
448455
interfaceDescriptorIndex,
449456
pDevice->getPreemptionMode(),
@@ -487,6 +494,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, whenSendingIndirectStateThenKe
487494
const size_t localWorkSizeY = 3;
488495
const size_t localWorkSizeZ = 4;
489496
const size_t localWorkSizes[3]{localWorkSizeX, localWorkSizeY, localWorkSizeZ};
497+
const uint32_t threadGroupCount = 1u;
490498

491499
auto &commandStream = cmdQ.getCS(1024);
492500
auto pWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
@@ -523,6 +531,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, whenSendingIndirectStateThenKe
523531
mockKernel.getKernelStartAddress(true, kernelUsesLocalIds, isCcsUsed, false),
524532
modifiedKernelInfo.getMaxSimdSize(),
525533
localWorkSizes,
534+
threadGroupCount,
526535
idToffset,
527536
interfaceDescriptorIndex,
528537
pDevice->getPreemptionMode(),
@@ -578,6 +587,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenSendingIndirectStateThenBi
578587
ASSERT_NE(nullptr, kernel);
579588

580589
const size_t localWorkSizes[3]{256, 1, 1};
590+
const uint32_t threadGroupCount = 1u;
581591

582592
auto &commandStream = cmdQ.getCS(1024);
583593
auto pWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
@@ -613,6 +623,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenSendingIndirectStateThenBi
613623
kernel->getKernelStartAddress(true, kernelUsesLocalIds, isCcsUsed, false),
614624
kernel->getKernelInfo().getMaxSimdSize(),
615625
localWorkSizes,
626+
threadGroupCount,
616627
0,
617628
interfaceDescriptorIndex,
618629
pDevice->getPreemptionMode(),
@@ -701,6 +712,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenGettingBindingTableStateTh
701712
EXPECT_EQ(numSurfaces, pKernel->getNumberOfBindingTableStates());
702713

703714
const size_t localWorkSizes[3]{256, 1, 1};
715+
const uint32_t threadGroupCount = 1u;
704716

705717
dsh.getSpace(sizeof(INTERFACE_DESCRIPTOR_DATA));
706718

@@ -722,6 +734,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenGettingBindingTableStateTh
722734
pKernel->getKernelStartAddress(true, kernelUsesLocalIds, isCcsUsed, false),
723735
pKernel->getKernelInfo().getMaxSimdSize(),
724736
localWorkSizes,
737+
threadGroupCount,
725738
0,
726739
interfaceDescriptorIndex,
727740
pDevice->getPreemptionMode(),
@@ -847,6 +860,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithInvalidSamplerS
847860
auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::Type::SURFACE_STATE, 8192);
848861
const size_t localWorkSize = 256;
849862
const size_t localWorkSizes[3]{localWorkSize, 1, 1};
863+
const uint32_t threadGroupCount = 1u;
850864
uint32_t interfaceDescriptorIndex = 0;
851865
auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
852866
auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel);
@@ -863,6 +877,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithInvalidSamplerS
863877
mockKernelWithInternal->mockKernel->getKernelStartAddress(true, kernelUsesLocalIds, isCcsUsed, false),
864878
mockKernelWithInternal->mockKernel->getKernelInfo().getMaxSimdSize(),
865879
localWorkSizes,
880+
threadGroupCount,
866881
0,
867882
interfaceDescriptorIndex,
868883
pDevice->getPreemptionMode(),
@@ -887,6 +902,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithInvalidSamplerS
887902
mockKernelWithInternal->mockKernel->getKernelStartAddress(true, kernelUsesLocalIds, isCcsUsed, false),
888903
mockKernelWithInternal->mockKernel->getKernelInfo().getMaxSimdSize(),
889904
localWorkSizes,
905+
threadGroupCount,
890906
0,
891907
interfaceDescriptorIndex,
892908
pDevice->getPreemptionMode(),
@@ -909,6 +925,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithSamplersWhenInd
909925

910926
CommandQueueHw<FamilyType> cmdQ(nullptr, pClDevice, 0, false);
911927
const size_t localWorkSizes[3]{1, 1, 1};
928+
const uint32_t threadGroupCount = 1u;
912929

913930
auto &commandStream = cmdQ.getCS(1024);
914931
auto pWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
@@ -957,6 +974,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithSamplersWhenInd
957974
mockKernelWithInternal->mockKernel->getKernelStartAddress(true, kernelUsesLocalIds, isCcsUsed, false),
958975
8,
959976
localWorkSizes,
977+
threadGroupCount,
960978
interfaceDescriptorTableOffset,
961979
interfaceDescriptorIndex,
962980
pDevice->getPreemptionMode(),

opencl/test/unit_test/kernel/kernel_slm_tests.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, KernelSLMAndBarrierTest, GivenStaticSlmSizeWhenProgr
6767
// After creating Mock Kernel now create Indirect Heap
6868
auto &indirectHeap = cmdQ.getIndirectHeap(IndirectHeap::Type::DYNAMIC_STATE, 8192);
6969

70+
const uint32_t threadGroupCount = 1u;
7071
uint64_t interfaceDescriptorOffset = indirectHeap.getUsed();
7172

7273
size_t offsetInterfaceDescriptorData = HardwareCommandsHelper<FamilyType>::sendInterfaceDescriptorData(
@@ -78,6 +79,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, KernelSLMAndBarrierTest, GivenStaticSlmSizeWhenProgr
7879
0,
7980
0,
8081
0,
82+
threadGroupCount,
8183
1,
8284
kernel,
8385
4u,
@@ -154,6 +156,7 @@ HWTEST_F(KernelSLMAndBarrierTest, GivenInterfaceDescriptorProgrammedWhenOverride
154156
CommandQueueHw<FamilyType> cmdQ(nullptr, pClDevice, 0, false);
155157
auto &indirectHeap = cmdQ.getIndirectHeap(IndirectHeap::Type::DYNAMIC_STATE, 8192);
156158

159+
const uint32_t threadGroupCount = 1u;
157160
uint64_t interfaceDescriptorOffset = indirectHeap.getUsed();
158161
INTERFACE_DESCRIPTOR_DATA interfaceDescriptorData;
159162

@@ -166,6 +169,7 @@ HWTEST_F(KernelSLMAndBarrierTest, GivenInterfaceDescriptorProgrammedWhenOverride
166169
0,
167170
0,
168171
0,
172+
threadGroupCount,
169173
1,
170174
kernel,
171175
4u,

shared/source/command_container/command_encoder.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ struct EncodeDispatchKernel {
9797

9898
static void programBarrierEnable(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo);
9999

100-
static void adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const HardwareInfo &hwInfo);
100+
static void adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf);
101101

102102
static void adjustBindingTablePrefetch(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t samplerCount, uint32_t bindingTableEntryCount);
103103

shared/source/command_container/command_encoder.inl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -672,7 +672,7 @@ void EncodeDispatchKernel<Family>::adjustBindingTablePrefetch(INTERFACE_DESCRIPT
672672
}
673673

674674
template <typename Family>
675-
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const HardwareInfo &hwInfo) {}
675+
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf) {}
676676

677677
template <typename Family>
678678
constexpr bool EncodeDispatchKernel<Family>::shouldUpdateGlobalAtomics(bool &currentVal, bool refVal, bool updateCurrent) { return false; }

shared/source/command_container/command_encoder_bdw_and_later.inl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,8 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
221221
container.getResidencyContainer().push_back(args.device->getBindlessHeapsHelper()->getHeap(NEO::BindlessHeapsHelper::BindlesHeapType::GLOBAL_DSH)->getGraphicsAllocation());
222222
}
223223

224-
EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(idd, hwInfo);
224+
auto threadGroupCount = cmd.getThreadGroupIdXDimension() * cmd.getThreadGroupIdYDimension() * cmd.getThreadGroupIdZDimension();
225+
EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(idd, hwInfo, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired);
225226

226227
PreemptionHelper::applyPreemptionWaCmdsBegin<Family>(listCmdBufferStream, *args.device);
227228

0 commit comments

Comments
 (0)