Skip to content

Commit bbdf1ac

Browse files
refactor: change encoder for thread group over dispatch 1/n
- change the method name to a more meaningful one
- add all inputs of the algorithm as explicit function arguments
- position all implementations accordingly
- rename unit tests to match the new method name
- fix unit tests to have correct initial command values
- fix unit tests to have input values consistent with command values
- fix unit tests to change input values together with command values

Related-To: NEO-12639
Signed-off-by: Zbigniew Zdanowicz <[email protected]>
1 parent 8c3c703 commit bbdf1ac

11 files changed

+343
-244
lines changed

opencl/source/helpers/hardware_commands_helper_base.inl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -196,8 +196,8 @@ size_t HardwareCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
196196
defaultPipelinedThreadArbitrationPolicy = NEO::debugManager.flags.OverrideThreadArbitrationPolicy.get();
197197
}
198198
EncodeDispatchKernel<GfxFamily>::encodeEuSchedulingPolicy(&interfaceDescriptor, kernelDescriptor, defaultPipelinedThreadArbitrationPolicy);
199-
200-
EncodeDispatchKernel<GfxFamily>::adjustInterfaceDescriptorData(interfaceDescriptor, device, hardwareInfo, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, *walkerCmd);
199+
// Thread group counts per dimension in X, Y, Z order, read back from the walker command.
// The third entry must come from the Z dimension getter (it previously repeated X).
const uint32_t threadGroupDimensions[] = {walkerCmd->getThreadGroupIdXDimension(), walkerCmd->getThreadGroupIdYDimension(), walkerCmd->getThreadGroupIdZDimension()};
200+
EncodeDispatchKernel<GfxFamily>::encodeThreadGroupDispatch(interfaceDescriptor, device, hardwareInfo, threadGroupDimensions, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, threadsPerThreadGroup, *walkerCmd);
201201

202202
*pInterfaceDescriptor = interfaceDescriptor;
203203
return (size_t)offsetInterfaceDescriptor;

shared/source/command_container/command_encoder.h

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -172,10 +172,9 @@ struct EncodeDispatchKernel {
172172
static void programBarrierEnable(InterfaceDescriptorType &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo);
173173

174174
template <typename WalkerType, typename InterfaceDescriptorType>
175-
static void adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd);
176-
177-
template <typename WalkerType, typename InterfaceDescriptorType>
178-
static void adjustInterfaceDescriptorDataForOverdispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd);
175+
static void encodeThreadGroupDispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo,
176+
const uint32_t *threadGroupDimensions, const uint32_t threadGroupCount, const uint32_t grfCount, const uint32_t threadsPerThreadGroup,
177+
WalkerType &walkerCmd);
179178

180179
static void adjustBindingTablePrefetch(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t samplerCount, uint32_t bindingTableEntryCount);
181180

shared/source/command_container/command_encoder.inl

Lines changed: 0 additions & 117 deletions
Original file line numberDiff line numberDiff line change
@@ -761,10 +761,6 @@ void EncodeDispatchKernel<Family>::adjustBindingTablePrefetch(INTERFACE_DESCRIPT
761761
}
762762
}
763763

764-
template <typename Family>
765-
template <typename WalkerType, typename InterfaceDescriptorType>
766-
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd) {}
767-
768764
template <typename Family>
769765
size_t EncodeDispatchKernel<Family>::getSizeRequiredDsh(const KernelDescriptor &kernelDescriptor, uint32_t iddCount) {
770766
using INTERFACE_DESCRIPTOR_DATA = typename Family::INTERFACE_DESCRIPTOR_DATA;
@@ -791,119 +787,6 @@ size_t EncodeDispatchKernel<Family>::getSizeRequiredDsh(const KernelDescriptor &
791787
return size;
792788
}
793789

794-
template <typename GfxFamily>
795-
template <typename WalkerType, typename InterfaceDescriptorType>
796-
void EncodeDispatchKernel<GfxFamily>::adjustInterfaceDescriptorDataForOverdispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd) {
797-
const auto &productHelper = device.getProductHelper();
798-
799-
if (productHelper.isDisableOverdispatchAvailable(hwInfo)) {
800-
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
801-
802-
bool adjustTGDispatchSize = true;
803-
if (debugManager.flags.AdjustThreadGroupDispatchSize.get() != -1) {
804-
adjustTGDispatchSize = !!debugManager.flags.AdjustThreadGroupDispatchSize.get();
805-
}
806-
// apply v2 algorithm only for parts where MaxSubSlicesSupported is equal to SubSliceCount
807-
auto algorithmVersion = hwInfo.gtSystemInfo.MaxSubSlicesSupported == hwInfo.gtSystemInfo.SubSliceCount ? 2 : 1;
808-
if (debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get() != -1) {
809-
algorithmVersion = debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get();
810-
}
811-
812-
if (algorithmVersion == 2) {
813-
auto threadsPerXeCore = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.MaxSubSlicesSupported;
814-
if (grfCount == 256) {
815-
threadsPerXeCore /= 2;
816-
}
817-
auto tgDispatchSizeSelected = 8;
818-
uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup();
819-
820-
if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) {
821-
while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) {
822-
tgDispatchSizeSelected /= 2;
823-
}
824-
} else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) {
825-
while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) {
826-
tgDispatchSizeSelected /= 2;
827-
}
828-
}
829-
830-
auto workgroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
831-
auto tileCount = ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true) ? device.getNumSubDevices() : 1u;
832-
833-
// make sure we fit all xe core
834-
while (workgroupCount / tgDispatchSizeSelected < hwInfo.gtSystemInfo.MaxSubSlicesSupported * tileCount && tgDispatchSizeSelected > 1) {
835-
tgDispatchSizeSelected /= 2;
836-
}
837-
838-
auto threadCountPerGrouping = tgDispatchSizeSelected * numberOfThreadsInThreadGroup;
839-
// make sure we do not use more threads than present on each xe core
840-
while (threadCountPerGrouping > threadsPerXeCore && tgDispatchSizeSelected > 1) {
841-
tgDispatchSizeSelected /= 2;
842-
threadCountPerGrouping /= 2;
843-
}
844-
845-
if (tgDispatchSizeSelected == 8) {
846-
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
847-
} else if (tgDispatchSizeSelected == 1) {
848-
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
849-
} else if (tgDispatchSizeSelected == 2) {
850-
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2);
851-
} else {
852-
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4);
853-
}
854-
} else {
855-
if (adjustTGDispatchSize) {
856-
UNRECOVERABLE_IF(grfCount == 0u);
857-
constexpr uint32_t maxThreadsInTGForTGDispatchSize8 = 16u;
858-
constexpr uint32_t maxThreadsInTGForTGDispatchSize4 = 32u;
859-
auto &gfxCoreHelper = device.getGfxCoreHelper();
860-
uint32_t availableThreadCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, grfCount);
861-
if (ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true)) {
862-
const uint32_t tilesCount = device.getNumSubDevices();
863-
availableThreadCount *= tilesCount;
864-
}
865-
uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup();
866-
uint32_t dispatchedTotalThreadCount = numberOfThreadsInThreadGroup * threadGroupCount;
867-
UNRECOVERABLE_IF(numberOfThreadsInThreadGroup == 0u);
868-
auto tgDispatchSizeSelected = 1u;
869-
870-
if (dispatchedTotalThreadCount <= availableThreadCount) {
871-
tgDispatchSizeSelected = 1;
872-
} else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize8) {
873-
tgDispatchSizeSelected = 8;
874-
} else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize4) {
875-
tgDispatchSizeSelected = 4;
876-
} else {
877-
tgDispatchSizeSelected = 2;
878-
}
879-
if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) {
880-
while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) {
881-
tgDispatchSizeSelected /= 2;
882-
}
883-
} else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) {
884-
while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) {
885-
tgDispatchSizeSelected /= 2;
886-
}
887-
}
888-
if (tgDispatchSizeSelected == 8) {
889-
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
890-
} else if (tgDispatchSizeSelected == 1) {
891-
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
892-
} else if (tgDispatchSizeSelected == 2) {
893-
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2);
894-
} else {
895-
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4);
896-
}
897-
}
898-
}
899-
}
900-
901-
if (debugManager.flags.ForceThreadGroupDispatchSize.get() != -1) {
902-
interfaceDescriptor.setThreadGroupDispatchSize(static_cast<typename InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE>(
903-
debugManager.flags.ForceThreadGroupDispatchSize.get()));
904-
}
905-
}
906-
907790
template <typename Family>
908791
size_t EncodeDispatchKernel<Family>::getSizeRequiredSsh(const KernelInfo &kernelInfo) {
909792
size_t requiredSshSize = kernelInfo.heapInfo.surfaceStateHeapSize;

shared/source/command_container/command_encoder_bdw_and_later.inl

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -67,12 +67,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
6767

6868
LinearStream *listCmdBufferStream = container.getCommandStream();
6969

70-
auto threadDims = static_cast<const uint32_t *>(args.threadGroupDimensions);
71-
const Vec3<size_t> threadStartVec{0, 0, 0};
72-
Vec3<size_t> threadDimsVec{0, 0, 0};
73-
if (!args.isIndirect) {
74-
threadDimsVec = {threadDims[0], threadDims[1], threadDims[2]};
75-
}
70+
auto threadGroupDims = static_cast<const uint32_t *>(args.threadGroupDimensions);
7671

7772
DefaultWalkerType cmd = Family::cmdInitGpgpuWalker;
7873
auto idd = Family::cmdInitInterfaceDescriptorData;
@@ -267,11 +262,11 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
267262

268263
EncodeDispatchKernel<Family>::encodeThreadData(cmd,
269264
nullptr,
270-
threadDims,
265+
threadGroupDims,
271266
args.dispatchInterface->getGroupSize(),
272267
kernelDescriptor.kernelAttributes.simdSize,
273268
kernelDescriptor.kernelAttributes.numLocalIdChannels,
274-
args.dispatchInterface->getNumThreadsPerThreadGroup(),
269+
numThreadsPerThreadGroup,
275270
args.dispatchInterface->getThreadExecutionMask(),
276271
true,
277272
false,
@@ -282,7 +277,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
282277
cmd.setPredicateEnable(args.isPredicate);
283278

284279
auto threadGroupCount = cmd.getThreadGroupIdXDimension() * cmd.getThreadGroupIdYDimension() * cmd.getThreadGroupIdZDimension();
285-
EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(idd, *args.device, hwInfo, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, cmd);
280+
EncodeDispatchKernel<Family>::encodeThreadGroupDispatch(idd, *args.device, hwInfo, threadGroupDims, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, numThreadsPerThreadGroup, cmd);
286281

287282
memcpy_s(iddPtr, sizeof(idd), &idd, sizeof(idd));
288283

@@ -635,4 +630,11 @@ template <typename WalkerType, typename InterfaceDescriptorType>
635630
void EncodeDispatchKernel<Family>::overrideDefaultValues(WalkerType &walkerCmd, InterfaceDescriptorType &interfaceDescriptor) {
636631
}
637632

633+
template <typename Family>
634+
template <typename WalkerType, typename InterfaceDescriptorType>
635+
void EncodeDispatchKernel<Family>::encodeThreadGroupDispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo,
636+
const uint32_t *threadGroupDimensions, const uint32_t threadGroupCount, const uint32_t grfCount, const uint32_t threadsPerThreadGroup,
637+
WalkerType &walkerCmd) {
638+
}
639+
638640
} // namespace NEO

shared/source/command_container/command_encoder_enablers.inl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ template void NEO::EncodeDispatchKernel<Family>::setupPostSyncForRegularEvent<Fa
1616
template void NEO::EncodeDispatchKernel<Family>::setupPostSyncForInOrderExec<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const EncodeDispatchKernelArgs &args);
1717
template void NEO::EncodeDispatchKernel<Family>::setGrfInfo<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t grfCount, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment);
1818
template void NEO::EncodeDispatchKernel<Family>::setupPreferredSlmSize<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy);
19-
template void NEO::EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData<Family::DefaultWalkerType, Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, Family::DefaultWalkerType &walkerCmd);
19+
template void NEO::EncodeDispatchKernel<Family>::encodeThreadGroupDispatch<Family::DefaultWalkerType, Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t *threadGroupDimensions, const uint32_t threadGroupCount, const uint32_t grfCount, const uint32_t threadsPerThreadGroup, Family::DefaultWalkerType &walkerCmd);
2020
template void NEO::EncodeDispatchKernel<Family>::setupPostSyncMocs<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush);
2121
template void NEO::EncodeDispatchKernel<Family>::encode<Family::DefaultWalkerType>(CommandContainer &container, EncodeDispatchKernelArgs &args);
2222
template void NEO::EncodeDispatchKernel<Family>::encodeThreadData<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const uint32_t *startWorkGroup, const uint32_t *numWorkGroups, const uint32_t *workGroupSizes, uint32_t simd, uint32_t localIdDimensions, uint32_t threadsPerThreadGroup, uint32_t threadExecutionMask, bool localIdsGenerationByRuntime, bool inlineDataProgrammingRequired, bool isIndirect, uint32_t requiredWorkGroupOrder, const RootDeviceEnvironment &rootDeviceEnvironment);

shared/source/command_container/command_encoder_xe_hpc_core_and_later.inl

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,4 @@ void EncodeBatchBufferStartOrEnd<Family>::appendBatchBufferStart(MI_BATCH_BUFFER
3333
cmd.setPredicationEnable(predicate);
3434
}
3535

36-
template <>
37-
template <typename WalkerType, typename InterfaceDescriptorType>
38-
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd) {
39-
EncodeDispatchKernel<Family>::adjustInterfaceDescriptorDataForOverdispatch(interfaceDescriptor, device, hwInfo, threadGroupCount, grfCount, walkerCmd);
40-
}
41-
4236
} // namespace NEO

0 commit comments

Comments
 (0)