intel
diff --git a/‎opencl/source/helpers/hardware_commands_helper_base.inl‎
Lines changed: 2 additions & 2 deletions b/‎opencl/source/helpers/hardware_commands_helper_base.inl‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎shared/source/command_container/command_encoder.h‎
Lines changed: 3 additions & 4 deletions b/‎shared/source/command_container/command_encoder.h‎
Lines changed: 3 additions & 4 deletions
diff --git a/‎shared/source/command_container/command_encoder.inl‎
Lines changed: 0 additions & 117 deletions b/‎shared/source/command_container/command_encoder.inl‎
Lines changed: 0 additions & 117 deletions
diff --git a/‎shared/source/command_container/command_encoder_bdw_and_later.inl‎
Lines changed: 11 additions & 9 deletions b/‎shared/source/command_container/command_encoder_bdw_and_later.inl‎
Lines changed: 11 additions & 9 deletions
diff --git a/‎shared/source/command_container/command_encoder_enablers.inl‎
Lines changed: 1 addition & 1 deletion b/‎shared/source/command_container/command_encoder_enablers.inl‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎shared/source/command_container/command_encoder_xe_hpc_core_and_later.inl‎
Lines changed: 0 additions & 6 deletions b/‎shared/source/command_container/command_encoder_xe_hpc_core_and_later.inl‎
Lines changed: 0 additions & 6 deletions
@@ -196,8 +196,8 @@ size_t HardwareCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
         defaultPipelinedThreadArbitrationPolicy = NEO::debugManager.flags.OverrideThreadArbitrationPolicy.get();
     }
     EncodeDispatchKernel<GfxFamily>::encodeEuSchedulingPolicy(&interfaceDescriptor, kernelDescriptor, defaultPipelinedThreadArbitrationPolicy);
-
-    EncodeDispatchKernel<GfxFamily>::adjustInterfaceDescriptorData(interfaceDescriptor, device, hardwareInfo, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, *walkerCmd);
+    const uint32_t threadGroupDimensions[] = {walkerCmd->getThreadGroupIdXDimension(), walkerCmd->getThreadGroupIdYDimension(), walkerCmd->getThreadGroupIdZDimension()}; // one entry per axis, in X, Y, Z order
+    EncodeDispatchKernel<GfxFamily>::encodeThreadGroupDispatch(interfaceDescriptor, device, hardwareInfo, threadGroupDimensions, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, threadsPerThreadGroup, *walkerCmd);
 
     *pInterfaceDescriptor = interfaceDescriptor;
     return (size_t)offsetInterfaceDescriptor;
 
@@ -172,10 +172,9 @@ struct EncodeDispatchKernel {
     static void programBarrierEnable(InterfaceDescriptorType &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo);
 
     template <typename WalkerType, typename InterfaceDescriptorType>
-    static void adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd);
-
-    template <typename WalkerType, typename InterfaceDescriptorType>
-    static void adjustInterfaceDescriptorDataForOverdispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd);
+    static void encodeThreadGroupDispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo,
+                                          const uint32_t *threadGroupDimensions, const uint32_t threadGroupCount, const uint32_t grfCount, const uint32_t threadsPerThreadGroup,
+                                          WalkerType &walkerCmd);
 
     static void adjustBindingTablePrefetch(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t samplerCount, uint32_t bindingTableEntryCount);
 
 
@@ -761,10 +761,6 @@ void EncodeDispatchKernel<Family>::adjustBindingTablePrefetch(INTERFACE_DESCRIPT
     }
 }
 
-template <typename Family>
-template <typename WalkerType, typename InterfaceDescriptorType>
-void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd) {}
-
 template <typename Family>
 size_t EncodeDispatchKernel<Family>::getSizeRequiredDsh(const KernelDescriptor &kernelDescriptor, uint32_t iddCount) {
     using INTERFACE_DESCRIPTOR_DATA = typename Family::INTERFACE_DESCRIPTOR_DATA;
@@ -791,119 +787,6 @@ size_t EncodeDispatchKernel<Family>::getSizeRequiredDsh(const KernelDescriptor &
     return size;
 }
 
-template <typename GfxFamily>
-template <typename WalkerType, typename InterfaceDescriptorType>
-void EncodeDispatchKernel<GfxFamily>::adjustInterfaceDescriptorDataForOverdispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd) {
-    const auto &productHelper = device.getProductHelper();
-
-    if (productHelper.isDisableOverdispatchAvailable(hwInfo)) {
-        interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
-
-        bool adjustTGDispatchSize = true;
-        if (debugManager.flags.AdjustThreadGroupDispatchSize.get() != -1) {
-            adjustTGDispatchSize = !!debugManager.flags.AdjustThreadGroupDispatchSize.get();
-        }
-        // apply v2 algorithm only for parts where MaxSubSlicesSupported is equal to SubSliceCount
-        auto algorithmVersion = hwInfo.gtSystemInfo.MaxSubSlicesSupported == hwInfo.gtSystemInfo.SubSliceCount ? 2 : 1;
-        if (debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get() != -1) {
-            algorithmVersion = debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get();
-        }
-
-        if (algorithmVersion == 2) {
-            auto threadsPerXeCore = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.MaxSubSlicesSupported;
-            if (grfCount == 256) {
-                threadsPerXeCore /= 2;
-            }
-            auto tgDispatchSizeSelected = 8;
-            uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup();
-
-            if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) {
-                while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) {
-                    tgDispatchSizeSelected /= 2;
-                }
-            } else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) {
-                while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) {
-                    tgDispatchSizeSelected /= 2;
-                }
-            }
-
-            auto workgroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
-            auto tileCount = ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true) ? device.getNumSubDevices() : 1u;
-
-            // make sure we fit all xe core
-            while (workgroupCount / tgDispatchSizeSelected < hwInfo.gtSystemInfo.MaxSubSlicesSupported * tileCount && tgDispatchSizeSelected > 1) {
-                tgDispatchSizeSelected /= 2;
-            }
-
-            auto threadCountPerGrouping = tgDispatchSizeSelected * numberOfThreadsInThreadGroup;
-            // make sure we do not use more threads then present on each xe core
-            while (threadCountPerGrouping > threadsPerXeCore && tgDispatchSizeSelected > 1) {
-                tgDispatchSizeSelected /= 2;
-                threadCountPerGrouping /= 2;
-            }
-
-            if (tgDispatchSizeSelected == 8) {
-                interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
-            } else if (tgDispatchSizeSelected == 1) {
-                interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
-            } else if (tgDispatchSizeSelected == 2) {
-                interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2);
-            } else {
-                interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4);
-            }
-        } else {
-            if (adjustTGDispatchSize) {
-                UNRECOVERABLE_IF(grfCount == 0u);
-                constexpr uint32_t maxThreadsInTGForTGDispatchSize8 = 16u;
-                constexpr uint32_t maxThreadsInTGForTGDispatchSize4 = 32u;
-                auto &gfxCoreHelper = device.getGfxCoreHelper();
-                uint32_t availableThreadCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, grfCount);
-                if (ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true)) {
-                    const uint32_t tilesCount = device.getNumSubDevices();
-                    availableThreadCount *= tilesCount;
-                }
-                uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup();
-                uint32_t dispatchedTotalThreadCount = numberOfThreadsInThreadGroup * threadGroupCount;
-                UNRECOVERABLE_IF(numberOfThreadsInThreadGroup == 0u);
-                auto tgDispatchSizeSelected = 1u;
-
-                if (dispatchedTotalThreadCount <= availableThreadCount) {
-                    tgDispatchSizeSelected = 1;
-                } else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize8) {
-                    tgDispatchSizeSelected = 8;
-                } else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize4) {
-                    tgDispatchSizeSelected = 4;
-                } else {
-                    tgDispatchSizeSelected = 2;
-                }
-                if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) {
-                    while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) {
-                        tgDispatchSizeSelected /= 2;
-                    }
-                } else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) {
-                    while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) {
-                        tgDispatchSizeSelected /= 2;
-                    }
-                }
-                if (tgDispatchSizeSelected == 8) {
-                    interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
-                } else if (tgDispatchSizeSelected == 1) {
-                    interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
-                } else if (tgDispatchSizeSelected == 2) {
-                    interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2);
-                } else {
-                    interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4);
-                }
-            }
-        }
-    }
-
-    if (debugManager.flags.ForceThreadGroupDispatchSize.get() != -1) {
-        interfaceDescriptor.setThreadGroupDispatchSize(static_cast<typename InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE>(
-            debugManager.flags.ForceThreadGroupDispatchSize.get()));
-    }
-}
-
 template <typename Family>
 size_t EncodeDispatchKernel<Family>::getSizeRequiredSsh(const KernelInfo &kernelInfo) {
     size_t requiredSshSize = kernelInfo.heapInfo.surfaceStateHeapSize;
 
@@ -67,12 +67,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
 
     LinearStream *listCmdBufferStream = container.getCommandStream();
 
-    auto threadDims = static_cast<const uint32_t *>(args.threadGroupDimensions);
-    const Vec3<size_t> threadStartVec{0, 0, 0};
-    Vec3<size_t> threadDimsVec{0, 0, 0};
-    if (!args.isIndirect) {
-        threadDimsVec = {threadDims[0], threadDims[1], threadDims[2]};
-    }
+    auto threadGroupDims = static_cast<const uint32_t *>(args.threadGroupDimensions);
 
     DefaultWalkerType cmd = Family::cmdInitGpgpuWalker;
     auto idd = Family::cmdInitInterfaceDescriptorData;
@@ -267,11 +262,11 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
 
     EncodeDispatchKernel<Family>::encodeThreadData(cmd,
                                                    nullptr,
-                                                   threadDims,
+                                                   threadGroupDims,
                                                    args.dispatchInterface->getGroupSize(),
                                                    kernelDescriptor.kernelAttributes.simdSize,
                                                    kernelDescriptor.kernelAttributes.numLocalIdChannels,
-                                                   args.dispatchInterface->getNumThreadsPerThreadGroup(),
+                                                   numThreadsPerThreadGroup,
                                                    args.dispatchInterface->getThreadExecutionMask(),
                                                    true,
                                                    false,
@@ -282,7 +277,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
     cmd.setPredicateEnable(args.isPredicate);
 
     auto threadGroupCount = cmd.getThreadGroupIdXDimension() * cmd.getThreadGroupIdYDimension() * cmd.getThreadGroupIdZDimension();
-    EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(idd, *args.device, hwInfo, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, cmd);
+    EncodeDispatchKernel<Family>::encodeThreadGroupDispatch(idd, *args.device, hwInfo, threadGroupDims, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, numThreadsPerThreadGroup, cmd);
 
     memcpy_s(iddPtr, sizeof(idd), &idd, sizeof(idd));
 
@@ -635,4 +630,11 @@ template <typename WalkerType, typename InterfaceDescriptorType>
 void EncodeDispatchKernel<Family>::overrideDefaultValues(WalkerType &walkerCmd, InterfaceDescriptorType &interfaceDescriptor) {
 }
 
+template <typename Family>
+template <typename WalkerType, typename InterfaceDescriptorType>
+void EncodeDispatchKernel<Family>::encodeThreadGroupDispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo,
+                                                             const uint32_t *threadGroupDimensions, const uint32_t threadGroupCount, const uint32_t grfCount, const uint32_t threadsPerThreadGroup,
+                                                             WalkerType &walkerCmd) { // empty body: this definition performs no work and leaves interfaceDescriptor and walkerCmd unmodified
+}
+
 } // namespace NEO
@@ -16,7 +16,7 @@ template void NEO::EncodeDispatchKernel<Family>::setupPostSyncForRegularEvent<Fa
 template void NEO::EncodeDispatchKernel<Family>::setupPostSyncForInOrderExec<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const EncodeDispatchKernelArgs &args);
 template void NEO::EncodeDispatchKernel<Family>::setGrfInfo<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t grfCount, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment);
 template void NEO::EncodeDispatchKernel<Family>::setupPreferredSlmSize<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy);
-template void NEO::EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData<Family::DefaultWalkerType, Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, Family::DefaultWalkerType &walkerCmd);
+template void NEO::EncodeDispatchKernel<Family>::encodeThreadGroupDispatch<Family::DefaultWalkerType, Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t *threadGroupDimensions, const uint32_t threadGroupCount, const uint32_t grfCount, const uint32_t threadsPerThreadGroup, Family::DefaultWalkerType &walkerCmd);
 template void NEO::EncodeDispatchKernel<Family>::setupPostSyncMocs<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush);
 template void NEO::EncodeDispatchKernel<Family>::encode<Family::DefaultWalkerType>(CommandContainer &container, EncodeDispatchKernelArgs &args);
 template void NEO::EncodeDispatchKernel<Family>::encodeThreadData<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const uint32_t *startWorkGroup, const uint32_t *numWorkGroups, const uint32_t *workGroupSizes, uint32_t simd, uint32_t localIdDimensions, uint32_t threadsPerThreadGroup, uint32_t threadExecutionMask, bool localIdsGenerationByRuntime, bool inlineDataProgrammingRequired, bool isIndirect, uint32_t requiredWorkGroupOrder, const RootDeviceEnvironment &rootDeviceEnvironment);
 
@@ -33,10 +33,4 @@ void EncodeBatchBufferStartOrEnd<Family>::appendBatchBufferStart(MI_BATCH_BUFFER
     cmd.setPredicationEnable(predicate);
 }
 
-template <>
-template <typename WalkerType, typename InterfaceDescriptorType>
-void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd) {
-    EncodeDispatchKernel<Family>::adjustInterfaceDescriptorDataForOverdispatch(interfaceDescriptor, device, hwInfo, threadGroupCount, grfCount, walkerCmd);
-}
-
 } // namespace NEO