@@ -761,10 +761,6 @@ void EncodeDispatchKernel<Family>::adjustBindingTablePrefetch(INTERFACE_DESCRIPT
761761 }
762762}
763763
764- template <typename Family>
765- template <typename WalkerType, typename InterfaceDescriptorType>
766- void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd) {}
767-
768764template <typename Family>
769765size_t EncodeDispatchKernel<Family>::getSizeRequiredDsh(const KernelDescriptor &kernelDescriptor, uint32_t iddCount) {
770766 using INTERFACE_DESCRIPTOR_DATA = typename Family::INTERFACE_DESCRIPTOR_DATA;
@@ -791,119 +787,6 @@ size_t EncodeDispatchKernel<Family>::getSizeRequiredDsh(const KernelDescriptor &
791787 return size;
792788}
793789
794- template <typename GfxFamily>
795- template <typename WalkerType, typename InterfaceDescriptorType>
796- void EncodeDispatchKernel<GfxFamily>::adjustInterfaceDescriptorDataForOverdispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd) {
797- const auto &productHelper = device.getProductHelper ();
798-
799- if (productHelper.isDisableOverdispatchAvailable (hwInfo)) {
800- interfaceDescriptor.setThreadGroupDispatchSize (InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
801-
802- bool adjustTGDispatchSize = true ;
803- if (debugManager.flags .AdjustThreadGroupDispatchSize .get () != -1 ) {
804- adjustTGDispatchSize = !!debugManager.flags .AdjustThreadGroupDispatchSize .get ();
805- }
806- // apply v2 algorithm only for parts where MaxSubSlicesSupported is equal to SubSliceCount
807- auto algorithmVersion = hwInfo.gtSystemInfo .MaxSubSlicesSupported == hwInfo.gtSystemInfo .SubSliceCount ? 2 : 1 ;
808- if (debugManager.flags .ForceThreadGroupDispatchSizeAlgorithm .get () != -1 ) {
809- algorithmVersion = debugManager.flags .ForceThreadGroupDispatchSizeAlgorithm .get ();
810- }
811-
812- if (algorithmVersion == 2 ) {
813- auto threadsPerXeCore = hwInfo.gtSystemInfo .ThreadCount / hwInfo.gtSystemInfo .MaxSubSlicesSupported ;
814- if (grfCount == 256 ) {
815- threadsPerXeCore /= 2 ;
816- }
817- auto tgDispatchSizeSelected = 8 ;
818- uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup ();
819-
820- if (walkerCmd.getThreadGroupIdXDimension () > 1 && (walkerCmd.getThreadGroupIdYDimension () > 1 || walkerCmd.getThreadGroupIdZDimension () > 1 )) {
821- while (walkerCmd.getThreadGroupIdXDimension () % tgDispatchSizeSelected != 0 ) {
822- tgDispatchSizeSelected /= 2 ;
823- }
824- } else if (walkerCmd.getThreadGroupIdYDimension () > 1 && walkerCmd.getThreadGroupIdZDimension () > 1 ) {
825- while (walkerCmd.getThreadGroupIdYDimension () % tgDispatchSizeSelected != 0 ) {
826- tgDispatchSizeSelected /= 2 ;
827- }
828- }
829-
830- auto workgroupCount = walkerCmd.getThreadGroupIdXDimension () * walkerCmd.getThreadGroupIdYDimension () * walkerCmd.getThreadGroupIdZDimension ();
831- auto tileCount = ImplicitScalingHelper::isImplicitScalingEnabled (device.getDeviceBitfield (), true ) ? device.getNumSubDevices () : 1u ;
832-
833- // make sure we fit all xe core
834- while (workgroupCount / tgDispatchSizeSelected < hwInfo.gtSystemInfo .MaxSubSlicesSupported * tileCount && tgDispatchSizeSelected > 1 ) {
835- tgDispatchSizeSelected /= 2 ;
836- }
837-
838- auto threadCountPerGrouping = tgDispatchSizeSelected * numberOfThreadsInThreadGroup;
839- // make sure we do not use more threads then present on each xe core
840- while (threadCountPerGrouping > threadsPerXeCore && tgDispatchSizeSelected > 1 ) {
841- tgDispatchSizeSelected /= 2 ;
842- threadCountPerGrouping /= 2 ;
843- }
844-
845- if (tgDispatchSizeSelected == 8 ) {
846- interfaceDescriptor.setThreadGroupDispatchSize (InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
847- } else if (tgDispatchSizeSelected == 1 ) {
848- interfaceDescriptor.setThreadGroupDispatchSize (InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
849- } else if (tgDispatchSizeSelected == 2 ) {
850- interfaceDescriptor.setThreadGroupDispatchSize (InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2);
851- } else {
852- interfaceDescriptor.setThreadGroupDispatchSize (InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4);
853- }
854- } else {
855- if (adjustTGDispatchSize) {
856- UNRECOVERABLE_IF (grfCount == 0u );
857- constexpr uint32_t maxThreadsInTGForTGDispatchSize8 = 16u ;
858- constexpr uint32_t maxThreadsInTGForTGDispatchSize4 = 32u ;
859- auto &gfxCoreHelper = device.getGfxCoreHelper ();
860- uint32_t availableThreadCount = gfxCoreHelper.calculateAvailableThreadCount (hwInfo, grfCount);
861- if (ImplicitScalingHelper::isImplicitScalingEnabled (device.getDeviceBitfield (), true )) {
862- const uint32_t tilesCount = device.getNumSubDevices ();
863- availableThreadCount *= tilesCount;
864- }
865- uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup ();
866- uint32_t dispatchedTotalThreadCount = numberOfThreadsInThreadGroup * threadGroupCount;
867- UNRECOVERABLE_IF (numberOfThreadsInThreadGroup == 0u );
868- auto tgDispatchSizeSelected = 1u ;
869-
870- if (dispatchedTotalThreadCount <= availableThreadCount) {
871- tgDispatchSizeSelected = 1 ;
872- } else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize8) {
873- tgDispatchSizeSelected = 8 ;
874- } else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize4) {
875- tgDispatchSizeSelected = 4 ;
876- } else {
877- tgDispatchSizeSelected = 2 ;
878- }
879- if (walkerCmd.getThreadGroupIdXDimension () > 1 && (walkerCmd.getThreadGroupIdYDimension () > 1 || walkerCmd.getThreadGroupIdZDimension () > 1 )) {
880- while (walkerCmd.getThreadGroupIdXDimension () % tgDispatchSizeSelected != 0 ) {
881- tgDispatchSizeSelected /= 2 ;
882- }
883- } else if (walkerCmd.getThreadGroupIdYDimension () > 1 && walkerCmd.getThreadGroupIdZDimension () > 1 ) {
884- while (walkerCmd.getThreadGroupIdYDimension () % tgDispatchSizeSelected != 0 ) {
885- tgDispatchSizeSelected /= 2 ;
886- }
887- }
888- if (tgDispatchSizeSelected == 8 ) {
889- interfaceDescriptor.setThreadGroupDispatchSize (InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
890- } else if (tgDispatchSizeSelected == 1 ) {
891- interfaceDescriptor.setThreadGroupDispatchSize (InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
892- } else if (tgDispatchSizeSelected == 2 ) {
893- interfaceDescriptor.setThreadGroupDispatchSize (InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2);
894- } else {
895- interfaceDescriptor.setThreadGroupDispatchSize (InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4);
896- }
897- }
898- }
899- }
900-
901- if (debugManager.flags .ForceThreadGroupDispatchSize .get () != -1 ) {
902- interfaceDescriptor.setThreadGroupDispatchSize (static_cast <typename InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE>(
903- debugManager.flags .ForceThreadGroupDispatchSize .get ()));
904- }
905- }
906-
907790template <typename Family>
908791size_t EncodeDispatchKernel<Family>::getSizeRequiredSsh(const KernelInfo &kernelInfo) {
909792 size_t requiredSshSize = kernelInfo.heapInfo .surfaceStateHeapSize ;
0 commit comments