Skip to content

Commit e5de625

Browse files
admitric authored and sys_zuul committed
Force thread group size option added
Change-Id: Icd2858bc6227308d0562ba17849c4e2fdc4875c9
1 parent ea969d0 commit e5de625

File tree

7 files changed

+160
-18
lines changed

7 files changed

+160
-18
lines changed

IGC/Compiler/CISACodeGen/ComputeShaderCodeGen.cpp

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -46,6 +46,8 @@ namespace IGC
4646
, m_hasSLM(false)
4747
, m_tileY(false)
4848
, m_walkOrder(WO_XYZ)
49+
, m_threadGroupModifier_X(0)
50+
, m_threadGroupModifier_Y(0)
4951
{
5052
}
5153

@@ -321,6 +323,9 @@ namespace IGC
321323

322324
pKernelProgram->ThreadGroupSize = m_threadGroupSize;
323325

326+
pKernelProgram->ThreadGroupModifier_X = m_threadGroupModifier_X;
327+
pKernelProgram->ThreadGroupModifier_Y = m_threadGroupModifier_Y;
328+
324329
pKernelProgram->CSHThreadDispatchChannel = 0;
325330

326331
pKernelProgram->CompiledForIndirectPayload = 0;
@@ -351,6 +356,15 @@ namespace IGC
351356
m_threadGroupSize_Z = int_cast<uint>(llvm::cast<llvm::ConstantInt>(pGlobal->getInitializer())->getZExtValue());
352357

353358
m_threadGroupSize = m_threadGroupSize_X * m_threadGroupSize_Y * m_threadGroupSize_Z;
359+
360+
pGlobal = module->getGlobalVariable("ThreadGroupModifier_X");
361+
if ((pGlobal != nullptr) && pGlobal->hasInitializer()) {
362+
m_threadGroupModifier_X = int_cast<uint>(llvm::cast<llvm::ConstantInt>(pGlobal->getInitializer())->getZExtValue());
363+
}
364+
pGlobal = module->getGlobalVariable("ThreadGroupModifier_Y");
365+
if ((pGlobal != nullptr) && pGlobal->hasInitializer()) {
366+
m_threadGroupModifier_Y = int_cast<uint>(llvm::cast<llvm::ConstantInt>(pGlobal->getInitializer())->getZExtValue());
367+
}
354368
}
355369

356370
void CComputeShader::PreCompile()

IGC/Compiler/CISACodeGen/ComputeShaderCodeGen.hpp

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -66,6 +66,9 @@ namespace IGC
6666
bool m_dispatchAlongY;
6767
bool m_disableMidThreadPreemption;
6868
bool m_hasSLM;
69+
70+
uint m_threadGroupModifier_X;
71+
uint m_threadGroupModifier_Y;
6972
private:
7073
enum WALK_ORDER {
7174
WO_XYZ,

IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1422,8 +1422,11 @@ void OptimizeIR(CodeGenContext* const pContext)
14221422
mpm.add(createIPConstantPropagationPass());
14231423
}
14241424

1425+
14251426
// enable this only when Pooled EU is not supported
1426-
if (IGC_IS_FLAG_ENABLED(EnableThreadCombiningOpt) &&
1427+
if ((IGC_IS_FLAG_ENABLED(EnableThreadCombiningOpt) ||
1428+
IGC_IS_FLAG_ENABLED(EnableForceThreadCombining) ||
1429+
IGC_IS_FLAG_ENABLED(EnableForceGroupSize)) &&
14271430
(pContext->type == ShaderType::COMPUTE_SHADER) &&
14281431
!pContext->platform.supportPooledEU() &&
14291432
pContext->platform.supportsThreadCombining())

IGC/Compiler/CodeGenPublic.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -532,6 +532,9 @@ namespace IGC
532532

533533
bool DispatchAlongY;
534534

535+
unsigned int ThreadGroupModifier_X;
536+
unsigned int ThreadGroupModifier_Y;
537+
535538
/* Output related to only the PingPong Textures */
536539
bool SecondCompile;
537540
bool IsRowMajor;

IGC/Compiler/ThreadCombining.cpp

Lines changed: 127 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,6 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
2323
2424
2525
======================= end_copyright_notice ==================================*/
26-
2726
#include "GenISAIntrinsics/GenIntrinsicInst.h"
2827
#include "ThreadCombining.hpp"
2928
#include "Compiler/IGCPassSupport.h"
@@ -366,16 +365,15 @@ bool ThreadCombining::canDoOptimization(Function* m_kernel, llvm::Module& M)
366365
unsigned int threadGroupSize_Y = GetthreadGroupSize(M, ThreadGroupSize_Y);
367366
unsigned int threadGroupSize_Z = GetthreadGroupSize(M, ThreadGroupSize_Z);
368367

369-
if (threadGroupSize_X == 1 ||
370-
threadGroupSize_Y == 1 ||
371-
threadGroupSize_Z != 1)
372-
{
373-
return false;
374-
}
375-
376368
std::vector<llvm::Instruction*> barriers;
377369
PreAnalysis(m_kernel, M, barriers);
378370

371+
// Explicit thread group size shrinking works only for no-barrier no-SLM case
372+
if (IGC_IS_FLAG_ENABLED(EnableForceGroupSize))
373+
{
374+
return barriers.empty() && !m_SLMUsed && (threadGroupSize_Z == 1);
375+
}
376+
379377
PostDominatorTree* PDT = &getAnalysis<PostDominatorTreeWrapperPass>(*m_kernel).getPostDomTree();
380378
// Check if any of the barriers are within control flow
381379
bool anyBarrierWithinControlFlow = false;
@@ -387,13 +385,23 @@ bool ThreadCombining::canDoOptimization(Function* m_kernel, llvm::Module& M)
387385
}
388386
}
389387

390-
if ((!m_SLMUsed && IGC_IS_FLAG_DISABLED(EnableThreadCombiningWithNoSLM)) ||
391-
anyBarrierWithinControlFlow)
388+
if (anyBarrierWithinControlFlow)
392389
{
393390
return false;
394391
}
395392

396-
FindRegistersAliveAcrossBarriers(m_kernel, M);
393+
if (threadGroupSize_X == 1 ||
394+
threadGroupSize_Y == 1 ||
395+
threadGroupSize_Z != 1)
396+
{
397+
return false;
398+
}
399+
400+
if (!m_SLMUsed && IGC_IS_FLAG_DISABLED(EnableThreadCombiningWithNoSLM)
401+
&& !IGC_IS_FLAG_ENABLED(EnableForceThreadCombining))
402+
{
403+
return false;
404+
}
397405

398406
return true;
399407
}
@@ -621,6 +629,83 @@ void ThreadCombining::CreateNewKernel(llvm::Module& M,
621629
}
622630
}
623631

632+
// Remap ThreadIDs and GroupIDs to old values
633+
void ThreadCombining::remapThreads(
634+
llvm::Module& M,
635+
unsigned int newSizeX,
636+
unsigned int newSizeY,
637+
unsigned int threadGroupSize_X,
638+
unsigned int threadGroupSize_Y,
639+
llvm::IRBuilder<> builder)
640+
{
641+
unsigned int threadGroupSizeModifier_X = threadGroupSize_X / newSizeX;
642+
unsigned int threadGroupSizeModifier_Y = threadGroupSize_Y / newSizeY;
643+
644+
BasicBlock* oldEntry = &(m_kernel->getEntryBlock());
645+
BasicBlock* newEntry = BasicBlock::Create(M.getContext(), "ThreadID_remap", m_kernel, oldEntry);
646+
647+
builder.SetInsertPoint(newEntry);
648+
649+
Function* ThreadIDFN = GenISAIntrinsic::getDeclaration(&M, GenISAIntrinsic::GenISA_DCL_SystemValue, builder.getFloatTy());
650+
Value* threadID_X = builder.CreateCall(ThreadIDFN, builder.getInt32(THREAD_ID_IN_GROUP_X));
651+
Value* threadID_Y = builder.CreateCall(ThreadIDFN, builder.getInt32(THREAD_ID_IN_GROUP_Y));
652+
Value* groupID_X = builder.CreateCall(ThreadIDFN, builder.getInt32(THREAD_GROUP_ID_X));
653+
Value* groupID_Y = builder.CreateCall(ThreadIDFN, builder.getInt32(THREAD_GROUP_ID_Y));
654+
655+
threadID_X = builder.CreateBitCast(threadID_X, builder.getInt32Ty());
656+
threadID_Y = builder.CreateBitCast(threadID_Y, builder.getInt32Ty());
657+
groupID_X = builder.CreateBitCast(groupID_X, builder.getInt32Ty());
658+
groupID_Y = builder.CreateBitCast(groupID_Y, builder.getInt32Ty());
659+
660+
Value* oldGroupID_X = builder.CreateUDiv(groupID_X, builder.getInt32(threadGroupSizeModifier_X));
661+
Value* oldGroupID_Y = builder.CreateUDiv(groupID_Y, builder.getInt32(threadGroupSizeModifier_Y));
662+
663+
Value* oldThreadID_X = builder.CreateURem(groupID_X, builder.getInt32(threadGroupSizeModifier_X));
664+
oldThreadID_X = builder.CreateAdd(threadID_X, builder.CreateMul(builder.getInt32(newSizeX), oldThreadID_X));
665+
Value* oldThreadID_Y = builder.CreateURem(groupID_Y, builder.getInt32(threadGroupSizeModifier_Y));
666+
oldThreadID_Y = builder.CreateAdd(threadID_Y, builder.CreateMul(builder.getInt32(newSizeY), oldThreadID_Y));
667+
668+
for (auto& BI : *m_kernel)
669+
{
670+
for (auto& inst : BI)
671+
{
672+
if (&BI == newEntry)
673+
{
674+
continue;
675+
}
676+
if (GenIntrinsicInst * b = dyn_cast<GenIntrinsicInst>(&inst))
677+
{
678+
if (b->getIntrinsicID() == GenISAIntrinsic::GenISA_DCL_SystemValue)
679+
{
680+
switch (cast<ConstantInt>(b->getOperand(0))->getZExtValue())
681+
{
682+
case THREAD_ID_IN_GROUP_X:
683+
b->replaceAllUsesWith(builder.CreateBitCast(oldThreadID_X, b->getType()));
684+
break;
685+
case THREAD_ID_IN_GROUP_Y:
686+
b->replaceAllUsesWith(builder.CreateBitCast(oldThreadID_Y, b->getType()));
687+
break;
688+
case THREAD_GROUP_ID_X:
689+
b->replaceAllUsesWith(builder.CreateBitCast(oldGroupID_X, b->getType()));
690+
break;
691+
case THREAD_GROUP_ID_Y:
692+
b->replaceAllUsesWith(builder.CreateBitCast(oldGroupID_Y, b->getType()));
693+
break;
694+
default:
695+
break;
696+
}
697+
}
698+
}
699+
}
700+
}
701+
builder.CreateBr(oldEntry);
702+
703+
// Set in global variable, how many times thread group size was reduced
704+
// It will be used by UMD for increasing dispatch size in the same amount
705+
M.getGlobalVariable("ThreadGroupModifier_X")->setInitializer(builder.getInt32(threadGroupSizeModifier_X));
706+
M.getGlobalVariable("ThreadGroupModifier_Y")->setInitializer(builder.getInt32(threadGroupSizeModifier_Y));
707+
}
708+
624709
bool ThreadCombining::runOnModule(llvm::Module& M)
625710
{
626711
llvm::IRBuilder<> builder(M.getContext());
@@ -635,6 +720,8 @@ bool ThreadCombining::runOnModule(llvm::Module& M)
635720
return false;
636721
}
637722

723+
FindRegistersAliveAcrossBarriers(m_kernel, M);
724+
638725
unsigned int threadGroupSize_X = GetthreadGroupSize(M, ThreadGroupSize_X);
639726
unsigned int threadGroupSize_Y = GetthreadGroupSize(M, ThreadGroupSize_Y);
640727

@@ -680,15 +767,20 @@ bool ThreadCombining::runOnModule(llvm::Module& M)
680767

681768
unsigned int newSizeX = threadGroupSize_X;
682769
unsigned int newSizeY = threadGroupSize_Y;
683-
// Heuristic for Threadcombining based on EU Occupancy, if EU occupancy increases with the new
684-
// size then combine threads, otherwise skip it
685-
if (IGC_IS_FLAG_ENABLED(EnableForceGroupSize))
770+
if (IGC_IS_FLAG_ENABLED(EnableForceGroupSize) || IGC_IS_FLAG_ENABLED(EnableForceThreadCombining))
686771
{
772+
if (IGC_GET_FLAG_VALUE(ForceGroupSizeShaderHash) &&
773+
(IGC_GET_FLAG_VALUE(ForceGroupSizeShaderHash) != (DWORD)csCtx->hash.getAsmHash()))
774+
{
775+
return false;
776+
}
687777
newSizeX = IGC_GET_FLAG_VALUE(ForceGroupSizeX);
688778
newSizeY = IGC_GET_FLAG_VALUE(ForceGroupSizeY);
689779
}
690780
else if (x * y >= minTGSizeHeuristic && newThreadOccupancy > currentThreadOccupancy)
691781
{
782+
// Heuristic for Threadcombining based on EU Occupancy, if EU occupancy increases with the new
783+
// size then combine threads, otherwise skip it
692784
newSizeX = x;
693785
newSizeY = y;
694786
currentThreadOccupancy = newThreadOccupancy;
@@ -706,12 +798,31 @@ bool ThreadCombining::runOnModule(llvm::Module& M)
706798
return false;
707799
}
708800

709-
IGC_ASSERT(newSizeX <= threadGroupSize_X);
710-
IGC_ASSERT(newSizeY <= threadGroupSize_Y);
801+
if ((newSizeX > threadGroupSize_X) ||
802+
(newSizeY > threadGroupSize_Y) ||
803+
((threadGroupSize_X % newSizeX) != 0) ||
804+
((threadGroupSize_Y % newSizeY) != 0))
805+
{
806+
return false;
807+
}
711808

712809
SetthreadGroupSize(M, builder.getInt32(newSizeX), ThreadGroupSize_X);
713810
SetthreadGroupSize(M, builder.getInt32(newSizeY), ThreadGroupSize_Y);
714811

812+
if (IGC_IS_FLAG_ENABLED(EnableForceGroupSize))
813+
{
814+
// Don't perform thread combining, just remap threads as if thread group size hasn't been changed
815+
remapThreads(
816+
M,
817+
newSizeX,
818+
newSizeY,
819+
threadGroupSize_X,
820+
threadGroupSize_Y,
821+
builder);
822+
return true;
823+
}
824+
825+
// Perform Thread Combining
715826
// Create a new function with function arguments, New threadIDX, threadIDY,
716827
// a bool variable to indicate if it is kernel section before last barrier or after
717828
// last barrier and all the live variables
@@ -759,6 +870,5 @@ bool ThreadCombining::runOnModule(llvm::Module& M)
759870
builder);
760871

761872
context->m_threadCombiningOptDone = true;
762-
763873
return true;
764874
}

IGC/Compiler/ThreadCombining.hpp

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -87,6 +87,13 @@ namespace IGC
8787
bool isSLMUsed(llvm::Instruction* I) const;
8888
unsigned int GetthreadGroupSize(llvm::Module& M, dim dimension);
8989
void SetthreadGroupSize(llvm::Module& M, llvm::Constant* size, dim dimension);
90+
// Inserts a "ThreadID_remap" entry block into the kernel that reconstructs the
// thread IDs and group IDs as they were before the thread group was shrunk
// from threadGroupSize_X x threadGroupSize_Y to newSizeX x newSizeY, redirects
// the kernel's system-value reads to them, and stores the per-dimension shrink
// factors in the ThreadGroupModifier_X / ThreadGroupModifier_Y module globals.
void remapThreads(llvm::Module& M,
91+
unsigned int newSizeX,
92+
unsigned int newSizeY,
93+
unsigned int threadGroupSize_X,
94+
unsigned int threadGroupSize_Y,
95+
llvm::IRBuilder<> builder);
96+
9097
void CreateLoopKernel(
9198
llvm::Module& M,
9299
unsigned int newSizeX,

IGC/common/igc_flags.def

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -337,6 +337,8 @@ DECLARE_IGC_REGKEY(bool, EnableStackCallFuncCall, false, "If enabled, the
337337
DECLARE_IGC_REGKEY(DWORD, OCLInlineThreshold, 512, "Setting OCL inline thershold", false)
338338
DECLARE_IGC_REGKEY(bool, DisableAddingAlwaysAttribute, false, "Disable adding always attribute", true)
339339
DECLARE_IGC_REGKEY(bool, EnableForceGroupSize, false, "Enable forcing thread Group Size ForceGroupSizeX and ForceGroupSizeY", false)
340+
DECLARE_IGC_REGKEY(bool, EnableForceThreadCombining, false, "Enable forcing Thread Combining with thread Group Size ForceGroupSizeX and ForceGroupSizeY", false)
341+
DECLARE_IGC_REGKEY(DWORD, ForceGroupSizeShaderHash, 0, "Shader hash for forcing thread group size or thread combining (lower 8 hex digits)", false)
340342
DECLARE_IGC_REGKEY(DWORD, ForceGroupSizeX, 8, "force group size along X", false)
341343
DECLARE_IGC_REGKEY(DWORD, ForceGroupSizeY, 8, "force group size along Y", false)
342344
DECLARE_IGC_REGKEY(bool, EnableThreadCombiningWithNoSLM, false, "Enable thread combining opt for shader without SLM", false)

0 commit comments

Comments
 (0)