1414
1515#include " AMDGPU.h"
1616#include " AMDGPUTargetMachine.h"
17+ #include " AMDGPUTargetTransformInfo.h"
1718#include " llvm/Analysis/AssumptionCache.h"
1819#include " llvm/Analysis/UniformityAnalysis.h"
1920#include " llvm/Analysis/ValueTracking.h"
@@ -45,6 +46,7 @@ class AMDGPULateCodeGenPrepare
4546 Function &F;
4647 const DataLayout &DL;
4748 const GCNSubtarget &ST;
49+ const TargetTransformInfo &TTI;
4850
4951 AssumptionCache *const AC;
5052 UniformityInfo &UA;
@@ -53,8 +55,9 @@ class AMDGPULateCodeGenPrepare
5355
5456public:
5557 AMDGPULateCodeGenPrepare (Function &F, const GCNSubtarget &ST,
56- AssumptionCache *AC, UniformityInfo &UA)
57- : F(F), DL(F.getDataLayout()), ST(ST), AC(AC), UA(UA) {}
58+ const TargetTransformInfo &TTI, AssumptionCache *AC,
59+ UniformityInfo &UA)
60+ : F(F), DL(F.getDataLayout()), ST(ST), TTI(TTI), AC(AC), UA(UA) {}
5861 bool run ();
5962 bool visitInstruction (Instruction &) { return false ; }
6063
@@ -75,6 +78,8 @@ class LiveRegOptimizer {
7578 Module &Mod;
7679 const DataLayout &DL;
7780 const GCNSubtarget &ST;
81+ const TargetTransformInfo &TTI;
82+
7883 // / The scalar type to convert to
7984 Type *const ConvertToScalar;
8085 // / The set of visited Instructions
@@ -125,8 +130,41 @@ class LiveRegOptimizer {
125130 return LK.first != TargetLoweringBase::TypeLegal;
126131 }
127132
128- LiveRegOptimizer (Module &Mod, const GCNSubtarget &ST)
129- : Mod(Mod), DL(Mod.getDataLayout()), ST(ST),
133+ // Filtering based on operation or its cost.
134+ // If an operation incurs high enough cost or natively work on
135+ // vector of illegal type, ie. v2i8, then it makes sense to try
136+ // to avoid scalarizing across BB.
137+ bool shouldReplaceBasedOnOp (Instruction *II) {
138+ // Ignore pseudos
139+ if (II->isDebugOrPseudoInst ())
140+ return false ;
141+
142+ // Instruction Cost
143+ const auto Cost = TTI.getInstructionCost (
144+ II, TargetTransformInfo::TargetCostKind::TCK_SizeAndLatency);
145+ LLVM_DEBUG (dbgs () << " shouldReplaceBasedOnOp: " << *II << " Cost=" << Cost
146+ << ' \n ' ;);
147+ if (Cost >= 8 )
148+ return true ;
149+
150+ // Intrinsics - assume they natively handle illegal type
151+ if (dyn_cast<IntrinsicInst>(II))
152+ return true ;
153+
154+ // Stores
155+ if (dyn_cast<StoreInst>(II))
156+ return true ;
157+
158+ // Shuffles
159+ if (dyn_cast<ShuffleVectorInst>(II))
160+ return true ;
161+
162+ return false ;
163+ }
164+
165+ LiveRegOptimizer (Module &Mod, const GCNSubtarget &ST,
166+ const TargetTransformInfo &TTI)
167+ : Mod(Mod), DL(Mod.getDataLayout()), ST(ST), TTI(TTI),
130168 ConvertToScalar (Type::getInt32Ty(Mod.getContext())) {}
131169};
132170
@@ -140,7 +178,7 @@ bool AMDGPULateCodeGenPrepare::run() {
140178 // vectors to equivalent vectors of legal type (which are converted back
141179 // before uses in subsequent blocks), to pack the bits into fewer physical
142180 // registers (used in CopyToReg/CopyFromReg pairs).
143- LiveRegOptimizer LRO (*F.getParent (), ST);
181+ LiveRegOptimizer LRO (*F.getParent (), ST, TTI );
144182
145183 bool Changed = false ;
146184
@@ -259,6 +297,9 @@ bool LiveRegOptimizer::optimizeLiveType(
259297 if (!shouldReplace (II->getType ()))
260298 continue ;
261299
300+ if (!shouldReplaceBasedOnOp (II))
301+ continue ;
302+
262303 if (PHINode *Phi = dyn_cast<PHINode>(II)) {
263304 PhiNodes.insert (Phi);
264305 // Collect all the incoming values of problematic PHI nodes.
@@ -478,11 +519,12 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
478519PreservedAnalyses
479520AMDGPULateCodeGenPreparePass::run (Function &F, FunctionAnalysisManager &FAM) {
480521 const GCNSubtarget &ST = TM.getSubtarget <GCNSubtarget>(F);
522+ const TargetTransformInfo &TTI = TM.getTargetTransformInfo (F);
481523
482524 AssumptionCache &AC = FAM.getResult <AssumptionAnalysis>(F);
483525 UniformityInfo &UI = FAM.getResult <UniformityInfoAnalysis>(F);
484526
485- bool Changed = AMDGPULateCodeGenPrepare (F, ST, &AC, UI).run ();
527+ bool Changed = AMDGPULateCodeGenPrepare (F, ST, TTI, &AC, UI).run ();
486528
487529 if (!Changed)
488530 return PreservedAnalyses::all ();
@@ -518,13 +560,14 @@ bool AMDGPULateCodeGenPrepareLegacy::runOnFunction(Function &F) {
518560 const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
519561 const TargetMachine &TM = TPC.getTM <TargetMachine>();
520562 const GCNSubtarget &ST = TM.getSubtarget <GCNSubtarget>(F);
563+ const TargetTransformInfo &TTI = TM.getTargetTransformInfo (F);
521564
522565 AssumptionCache &AC =
523566 getAnalysis<AssumptionCacheTracker>().getAssumptionCache (F);
524567 UniformityInfo &UI =
525568 getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo ();
526569
527- return AMDGPULateCodeGenPrepare (F, ST, &AC, UI).run ();
570+ return AMDGPULateCodeGenPrepare (F, ST, TTI, &AC, UI).run ();
528571}
529572
530573INITIALIZE_PASS_BEGIN (AMDGPULateCodeGenPrepareLegacy, DEBUG_TYPE,
0 commit comments