1414
1515#include " AMDGPU.h"
1616#include " AMDGPUTargetMachine.h"
17+ #include " AMDGPUTargetTransformInfo.h"
1718#include " llvm/Analysis/AssumptionCache.h"
1819#include " llvm/Analysis/UniformityAnalysis.h"
1920#include " llvm/Analysis/ValueTracking.h"
@@ -45,6 +46,7 @@ class AMDGPULateCodeGenPrepare
4546 Function &F;
4647 const DataLayout &DL;
4748 const GCNSubtarget &ST;
49+ const TargetTransformInfo &TTI;
4850
4951 AssumptionCache *const AC;
5052 UniformityInfo &UA;
@@ -53,8 +55,9 @@ class AMDGPULateCodeGenPrepare
5355
5456public:
// Constructor: keeps references to the function, the function's DataLayout
// (obtained via F.getDataLayout()), the subtarget and the uniformity info,
// plus the AssumptionCache pointer. The added (+) form of the signature and
// initializer list additionally takes and stores the TargetTransformInfo
// reference. The body is empty; no other work is done here.
5557 AMDGPULateCodeGenPrepare (Function &F, const GCNSubtarget &ST,
58+ const TargetTransformInfo &TTI,
5659 AssumptionCache *AC, UniformityInfo &UA)
57- : F(F), DL(F.getDataLayout()), ST(ST), AC(AC), UA(UA) {}
60+ : F(F), DL(F.getDataLayout()), ST(ST), TTI(TTI), AC(AC), UA(UA) {}
5861 bool run ();
5962 bool visitInstruction (Instruction &) { return false ; }
6063
@@ -75,6 +78,8 @@ class LiveRegOptimizer {
7578 Module &Mod;
7679 const DataLayout &DL;
7780 const GCNSubtarget &ST;
81+ const TargetTransformInfo &TTI;
82+
7883 // / The scalar type to convert to
7984 Type *const ConvertToScalar;
8085 // / The set of visited Instructions
@@ -125,8 +130,43 @@ class LiveRegOptimizer {
125130 return LK.first != TargetLoweringBase::TypeLegal;
126131 }
127132
128- LiveRegOptimizer (Module &Mod, const GCNSubtarget &ST)
129- : Mod(Mod), DL(Mod.getDataLayout()), ST(ST),
133+ // Filtering based on operation or its cost.
134+ // If an operation incurs high enough cost or natively work on
135+ // vector of illegal type, ie. v2i8, then it makes sense to try
136+ // to avoid scalarizing across BB.
137+ bool shouldReplaceBasedOnOp (Instruction *II) {
138+ // Ignore pseudos
139+ if (II->isDebugOrPseudoInst ())
140+ return false ;
141+
142+ // Instruction Cost
143+ const auto Cost = TTI.getInstructionCost (II,
144+ TargetTransformInfo::TargetCostKind::TCK_SizeAndLatency);
145+ LLVM_DEBUG (
146+ dbgs () << " shouldReplaceBasedOnOp: " <<
147+ *II << " Cost=" << Cost << ' \n ' ;
148+ );
149+ if (Cost >= 8 )
150+ return true ;
151+
152+ // Intrinsics - assume they natively handle illegal type
153+ if (dyn_cast<IntrinsicInst>(II))
154+ return true ;
155+
156+ // Stores
157+ if (dyn_cast<StoreInst>(II))
158+ return true ;
159+
160+ // Shuffles
161+ if (dyn_cast<ShuffleVectorInst>(II))
162+ return true ;
163+
164+ return false ;
165+ }
166+
// Constructor: keeps references to the module, the module's DataLayout
// (obtained via Mod.getDataLayout()), the subtarget and the
// TargetTransformInfo, and sets ConvertToScalar to the 32-bit integer type
// of the module's LLVMContext. The body is empty.
167+ LiveRegOptimizer (Module &Mod, const GCNSubtarget &ST,
168+ const TargetTransformInfo &TTI)
169+ : Mod(Mod), DL(Mod.getDataLayout()), ST(ST), TTI(TTI),
130170 ConvertToScalar (Type::getInt32Ty(Mod.getContext())) {}
131171};
132172
@@ -140,7 +180,7 @@ bool AMDGPULateCodeGenPrepare::run() {
140180 // vectors to equivalent vectors of legal type (which are converted back
141181 // before uses in subsequent blocks), to pack the bits into fewer physical
142182 // registers (used in CopyToReg/CopyFromReg pairs).
143- LiveRegOptimizer LRO (*F.getParent (), ST);
183+ LiveRegOptimizer LRO (*F.getParent (), ST, TTI );
144184
145185 bool Changed = false ;
146186
@@ -259,6 +299,9 @@ bool LiveRegOptimizer::optimizeLiveType(
259299 if (!shouldReplace (II->getType ()))
260300 continue ;
261301
302+ if (!shouldReplaceBasedOnOp (II))
303+ continue ;
304+
262305 if (PHINode *Phi = dyn_cast<PHINode>(II)) {
263306 PhiNodes.insert (Phi);
264307 // Collect all the incoming values of problematic PHI nodes.
@@ -478,11 +521,12 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
478521PreservedAnalyses
479522AMDGPULateCodeGenPreparePass::run (Function &F, FunctionAnalysisManager &FAM) {
480523 const GCNSubtarget &ST = TM.getSubtarget <GCNSubtarget>(F);
524+ const TargetTransformInfo &TTI = TM.getTargetTransformInfo (F);
481525
482526 AssumptionCache &AC = FAM.getResult <AssumptionAnalysis>(F);
483527 UniformityInfo &UI = FAM.getResult <UniformityInfoAnalysis>(F);
484528
485- bool Changed = AMDGPULateCodeGenPrepare (F, ST, &AC, UI).run ();
529+ bool Changed = AMDGPULateCodeGenPrepare (F, ST, TTI, &AC, UI).run ();
486530
487531 if (!Changed)
488532 return PreservedAnalyses::all ();
@@ -518,13 +562,14 @@ bool AMDGPULateCodeGenPrepareLegacy::runOnFunction(Function &F) {
518562 const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
519563 const TargetMachine &TM = TPC.getTM <TargetMachine>();
520564 const GCNSubtarget &ST = TM.getSubtarget <GCNSubtarget>(F);
565+ const TargetTransformInfo &TTI = TM.getTargetTransformInfo (F);
521566
522567 AssumptionCache &AC =
523568 getAnalysis<AssumptionCacheTracker>().getAssumptionCache (F);
524569 UniformityInfo &UI =
525570 getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo ();
526571
527- return AMDGPULateCodeGenPrepare (F, ST, &AC, UI).run ();
572+ return AMDGPULateCodeGenPrepare (F, ST, TTI, &AC, UI).run ();
528573}
529574
530575INITIALIZE_PASS_BEGIN (AMDGPULateCodeGenPrepareLegacy, DEBUG_TYPE,
0 commit comments