@@ -45,6 +45,7 @@ The passes are
4545 VectorBitCastOpt
4646 GenStrengthReduction
4747 FlattenSmallSwitch
48+ SplitIndirectEEtoSel
4849
4950CustomSafeOptPass does peephole optimizations
5051For example, reduce the alloca size so there is a chance to promote indexed temp.
@@ -69,6 +70,9 @@ GenStrengthReduction performs a fdiv optimization.
6970FlattenSmallSwitch flattens the if/else or switch structure and uses cmp+sel
7071instead if the structure is small.
7172
73+ SplitIndirectEEtoSel splits variable-index extractelements on very small vectors
74+ into a series of cmp+sel to avoid an expensive VxH mov.
75+
7276=============================================================================*/
7377
7478#include " Compiler/CustomSafeOptPass.hpp"
@@ -4460,6 +4464,210 @@ void FCmpPaternMatch::visitSelectInst(SelectInst& I)
44604464IGC_INITIALIZE_PASS_BEGIN (FlattenSmallSwitch, " flattenSmallSwitch" , " flattenSmallSwitch" , false , false )
44614465IGC_INITIALIZE_PASS_END(FlattenSmallSwitch, " flattenSmallSwitch" , " flattenSmallSwitch" , false , false )
44624466
4467+
4468+
4469+ /* ======================== SplitIndirectEEtoSel =============================
4470+
4471+ This class changes variable-index extractelement on small vectors into a series of cmp+sel to avoid VxH mov.
4472+ before:
4473+ %268 = mul nuw i32 %res.i2.i, 3
4474+ %269 = extractelement <12 x float> %234, i32 %268
4475+ %270 = add i32 %268, 1
4476+ %271 = extractelement <12 x float> %234, i32 %270
4477+ %272 = add i32 %268, 2
4478+ %273 = extractelement <12 x float> %234, i32 %272
4479+ %274 = extractelement <12 x float> %198, i32 %268
4480+ %275 = extractelement <12 x float> %198, i32 %270
4481+ %276 = extractelement <12 x float> %198, i32 %272
4482+
4483+ after:
4484+ %250 = icmp eq i32 %res.i2.i, 1
4485+ %251 = select i1 %250, float %206, float %200
4486+ %252 = select i1 %250, float %208, float %202
4487+ %253 = select i1 %250, float %210, float %204
4488+ %254 = select i1 %250, float %48, float %32
4489+ %255 = select i1 %250, float %49, float %33
4490+ %256 = select i1 %250, float %50, float %34
4491+ %257 = icmp eq i32 %res.i2.i, 2
4492+ %258 = select i1 %257, float %214, float %251
4493+ %259 = select i1 %257, float %215, float %252
4494+ %260 = select i1 %257, float %216, float %253
4495+ %261 = select i1 %257, float %64, float %254
4496+ %262 = select i1 %257, float %65, float %255
4497+ %263 = select i1 %257, float %66, float %256
4498+
4499+ It is similar to SimplifyConstant::isCmpSelProfitable for OCL, but is not restricted to a single API.
4500+ It is also similar to GenSimplification::visitExtractElement(), but is not restricted to vectors of 2 elements and runs later.
4501+ TODO: for known vectors, check how many unique items there are.
4502+ ===========================================================================*/
4503+ namespace {
4504+ class SplitIndirectEEtoSel : public FunctionPass , public llvm ::InstVisitor<SplitIndirectEEtoSel>
4505+ {
4506+ public:
4507+ static char ID;
4508+ SplitIndirectEEtoSel () : FunctionPass(ID)
4509+ {
4510+ initializeSplitIndirectEEtoSelPass (*PassRegistry::getPassRegistry ());
4511+ }
4512+ virtual llvm::StringRef getPassName () const { return " Split Indirect EE to ICmp Plus Sel" ; }
4513+ virtual bool runOnFunction (Function& F);
4514+ void visitExtractElementInst (llvm::ExtractElementInst& I);
4515+ private:
4516+ bool isProfitableToSplit (uint64_t num, int64_t mul, int64_t add);
4517+ bool didSomething;
4518+ };
4519+
4520+ } // namespace
4521+
4522+
4523+ char SplitIndirectEEtoSel::ID = 0 ;
4524+ FunctionPass* IGC::createSplitIndirectEEtoSelPass () { return new SplitIndirectEEtoSel (); }
4525+
4526+ bool SplitIndirectEEtoSel::runOnFunction (Function& F)
4527+ {
4528+ didSomething = false ;
4529+ visit (F);
4530+ return didSomething;
4531+ }
4532+
4533+ bool SplitIndirectEEtoSel::isProfitableToSplit (uint64_t num, int64_t mul, int64_t add)
4534+ {
4535+ /* Assumption:
4536+ Pass is profitable when: (X * cmp + Y * sel) < (ExecSize * mov VxH).
4537+ */
4538+
4539+ const int64_t assumedVXHCost = IGC_GET_FLAG_VALUE (SplitIndirectEEtoSelThreshold);
4540+ int64_t possibleCost = 0 ;
4541+
4542+ /* for: extractelement <4 x float> , %index
4543+ cost is (4 - 1) * (icmp + sel) = 6;
4544+ */
4545+ possibleCost = ((int64_t )num -1 ) * 2 ;
4546+ if (possibleCost < assumedVXHCost)
4547+ return true ;
4548+
4549+ /* for: extractelement <12 x float> , (mul %real_index, 3)
4550+ cost is ((12/3) - 1) * (icmp + sel) = 6;
4551+ */
4552+
4553+ if (mul > 0 ) // not tested negative options
4554+ {
4555+ int64_t differentOptions = 1 + ((int64_t )num - 1 ) / mul; // ceil(num/mul)
4556+ possibleCost = (differentOptions - 1 ) * 2 ;
4557+
4558+ if (possibleCost < assumedVXHCost)
4559+ return true ;
4560+ }
4561+
4562+ return false ;
4563+ }
4564+
4565+ void SplitIndirectEEtoSel::visitExtractElementInst (llvm::ExtractElementInst& I)
4566+ {
4567+ using namespace llvm ::PatternMatch;
4568+
4569+ VectorType* vecTy = I.getVectorOperandType ();
4570+ uint64_t num = vecTy->getNumElements ();
4571+ Type* eleType = vecTy->getElementType ();
4572+
4573+ Value* vec = I.getVectorOperand ();
4574+ Value* index = I.getIndexOperand ();
4575+
4576+ // ignore constant index
4577+ if (dyn_cast<ConstantInt>(index))
4578+ {
4579+ return ;
4580+ }
4581+
4582+ // ignore others for now (did not yet evaluate perf. impact)
4583+ if (!(eleType->isIntegerTy (32 ) || eleType->isFloatTy ()))
4584+ {
4585+ return ;
4586+ }
4587+
4588+ // used to calculate offsets
4589+ int64_t add = 0 ;
4590+ int64_t mul = 1 ;
4591+
4592+
4593+ /* strip mul/add from index calculation and remember it for later:
4594+ %268 = mul nuw i32 %res.i2.i, 3
4595+ %270 = add i32 %268, 1
4596+ %271 = extractelement <12 x float> %234, i32 %270
4597+ */
4598+ Value* Val1 = nullptr ;
4599+ ConstantInt* ci_add = nullptr ;
4600+ ConstantInt* ci_mul = nullptr ;
4601+
4602+ auto pat1 = m_Add (m_Mul (m_Value (Val1), m_ConstantInt (ci_mul)), m_ConstantInt (ci_add));
4603+ auto pat2 = m_Mul (m_Value (Val1), m_ConstantInt (ci_mul));
4604+ // Some code shows `shl+or` instead of mul+add.
4605+ auto pat21 = m_Or (m_Shl (m_Value (Val1), m_ConstantInt (ci_mul)), m_ConstantInt (ci_add));
4606+ auto pat22 = m_Shl (m_Value (Val1), m_ConstantInt (ci_mul));
4607+
4608+ if (match (index, pat1) || match (index, pat2))
4609+ {
4610+ add = ci_add ? ci_add->getSExtValue () : 0 ;
4611+ mul = ci_mul ? ci_mul->getSExtValue () : 1 ;
4612+ index = Val1;
4613+ }
4614+ else if (match (index, pat21) || match (index, pat22))
4615+ {
4616+ add = ci_add ? ci_add->getSExtValue () : 0 ;
4617+ mul = ci_mul ? (1LL << ci_mul->getSExtValue ()) : 1LL ;
4618+ index = Val1;
4619+ }
4620+
4621+ if (!isProfitableToSplit (num, mul, add))
4622+ return ;
4623+
4624+ Value* vTemp = llvm::UndefValue::get (eleType);
4625+ IRBuilder<> builder (I.getNextNode ());
4626+
4627+ // returns true if we can skip this icmp, such as:
4628+ // icmp eq (add (mul %index, 3), 2), 1
4629+ // icmp eq (mul %index, 3), 1
4630+ auto canSafelySkipThis = [&](int64_t add, int64_t mul, int64_t & newIndex) {
4631+ if (mul)
4632+ {
4633+ newIndex -= add;
4634+ if ((newIndex % mul) != 0 )
4635+ return true ;
4636+ newIndex = newIndex / mul;
4637+ }
4638+ return false ;
4639+ };
4640+
4641+ // Generate combinations
4642+ for (uint64_t elemIndex = 0 ; elemIndex < num; elemIndex++)
4643+ {
4644+ int64_t cmpIndex = elemIndex;
4645+
4646+ if (canSafelySkipThis (add, mul, cmpIndex))
4647+ continue ;
4648+
4649+ // Those 2 might be different, when cmp will get altered by it's operands, but EE index stays the same
4650+ ConstantInt* cmpIndexCI = llvm::ConstantInt::get (builder.getInt32Ty (), (uint64_t )cmpIndex);
4651+ ConstantInt* eeiIndexCI = llvm::ConstantInt::get (builder.getInt32Ty (), (uint64_t )elemIndex);
4652+
4653+ Value* cmp = builder.CreateICmp (CmpInst::Predicate::ICMP_EQ, index, cmpIndexCI);
4654+ Value* subcaseEE = builder.CreateExtractElement (vec, eeiIndexCI);
4655+ Value* sel = builder.CreateSelect (cmp, subcaseEE, vTemp);
4656+ vTemp = sel;
4657+ didSomething = true ;
4658+ }
4659+
4660+ // In theory there's no situation where we don't do something up to this point.
4661+ if (didSomething)
4662+ {
4663+ I.replaceAllUsesWith (vTemp);
4664+ }
4665+ }
4666+
4667+
4668+ IGC_INITIALIZE_PASS_BEGIN (SplitIndirectEEtoSel, " SplitIndirectEEtoSel" , " SplitIndirectEEtoSel" , false , false )
4669+ IGC_INITIALIZE_PASS_END(SplitIndirectEEtoSel, " SplitIndirectEEtoSel" , " SplitIndirectEEtoSel" , false , false )
4670+
44634671// //////////////////////////////////////////////////////////////////////
44644672// LogicalAndToBranch trying to find logical AND like below:
44654673// res = simpleCond0 && complexCond1
0 commit comments