Skip to content

Commit 157a1ea

Browse files
bgajdaINTCsys_zuul
authored andcommitted
Add pass to Split Indirect EE to sel to avoid VxH mov.
It's a separate pass, as it wants to capture result of LowerGEPIntoPrivMem. Heuristic currently handles calculating cost cases: - less than <4 x type> vec size - when EEI index is a multiply(+ add) of some ConstantInt as in below example. From: %268 = mul nuw i32 %res.i2.i, 3 %269 = extractelement <12 x float> %234, i32 %268 %274 = extractelement <12 x float> %198, i32 %268 %270 = add i32 %268, 1 %271 = extractelement <12 x float> %234, i32 %270 %275 = extractelement <12 x float> %198, i32 %270 %272 = add i32 %268, 2 %273 = extractelement <12 x float> %234, i32 %272 %276 = extractelement <12 x float> %198, i32 %272 To: %250 = icmp eq i32 %res.i2.i, i16 1 %251 = select i1 %250, float %206, float %200 %252 = select i1 %250, float %208, float %202 %253 = select i1 %250, float %210, float %204 %254 = select i1 %250, float %48, float %32 %255 = select i1 %250, float %49, float %33 %256 = select i1 %250, float %50, float %34 %257 = icmp eq i32 %res.i2.i, i16 2 %258 = select i1 %257, float %214, float %251 %259 = select i1 %257, float %215, float %252 %260 = select i1 %257, float %216, float %253 %261 = select i1 %257, float %64, float %254 %262 = select i1 %257, float %65, float %255 %263 = select i1 %257, float %66, float %256 Change-Id: Ibd4ec1da3b81f18cef6a405d5f599d86eb36c900
1 parent 4bc7cc1 commit 157a1ea

File tree

5 files changed

+218
-0
lines changed

5 files changed

+218
-0
lines changed

IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -551,6 +551,12 @@ static void AddLegalizationPasses(CodeGenContext& ctx, IGCPassManager& mpm, PSSi
551551
// Since we don't support switch statements, switch lowering is needed after the last CFG simplification
552552
mpm.add(llvm::createLowerSwitchPass());
553553

554+
// There's no particular reason for this exact place, but it should be after LowerGEPForPrivMem
555+
if (IGC_IS_FLAG_ENABLED(EnableSplitIndirectEEtoSel))
556+
{
557+
mpm.add(createSplitIndirectEEtoSelPass());
558+
}
559+
554560
// Split big vector & 3-element load/store, etc.
555561
mpm.add(createVectorPreProcessPass());
556562

IGC/Compiler/CustomSafeOptPass.cpp

Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ The passes are
4545
VectorBitCastOpt
4646
GenStrengthReduction
4747
FlattenSmallSwitch
48+
SplitIndirectEEtoSel
4849
4950
CustomSafeOptPass does peephole optimizations
5051
For example, reduce the alloca size so there is a chance to promote indexed temp.
@@ -69,6 +70,9 @@ GenStrengthReduction performs a fdiv optimization.
6970
FlattenSmallSwitch flatten the if/else or switch structure and use cmp+sel
7071
instead if the structure is small.
7172
73+
SplitIndirectEEtoSel splits extractelements that use a non-constant index on a very small vector into a series of
74+
cmp+sel to avoid expensive VxH mov.
75+
7276
=============================================================================*/
7377

7478
#include "Compiler/CustomSafeOptPass.hpp"
@@ -4460,6 +4464,210 @@ void FCmpPaternMatch::visitSelectInst(SelectInst& I)
44604464
IGC_INITIALIZE_PASS_BEGIN(FlattenSmallSwitch, "flattenSmallSwitch", "flattenSmallSwitch", false, false)
44614465
IGC_INITIALIZE_PASS_END(FlattenSmallSwitch, "flattenSmallSwitch", "flattenSmallSwitch", false, false)
44624466

4467+
4468+
4469+
/*======================== SplitIndirectEEtoSel =============================
4470+
4471+
This pass rewrites extractelement instructions that index a small vector with a non-constant index into a series of cmp+sel, to avoid a VxH mov.
4472+
before:
4473+
%268 = mul nuw i32 %res.i2.i, 3
4474+
%269 = extractelement <12 x float> %234, i32 %268
4475+
%270 = add i32 %268, 1
4476+
%271 = extractelement <12 x float> %234, i32 %270
4477+
%272 = add i32 %268, 2
4478+
%273 = extractelement <12 x float> %234, i32 %272
4479+
%274 = extractelement <12 x float> %198, i32 %268
4480+
%275 = extractelement <12 x float> %198, i32 %270
4481+
%276 = extractelement <12 x float> %198, i32 %272
4482+
4483+
after:
4484+
%250 = icmp eq i32 %res.i2.i, i16 1
4485+
%251 = select i1 %250, float %206, float %200
4486+
%252 = select i1 %250, float %208, float %202
4487+
%253 = select i1 %250, float %210, float %204
4488+
%254 = select i1 %250, float %48, float %32
4489+
%255 = select i1 %250, float %49, float %33
4490+
%256 = select i1 %250, float %50, float %34
4491+
%257 = icmp eq i32 %res.i2.i, i16 2
4492+
%258 = select i1 %257, float %214, float %251
4493+
%259 = select i1 %257, float %215, float %252
4494+
%260 = select i1 %257, float %216, float %253
4495+
%261 = select i1 %257, float %64, float %254
4496+
%262 = select i1 %257, float %65, float %255
4497+
%263 = select i1 %257, float %66, float %256
4498+
4499+
It is a bit similar to SimplifyConstant::isCmpSelProfitable for OCL, but not restricted to api.
4500+
And to GenSimplification::visitExtractElement() but not restricted to vec of 2, and later.
4501+
TODO: for known vectors check how many unique items there are.
4502+
===========================================================================*/
4503+
namespace {
4504+
class SplitIndirectEEtoSel : public FunctionPass, public llvm::InstVisitor<SplitIndirectEEtoSel>
4505+
{
4506+
public:
4507+
static char ID;
4508+
SplitIndirectEEtoSel() : FunctionPass(ID)
4509+
{
4510+
initializeSplitIndirectEEtoSelPass(*PassRegistry::getPassRegistry());
4511+
}
4512+
virtual llvm::StringRef getPassName() const { return "Split Indirect EE to ICmp Plus Sel"; }
4513+
virtual bool runOnFunction(Function& F);
4514+
void visitExtractElementInst(llvm::ExtractElementInst& I);
4515+
private:
4516+
bool isProfitableToSplit(uint64_t num, int64_t mul, int64_t add);
4517+
bool didSomething;
4518+
};
4519+
4520+
} // namespace
4521+
4522+
4523+
char SplitIndirectEEtoSel::ID = 0;
4524+
FunctionPass* IGC::createSplitIndirectEEtoSelPass() { return new SplitIndirectEEtoSel(); }
4525+
4526+
// Pass entry point: walks all instructions of F through the InstVisitor
// dispatch and reports whether any extractelement was rewritten.
bool SplitIndirectEEtoSel::runOnFunction(Function& F)
{
    // The flag is per function; clear whatever an earlier run left behind.
    didSomething = false;

    // Dispatches to visitExtractElementInst(), which raises the flag on change.
    visit(F);

    return didSomething;
}
4532+
4533+
bool SplitIndirectEEtoSel::isProfitableToSplit(uint64_t num, int64_t mul, int64_t add)
4534+
{
4535+
/* Assumption:
4536+
Pass is profitable when: (X * cmp + Y * sel) < (ExecSize * mov VxH).
4537+
*/
4538+
4539+
const int64_t assumedVXHCost = IGC_GET_FLAG_VALUE(SplitIndirectEEtoSelThreshold);
4540+
int64_t possibleCost = 0;
4541+
4542+
/* for: extractelement <4 x float> , %index
4543+
cost is (4 - 1) * (icmp + sel) = 6;
4544+
*/
4545+
possibleCost = ((int64_t)num -1) * 2;
4546+
if (possibleCost < assumedVXHCost)
4547+
return true;
4548+
4549+
/* for: extractelement <12 x float> , (mul %real_index, 3)
4550+
cost is ((12/3) - 1) * (icmp + sel) = 6;
4551+
*/
4552+
4553+
if (mul > 0) // not tested negative options
4554+
{
4555+
int64_t differentOptions = 1 + ((int64_t)num - 1) / mul; // ceil(num/mul)
4556+
possibleCost = (differentOptions - 1) * 2;
4557+
4558+
if (possibleCost < assumedVXHCost)
4559+
return true;
4560+
}
4561+
4562+
return false;
4563+
}
4564+
4565+
// Rewrites one extractelement with a non-constant index into a chain of
// `icmp eq` + `select` over constant-index extracts, inserted right after I.
// All uses of I are redirected to the last select; I itself is left in place
// (dead) for a later cleanup pass.
//
// A constant scale/offset in the index computation (mul+add, or shl+or) is
// stripped first so that only the reachable elements are compared against the
// underlying index value.
void SplitIndirectEEtoSel::visitExtractElementInst(llvm::ExtractElementInst& I)
{
    using namespace llvm::PatternMatch;

    VectorType* vecTy = I.getVectorOperandType();
    const uint64_t num = vecTy->getNumElements();
    Type* eleType = vecTy->getElementType();

    Value* vec = I.getVectorOperand();
    Value* index = I.getIndexOperand();

    // ignore constant index
    if (isa<ConstantInt>(index))
    {
        return;
    }

    // ignore others for now (did not yet evaluate perf. impact)
    if (!(eleType->isIntegerTy(32) || eleType->isFloatTy()))
    {
        return;
    }

    // used to calculate offsets: original index == index * mul + add
    int64_t add = 0;
    int64_t mul = 1;

    /* strip mul/add from index calculation and remember it for later:
       %268 = mul nuw i32 %res.i2.i, 3
       %270 = add i32 %268, 1
       %271 = extractelement <12 x float> %234, i32 %270
    */
    // NOTE(review): the rewrite assumes the mul/shl does not wrap (the motivating
    // IR carries `nuw`); the no-wrap flags are not checked here - confirm.
    Value* Val1 = nullptr;
    ConstantInt* ci_add = nullptr;
    ConstantInt* ci_mul = nullptr;

    auto pat1 = m_Add(m_Mul(m_Value(Val1), m_ConstantInt(ci_mul)), m_ConstantInt(ci_add));
    auto pat2 = m_Mul(m_Value(Val1), m_ConstantInt(ci_mul));
    // Some code shows `shl+or` instead of mul+add.
    auto pat21 = m_Or(m_Shl(m_Value(Val1), m_ConstantInt(ci_mul)), m_ConstantInt(ci_add));
    auto pat22 = m_Shl(m_Value(Val1), m_ConstantInt(ci_mul));

    if (match(index, pat1) || match(index, pat2))
    {
        const int64_t scale = ci_mul->getSExtValue();
        // A multiply by zero makes the index independent of Val1; comparing
        // Val1 against element numbers would be wrong, so do not strip it.
        if (scale != 0)
        {
            add = ci_add ? ci_add->getSExtValue() : 0;
            mul = scale;
            index = Val1;
        }
    }
    else if (match(index, pat21) || match(index, pat22))
    {
        // Clamp the shift amount so that `1 << shift` stays defined in int64_t.
        const uint64_t shift = ci_mul->getLimitedValue(63);
        const int64_t orBits = ci_add ? ci_add->getSExtValue() : 0;
        if (shift < 63)
        {
            const int64_t scale = int64_t(1) << shift;
            // `or` behaves like `add` only when the constant fits entirely in
            // the low bits that the shift cleared.
            if (orBits >= 0 && orBits < scale)
            {
                add = orBits;
                mul = scale;
                index = Val1;
            }
        }
    }

    if (!isProfitableToSplit(num, mul, add))
        return;

    IRBuilder<> builder(I.getNextNode());

    // The compare constant must have the type of the (possibly stripped)
    // index, which is not necessarily i32.
    Type* cmpTy = index->getType();

    Value* vTemp = llvm::UndefValue::get(eleType);
    // Per-instruction flag: the pass-wide didSomething may already be set by
    // an earlier extractelement and must not decide this replacement.
    bool generated = false;

    // Generate combinations
    for (uint64_t elemIndex = 0; elemIndex < num; elemIndex++)
    {
        // Solve elemIndex == cmpIndex * mul + add for cmpIndex. Elements with
        // no integral solution can never be selected, such as:
        //   icmp eq (add (mul %index, 3), 2), 1
        //   icmp eq (mul %index, 3), 1
        int64_t cmpIndex = (int64_t)elemIndex - add;
        if ((cmpIndex % mul) != 0)
            continue;
        cmpIndex /= mul;

        // Those 2 might be different, when cmp gets altered by its operands, but EE index stays the same
        Value* cmpIndexC = llvm::ConstantInt::get(cmpTy, (uint64_t)cmpIndex, /*isSigned*/ true);
        ConstantInt* eeiIndexCI = llvm::ConstantInt::get(builder.getInt32Ty(), (uint64_t)elemIndex);

        Value* cmp = builder.CreateICmp(CmpInst::Predicate::ICMP_EQ, index, cmpIndexC);
        Value* subcaseEE = builder.CreateExtractElement(vec, eeiIndexCI);
        vTemp = builder.CreateSelect(cmp, subcaseEE, vTemp);
        generated = true;
    }

    // Replace only when this instruction actually got a select chain;
    // otherwise vTemp is still undef and I must stay untouched.
    if (generated)
    {
        I.replaceAllUsesWith(vTemp);
        didSomething = true;
    }
}
4666+
4667+
4668+
// Pass registration for SplitIndirectEEtoSel; presumably this is what provides
// initializeSplitIndirectEEtoSelPass(), which the pass constructor calls.
IGC_INITIALIZE_PASS_BEGIN(SplitIndirectEEtoSel, "SplitIndirectEEtoSel", "SplitIndirectEEtoSel", false, false)
4669+
IGC_INITIALIZE_PASS_END(SplitIndirectEEtoSel, "SplitIndirectEEtoSel", "SplitIndirectEEtoSel", false, false)
4670+
44634671
////////////////////////////////////////////////////////////////////////
44644672
// LogicalAndToBranch trying to find logical AND like below:
44654673
// res = simpleCond0 && complexCond1

IGC/Compiler/CustomSafeOptPass.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,7 @@ namespace IGC
250250
llvm::FunctionPass* createGenStrengthReductionPass();
251251
llvm::FunctionPass* createNanHandlingPass();
252252
llvm::FunctionPass* createFlattenSmallSwitchPass();
253+
llvm::FunctionPass* createSplitIndirectEEtoSelPass();
253254
llvm::FunctionPass* createIGCIndirectICBPropagaionPass();
254255
llvm::FunctionPass* createBlendToDiscardPass();
255256
llvm::FunctionPass* createMarkReadOnlyLoadPass();

IGC/Compiler/InitializePasses.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ void initializeGenUpdateCBPass(llvm::PassRegistry&);
6868
void initializeGenStrengthReductionPass(llvm::PassRegistry&);
6969
void initializeNanHandlingPass(llvm::PassRegistry&);
7070
void initializeFlattenSmallSwitchPass(llvm::PassRegistry&);
71+
void initializeSplitIndirectEEtoSelPass(llvm::PassRegistry&);
7172
void initializeGenXFunctionGroupAnalysisPass(llvm::PassRegistry&);
7273
void initializeGenXCodeGenModulePass(llvm::PassRegistry&);
7374
void initializeEstimateFunctionSizePass(llvm::PassRegistry&);

IGC/common/igc_flags.def

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,8 @@ DECLARE_IGC_REGKEY(bool, EnableSimplifyGEP, true, "Enable IGC to si
135135
DECLARE_IGC_REGKEY(bool, DisableCustomUnsafeOpt, false, "Disable IGC to run custom unsafe optimizations", false)
136136
DECLARE_IGC_REGKEY(bool, EnableFastMath, false, "Enable fast math optimizations in IGC", false)
137137
DECLARE_IGC_REGKEY(bool, DisableFlattenSmallSwitch, false, "Disable the flatten small switch pass", false)
138+
DECLARE_IGC_REGKEY(bool, EnableSplitIndirectEEtoSel, true, "Enable the split indirect extractelement to icmp+sel pass", false)
139+
DECLARE_IGC_REGKEY(bool, SplitIndirectEEtoSelThreshold, 8, "Split indirect extractelement cost threshold", false)
138140
DECLARE_IGC_REGKEY(bool, DisableImmConstantOpt, false, "Disable IGC IndirectICBPropagaion optimization", false)
139141
DECLARE_IGC_REGKEY(DWORD,MaxImmConstantSizePushed, 256, "Set the max size of immediate constant buffer pushed", false)
140142
DECLARE_IGC_REGKEY(bool, EnableCustomLoopVersioning, true, "Enable IGC to do custom loop versioning", false)

0 commit comments

Comments
 (0)