Skip to content

Commit 701ec74

Browse files
committed
[VectorCombine] New folding pattern for extract/binop/shuffle chains
Resolves #144654 Part of #143088 This adds a new `foldShuffleChainsToReduce` fold for horizontal reduction of patterns like: ```llvm define i16 @test_reduce_v8i16(<8 x i16> %a0) local_unnamed_addr #0 { %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison> %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1) %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3) %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5) %7 = extractelement <8 x i16> %6, i64 0 ret i16 %7 } ``` ...which can be reduced to a llvm.vector.reduce.umin.v8i16(%a0) intrinsic call. Similar transformations are applied for other ops when the cost model permits.
1 parent ff68f71 commit 701ec74

File tree

3 files changed

+504
-0
lines changed

3 files changed

+504
-0
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ class VectorCombine {
135135
bool foldShuffleOfIntrinsics(Instruction &I);
136136
bool foldShuffleToIdentity(Instruction &I);
137137
bool foldShuffleFromReductions(Instruction &I);
138+
bool foldShuffleChainsToReduce(Instruction &I);
138139
bool foldCastFromReductions(Instruction &I);
139140
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
140141
bool foldInterleaveIntrinsics(Instruction &I);
@@ -3129,6 +3130,179 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
31293130
return MadeChanges;
31303131
}
31313132

3133+
/// Try to fold a log2(N)-deep chain of pairwise shuffle + min/max intrinsic
/// calls feeding `extractelement <N x Ty> %v, 0` into a single
/// llvm.vector.reduce.{umin,umax,smin,smax} call, when TTI says the reduction
/// intrinsic is cheaper than the shuffle/op ladder.
///
/// The expected pattern (for each level L, starting at half-width N/2):
///   %s = shufflevector %p, poison, <L, L+1, ..., 2L-1, -1, ...>
///   %p' = @llvm.<op>(%p, %s)
/// terminating in `extractelement %p_last, 0`.
bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
  auto *EEI = dyn_cast<ExtractElementInst>(&I);
  if (!EEI)
    return false;

  // Worklist of chain values still to be validated, walked from the final
  // extractelement back towards the original source vector.
  std::queue<Value *> InstWorklist;
  Value *InitEEV = nullptr;
  Intrinsic::ID CommonOp = Intrinsic::not_intrinsic;

  bool IsFirstCallInst = true;
  // Alternates between expecting a call (the min/max) and a shuffle at each
  // step of the chain.
  bool ShouldBeCallInst = true;

  // PrevVecV[0]/[1]: the two operands of the last matched call (shuffle
  // operand normalized into slot [1]); PrevVecV[2]: the root source vector.
  SmallVector<Value *, 3> PrevVecV(3, nullptr);
  int64_t ShuffleMaskHalf = -1, ExpectedShuffleMaskHalf = 1;
  int64_t VecSize = -1;

  // Only the lane-0 extract of a fixed-width vector can terminate the chain.
  Value *VecOp;
  if (!match(&I, m_ExtractElt(m_Value(VecOp), m_Zero())))
    return false;

  auto *FVT = dyn_cast<FixedVectorType>(VecOp->getType());
  if (!FVT)
    return false;

  VecSize = FVT->getNumElements();
  // The halving ladder only covers the whole vector when the element count is
  // a power of two: ExpectedShuffleMaskHalf doubles each level and must land
  // exactly on VecSize, and Log2_64(VecSize) below assumes pow2. A plain
  // evenness check (VecSize % 2) would accept e.g. 6 and miscount the levels.
  if (VecSize < 2 || !isPowerOf2_64(VecSize))
    return false;

  ShuffleMaskHalf = 1;
  PrevVecV[2] = VecOp;
  InitEEV = EEI;

  InstWorklist.push(PrevVecV[2]);

  while (!InstWorklist.empty()) {
    Value *V = InstWorklist.front();
    InstWorklist.pop();

    auto *CI = dyn_cast<Instruction>(V);
    if (!CI)
      return false;

    if (auto *CallI = dyn_cast<CallInst>(CI)) {
      if (!ShouldBeCallInst || !PrevVecV[2])
        return false;

      if (!IsFirstCallInst &&
          any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
        return false;

      // Each call must be exactly the value the previous level produced.
      if (CallI != (IsFirstCallInst ? PrevVecV[2] : PrevVecV[0]))
        return false;
      IsFirstCallInst = false;

      auto *II = dyn_cast<IntrinsicInst>(CallI);
      if (!II)
        return false;

      // All calls in the chain must be the same min/max intrinsic.
      if (CommonOp == Intrinsic::not_intrinsic)
        CommonOp = II->getIntrinsicID();
      if (II->getIntrinsicID() != CommonOp)
        return false;

      switch (II->getIntrinsicID()) {
      case Intrinsic::umin:
      case Intrinsic::umax:
      case Intrinsic::smin:
      case Intrinsic::smax: {
        auto *Op0 = CallI->getOperand(0);
        auto *Op1 = CallI->getOperand(1);
        PrevVecV[0] = Op0;
        PrevVecV[1] = Op1;
        break;
      }
      default:
        return false;
      }
      ShouldBeCallInst ^= 1;

      // Normalize so the shuffle operand (if any) sits in PrevVecV[1].
      if (!isa<ShuffleVectorInst>(PrevVecV[1]))
        std::swap(PrevVecV[0], PrevVecV[1]);
      InstWorklist.push(PrevVecV[1]);
      InstWorklist.push(PrevVecV[0]);
    } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
      if (ShouldBeCallInst ||
          any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
        return false;

      if (SVInst != PrevVecV[1])
        return false;

      // The shuffle must read from the same vector that feeds the call's
      // other operand.
      auto *ShuffleVec = SVInst->getOperand(0);
      if (!ShuffleVec || ShuffleVec != PrevVecV[0])
        return false;

      SmallVector<int> CurMask;
      SVInst->getShuffleMask(CurMask);

      if (ShuffleMaskHalf != ExpectedShuffleMaskHalf)
        return false;
      ExpectedShuffleMaskHalf *= 2;

      // Mask must move the upper half of the active lanes down
      // (<H, H+1, ..., 2H-1>) and leave the rest undefined (-1 / poison).
      for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
        if (Mask < ShuffleMaskHalf && CurMask[Mask] != ShuffleMaskHalf + Mask)
          return false;
        if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
          return false;
      }
      ShuffleMaskHalf *= 2;
      // Chain fully matched once the halving ladder covers the whole vector.
      if (ExpectedShuffleMaskHalf == VecSize)
        break;
      ShouldBeCallInst ^= 1;
    } else {
      return false;
    }
  }

  if (ShouldBeCallInst)
    return false;

  assert(VecSize != -1 && ExpectedShuffleMaskHalf == VecSize &&
         "Expected Match for Vector Size and Mask Half");

  Value *FinalVecV = PrevVecV[0];
  auto *FinalVecVTy = dyn_cast<FixedVectorType>(FinalVecV->getType());

  if (!InitEEV || !FinalVecV)
    return false;

  assert(FinalVecVTy && "Expected non-null value for Vector Type");

  Intrinsic::ID ReducedOp = Intrinsic::not_intrinsic;
  switch (CommonOp) {
  case Intrinsic::umin:
    ReducedOp = Intrinsic::vector_reduce_umin;
    break;
  case Intrinsic::umax:
    ReducedOp = Intrinsic::vector_reduce_umax;
    break;
  case Intrinsic::smin:
    ReducedOp = Intrinsic::vector_reduce_smin;
    break;
  case Intrinsic::smax:
    ReducedOp = Intrinsic::vector_reduce_smax;
    break;
  default:
    return false;
  }

  // Original cost: one shuffle + one min/max per halving level, plus the
  // final lane-0 extract.
  InstructionCost OrigCost = 0;
  unsigned int NumLevels = Log2_64(VecSize);

  for (unsigned int Level = 0; Level < NumLevels; ++Level) {
    OrigCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
                                   FinalVecVTy, FinalVecVTy);
    // FIXME(review): ICmp is not an arithmetic opcode; this approximates the
    // per-level min/max intrinsic cost. Consider getIntrinsicInstrCost on
    // CommonOp (or getCmpSelInstrCost) for a more faithful estimate.
    OrigCost += TTI.getArithmeticInstrCost(Instruction::ICmp, FinalVecVTy);
  }
  OrigCost += TTI.getVectorInstrCost(Instruction::ExtractElement, FinalVecVTy,
                                     CostKind, 0);

  IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});
  InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind);

  // Only transform when the single reduction intrinsic is strictly cheaper.
  if (NewCost >= OrigCost)
    return false;

  auto *ReducedResult =
      Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV});
  replaceValue(*InitEEV, *ReducedResult);

  return true;
}
3305+
31323306
/// Determine if its more efficient to fold:
31333307
/// reduce(trunc(x)) -> trunc(reduce(x)).
31343308
/// reduce(sext(x)) -> sext(reduce(x)).
@@ -4216,6 +4390,9 @@ bool VectorCombine::run() {
42164390
if (foldCastFromReductions(I))
42174391
return true;
42184392
break;
4393+
case Instruction::ExtractElement:
4394+
MadeChange |= foldShuffleChainsToReduce(I);
4395+
break;
42194396
case Instruction::ICmp:
42204397
case Instruction::FCmp:
42214398
if (foldExtractExtract(I))

0 commit comments

Comments
 (0)