Skip to content

Commit 7e054c3

Browse files
committed
[VectorCombine] foldShuffleOfCastops - don't restrict to oneuse but compare total costs instead
Some casts (especially bitcasts, but others as well) are incredibly cheap (or free), so don't limit the shuffle(cast(x),cast(y)) -> cast(shuffle(x,y)) fold to one-use cases. Instead, compare the total before/after costs, where the "after" cost also includes any original cast that has other uses and therefore must be kept alongside the new cast.
1 parent 5c40e56 commit 7e054c3

File tree

2 files changed

+30
-7
lines changed

2 files changed

+30
-7
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1491,8 +1491,7 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
14911491
bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
14921492
Value *V0, *V1;
14931493
ArrayRef<int> OldMask;
1494-
if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_OneUse(m_Value(V1)),
1495-
m_Mask(OldMask))))
1494+
if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
14961495
return false;
14971496

14981497
auto *C0 = dyn_cast<CastInst>(V0);
@@ -1551,11 +1550,13 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
15511550
// Try to replace a castop with a shuffle if the shuffle is not costly.
15521551
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
15531552

1554-
InstructionCost OldCost =
1553+
InstructionCost CostC0 =
15551554
TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
1556-
TTI::CastContextHint::None, CostKind) +
1555+
TTI::CastContextHint::None, CostKind);
1556+
InstructionCost CostC1 =
15571557
TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
15581558
TTI::CastContextHint::None, CostKind);
1559+
InstructionCost OldCost = CostC0 + CostC1;
15591560
OldCost +=
15601561
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, CastDstTy,
15611562
OldMask, CostKind, 0, nullptr, std::nullopt, &I);
@@ -1564,6 +1565,10 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
15641565
TargetTransformInfo::SK_PermuteTwoSrc, CastSrcTy, NewMask, CostKind);
15651566
NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
15661567
TTI::CastContextHint::None, CostKind);
1568+
if (!C0->hasOneUse())
1569+
NewCost += CostC0;
1570+
if (!C1->hasOneUse())
1571+
NewCost += CostC1;
15671572

15681573
LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
15691574
<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost

llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -224,14 +224,14 @@ define <16 x i16> @concat_bitcast_v4i32_v16i16(<4 x i32> %a0, <4 x i32> %a1) {
224224
ret <16 x i16> %r
225225
}
226226

227-
; negative - multiuse
227+
; multiuse - ensure the cost of any duplicated casts is worth it
228228

229229
define <8 x i16> @concat_trunc_v4i32_v8i16_multiuse(<4 x i32> %a0, <4 x i32> %a1, ptr %a2) {
230230
; CHECK-LABEL: define <8 x i16> @concat_trunc_v4i32_v8i16_multiuse(
231231
; CHECK-SAME: <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]], ptr [[A2:%.*]]) #[[ATTR0]] {
232232
; CHECK-NEXT: [[X0:%.*]] = trunc <4 x i32> [[A0]] to <4 x i16>
233-
; CHECK-NEXT: [[X1:%.*]] = trunc <4 x i32> [[A1]] to <4 x i16>
234-
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i16> [[X0]], <4 x i16> [[X1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
233+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A0]], <4 x i32> [[A1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
234+
; CHECK-NEXT: [[R:%.*]] = trunc <8 x i32> [[TMP1]] to <8 x i16>
235235
; CHECK-NEXT: store <4 x i16> [[X0]], ptr [[A2]], align 8
236236
; CHECK-NEXT: ret <8 x i16> [[R]]
237237
;
@@ -242,6 +242,24 @@ define <8 x i16> @concat_trunc_v4i32_v8i16_multiuse(<4 x i32> %a0, <4 x i32> %a1
242242
ret <8 x i16> %r
243243
}
244244

245+
; negative - multiuse - ensure cost of any duplicated casts are worth it
246+
247+
define <16 x i8> @concat_trunc_v8i64_v16i8_multiuse(<8 x i64> %a0, <8 x i64> %a1, ptr %a2) {
248+
; CHECK-LABEL: define <16 x i8> @concat_trunc_v8i64_v16i8_multiuse(
249+
; CHECK-SAME: <8 x i64> [[A0:%.*]], <8 x i64> [[A1:%.*]], ptr [[A2:%.*]]) #[[ATTR0]] {
250+
; CHECK-NEXT: [[X0:%.*]] = trunc <8 x i64> [[A0]] to <8 x i8>
251+
; CHECK-NEXT: [[X1:%.*]] = trunc <8 x i64> [[A1]] to <8 x i8>
252+
; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i8> [[X0]], <8 x i8> [[X1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 4, i32 15>
253+
; CHECK-NEXT: store <8 x i8> [[X0]], ptr [[A2]], align 8
254+
; CHECK-NEXT: ret <16 x i8> [[R]]
255+
;
256+
%x0 = trunc <8 x i64> %a0 to <8 x i8>
257+
%x1 = trunc <8 x i64> %a1 to <8 x i8>
258+
%r = shufflevector <8 x i8> %x0, <8 x i8> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 4, i32 15>
259+
store <8 x i8> %x0, ptr %a2
260+
ret <16 x i8> %r
261+
}
262+
245263
; negative - bitcasts (unscalable higher element count)
246264

247265
define <16 x i16> @revpair_bitcast_v4i32_v16i16(<4 x i32> %a0, <4 x i32> %a1) {

0 commit comments

Comments
 (0)