Commit 079746d

[SLP]Better cost estimation for masked gather or "clustered" loads.
After landing support for actual vectorization of "clustered" loads, we need a better cost comparison between masked gathers and clustered loads. This includes estimating the address-calculation cost and improving the estimate for the gathered loads themselves. The estimation now also respects the SLPCostThreshold option, so the compiler's behavior can be tuned.

Reviewers: RKSimon
Reviewed By: RKSimon
Pull Request: llvm#105858
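The decision the commit message describes boils down to a threshold comparison: the masked-gather form is kept only if its estimated cost beats the gather (scalar loads + inserts) estimate by at least the SLP cost threshold. Below is a minimal standalone sketch of that comparison shape, using simplified stand-ins for LLVM's InstructionCost type and the SLPCostThreshold option (the cl::opt behind -slp-threshold); it is an illustration of the rule in the diff below, not the vectorizer's actual API.

```cpp
// Standalone sketch with assumed names, not LLVM's API: the comparison shape
// the patch uses when choosing between a masked gather and plain gathered
// loads.
#include <cstdint>
#include <iostream>

using InstructionCost = std::int64_t; // stand-in for llvm::InstructionCost

// Stand-in for the SLPCostThreshold option; 0 mirrors the usual default,
// negative values make vectorization more aggressive.
static int SLPCostThreshold = 0;

// Returns true when the masked gather is not clearly cheaper than the
// gather-of-scalars estimate, so the caller keeps the node as a gather and
// re-estimates it later (as loads + shuffles).
bool keepAsGather(InstructionCost MaskedGatherCost, InstructionCost GatherCost) {
  return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
}

int main() {
  // Example: masked gather estimated at 12, gather-of-scalars at 10.
  std::cout << std::boolalpha << keepAsGather(12, 10) << '\n'; // prints "true"
  return 0;
}
```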
1 parent c49770c · commit 079746d

6 files changed: +257 additions, -126 deletions

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 147 additions & 78 deletions
@@ -4820,105 +4820,173 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
       }
     }
   }
-  auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) {
+  // Correctly identify compare the cost of loads + shuffles rather than
+  // strided/masked gather loads. Returns true if vectorized + shuffles
+  // representation is better than just gather.
+  auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
+                                                bool ProfitableGatherPointers) {
+    // Compare masked gather cost and loads + insert subvector costs.
+    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+    auto [ScalarGEPCost, VectorGEPCost] =
+        getGEPCosts(TTI, PointerOps, PointerOps.front(),
+                    Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
+    // Estimate the cost of masked gather GEP. If not a splat, roughly
+    // estimate as a buildvector, otherwise estimate as splat.
+    APInt DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
+    VectorType *PtrVecTy =
+        getWidenedType(PointerOps.front()->getType()->getScalarType(),
+                       VecTy->getNumElements());
+    if (static_cast<unsigned>(count_if(
+            PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
+        any_of(PointerOps, [&](Value *V) {
+          return getUnderlyingObject(V) !=
+                 getUnderlyingObject(PointerOps.front());
+        }))
+      VectorGEPCost += TTI.getScalarizationOverhead(
+          PtrVecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
+    else
+      VectorGEPCost +=
+          TTI.getScalarizationOverhead(
+              PtrVecTy, APInt::getOneBitSet(VecTy->getNumElements(), 0),
+              /*Insert=*/true, /*Extract=*/false, CostKind) +
+          ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, std::nullopt,
+                           CostKind);
+    // The cost of scalar loads.
+    InstructionCost ScalarLoadsCost =
+        std::accumulate(VL.begin(), VL.end(), InstructionCost(),
+                        [&](InstructionCost C, Value *V) {
+                          return C + TTI.getInstructionCost(
+                                         cast<Instruction>(V), CostKind);
+                        }) +
+        ScalarGEPCost;
+    // The cost of masked gather.
+    InstructionCost MaskedGatherCost =
+        TTI.getGatherScatterOpCost(
+            Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
+            /*VariableMask=*/false, CommonAlignment, CostKind) +
+        (ProfitableGatherPointers ? 0 : VectorGEPCost);
+    InstructionCost GatherCost =
+        TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
+                                     /*Extract=*/false, CostKind) +
+        ScalarLoadsCost;
+    // The list of loads is small or perform partial check already - directly
+    // compare masked gather cost and gather cost.
+    constexpr unsigned ListLimit = 4;
+    if (!TryRecursiveCheck || VL.size() < ListLimit)
+      return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
     unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
-    unsigned MinVF = getMinVF(Sz);
-    unsigned MaxVF = std::max<unsigned>(bit_floor(VL.size() / 2), MinVF);
-    MaxVF = std::min(getMaximumVF(Sz, Instruction::Load), MaxVF);
-    for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
-      unsigned VectorizedCnt = 0;
+    unsigned MinVF = getMinVF(2 * Sz);
+    DemandedElts.clearAllBits();
+    // Iterate through possible vectorization factors and check if vectorized +
+    // shuffles is better than just gather.
+    for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) {
       SmallVector<LoadsState> States;
-      for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
-           Cnt += VF, ++VectorizedCnt) {
+      for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
         ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
         SmallVector<unsigned> Order;
         SmallVector<Value *> PointerOps;
         LoadsState LS =
             canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
                               /*TryRecursiveCheck=*/false);
         // Check that the sorted loads are consecutive.
-        if (LS == LoadsState::Gather)
-          break;
+        if (LS == LoadsState::Gather) {
+          DemandedElts.setBits(Cnt, Cnt + VF);
+          continue;
+        }
         // If need the reorder - consider as high-cost masked gather for now.
         if ((LS == LoadsState::Vectorize ||
              LS == LoadsState::StridedVectorize) &&
             !Order.empty() && !isReverseOrder(Order))
           LS = LoadsState::ScatterVectorize;
         States.push_back(LS);
       }
+      if (DemandedElts.isAllOnes())
+        // All loads gathered - try smaller VF.
+        continue;
+      InstructionCost ScalarVFGEPCost = 0;
       // Can be vectorized later as a serie of loads/insertelements.
-      if (VectorizedCnt == VL.size() / VF) {
-        // Compare masked gather cost and loads + insersubvector costs.
-        TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-        auto [ScalarGEPCost, VectorGEPCost] =
-            getGEPCosts(TTI, PointerOps, PointerOps.front(),
-                        Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
-        InstructionCost MaskedGatherCost =
-            TTI.getGatherScatterOpCost(Instruction::Load, VecTy,
-                                       cast<LoadInst>(VL0)->getPointerOperand(),
-                                       /*VariableMask=*/false, CommonAlignment,
-                                       CostKind) +
-            VectorGEPCost - ScalarGEPCost;
-        InstructionCost VecLdCost = 0;
-        auto *SubVecTy = getWidenedType(ScalarTy, VF);
-        for (auto [I, LS] : enumerate(States)) {
-          auto *LI0 = cast<LoadInst>(VL[I * VF]);
-          switch (LS) {
-          case LoadsState::Vectorize: {
-            auto [ScalarGEPCost, VectorGEPCost] =
-                getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
-                            LI0->getPointerOperand(), Instruction::Load,
-                            CostKind, ScalarTy, SubVecTy);
-            VecLdCost += TTI.getMemoryOpCost(
-                             Instruction::Load, SubVecTy, LI0->getAlign(),
-                             LI0->getPointerAddressSpace(), CostKind,
-                             TTI::OperandValueInfo()) +
-                         VectorGEPCost - ScalarGEPCost;
-            break;
-          }
-          case LoadsState::StridedVectorize: {
-            auto [ScalarGEPCost, VectorGEPCost] =
-                getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
-                            LI0->getPointerOperand(), Instruction::Load,
-                            CostKind, ScalarTy, SubVecTy);
-            VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
-                                                    LI0->getPointerOperand(),
-                                                    /*VariableMask=*/false,
-                                                    CommonAlignment, CostKind) +
-                         VectorGEPCost - ScalarGEPCost;
-            break;
-          }
-          case LoadsState::ScatterVectorize: {
-            auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
-                TTI, ArrayRef(PointerOps).slice(I * VF, VF),
-                LI0->getPointerOperand(), Instruction::GetElementPtr, CostKind,
-                ScalarTy, SubVecTy);
-            VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
-                                                    LI0->getPointerOperand(),
-                                                    /*VariableMask=*/false,
-                                                    CommonAlignment, CostKind) +
-                         VectorGEPCost - ScalarGEPCost;
-            break;
-          }
-          case LoadsState::Gather:
-            llvm_unreachable(
-                "Expected only consecutive, strided or masked gather loads.");
-          }
-          SmallVector<int> ShuffleMask(VL.size());
-          for (int Idx : seq<int>(0, VL.size()))
-            ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
+      InstructionCost VecLdCost = 0;
+      if (!DemandedElts.isZero()) {
+        VecLdCost =
+            TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
+                                         /*Extract=*/false, CostKind) +
+            ScalarGEPCost;
+        for (unsigned Idx : seq<unsigned>(VL.size()))
+          if (DemandedElts[Idx])
+            VecLdCost +=
+                TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
+      }
+      auto *SubVecTy = getWidenedType(ScalarTy, VF);
+      for (auto [I, LS] : enumerate(States)) {
+        auto *LI0 = cast<LoadInst>(VL[I * VF]);
+        InstructionCost VectorGEPCost =
+            (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
+                ? 0
+                : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
+                              LI0->getPointerOperand(),
+                              Instruction::GetElementPtr, CostKind, ScalarTy,
+                              SubVecTy)
+                      .second;
+        if (LS == LoadsState::ScatterVectorize) {
+          if (static_cast<unsigned>(
+                  count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
+                  PointerOps.size() - 1 ||
+              any_of(PointerOps, [&](Value *V) {
+                return getUnderlyingObject(V) !=
+                       getUnderlyingObject(PointerOps.front());
+              }))
+            VectorGEPCost += TTI.getScalarizationOverhead(
+                SubVecTy, APInt::getAllOnes(VF),
+                /*Insert=*/true, /*Extract=*/false, CostKind);
+          else
+            VectorGEPCost += TTI.getScalarizationOverhead(
+                                 SubVecTy, APInt::getOneBitSet(VF, 0),
+                                 /*Insert=*/true, /*Extract=*/false, CostKind) +
+                             ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy,
+                                              std::nullopt, CostKind);
+        }
+        switch (LS) {
+        case LoadsState::Vectorize:
+          VecLdCost +=
+              TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
+                                  LI0->getPointerAddressSpace(), CostKind,
+                                  TTI::OperandValueInfo()) +
+              VectorGEPCost;
+          break;
+        case LoadsState::StridedVectorize:
+          VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
+                                                  LI0->getPointerOperand(),
+                                                  /*VariableMask=*/false,
+                                                  CommonAlignment, CostKind) +
+                       VectorGEPCost;
+          break;
+        case LoadsState::ScatterVectorize:
+          VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
+                                                  LI0->getPointerOperand(),
+                                                  /*VariableMask=*/false,
+                                                  CommonAlignment, CostKind) +
+                       VectorGEPCost;
+          break;
+        case LoadsState::Gather:
+          // Gathers are already calculated - ignore.
+          continue;
+        }
+        SmallVector<int> ShuffleMask(VL.size());
+        for (int Idx : seq<int>(0, VL.size()))
+          ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
+        if (I > 0)
           VecLdCost +=
               ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
                                CostKind, I * VF, SubVecTy);
-        }
-        // If masked gather cost is higher - better to vectorize, so
-        // consider it as a gather node. It will be better estimated
-        // later.
-        if (MaskedGatherCost >= VecLdCost)
-          return true;
       }
+      // If masked gather cost is higher - better to vectorize, so
+      // consider it as a gather node. It will be better estimated
+      // later.
+      if (MaskedGatherCost >= VecLdCost &&
+          VecLdCost - GatherCost < -SLPCostThreshold)
+        return true;
     }
-    return false;
+    return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
   };
   // TODO: need to improve analysis of the pointers, if not all of them are
   // GEPs or have > 2 operands, we end up with a gather node, which just
@@ -4939,7 +5007,8 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
       !TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) {
     // Check if potential masked gather can be represented as series
     // of loads + insertsubvectors.
-    if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
+    if (TryRecursiveCheck &&
+        CheckForShuffledLoads(CommonAlignment, ProfitableGatherPointers)) {
       // If masked gather cost is higher - better to vectorize, so
       // consider it as a gather node. It will be better estimated
       // later.
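To make the per-slice shuffle accounting in the new code concrete, here is a small standalone sketch (illustrative harness, not LLVM code) of the mask formula ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx that the patch uses when charging the SK_InsertSubvector shuffle for each vectorized slice.

```cpp
// Standalone sketch of how the patch builds the insert-subvector shuffle mask
// for slice I of width VF: lanes belonging to the current slice are taken from
// the second vector (indices NumElts + lane), all other lanes keep their
// identity index. The function name and harness are illustrative.
#include <cstdio>
#include <vector>

std::vector<int> buildInsertSubvectorMask(unsigned NumElts, unsigned VF,
                                          unsigned I) {
  std::vector<int> ShuffleMask(NumElts);
  for (unsigned Idx = 0; Idx < NumElts; ++Idx)
    ShuffleMask[Idx] = Idx / VF == I ? NumElts + Idx % VF : Idx;
  return ShuffleMask;
}

int main() {
  // For 8 loads split into slices of VF = 4, inserting slice I = 1 yields the
  // mask {0, 1, 2, 3, 8, 9, 10, 11}.
  for (int M : buildInsertSubvectorMask(/*NumElts=*/8, /*VF=*/4, /*I=*/1))
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}
```

In the patch this mask feeds ::getShuffleCost(TTI, TTI::SK_InsertSubvector, ...), and the I > 0 guard means the cost of stitching a slice back into the full vector is only charged for slices after the first.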

llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll

Lines changed: 14 additions & 6 deletions
@@ -180,12 +180,20 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado
 ; AVX512F-NEXT: ret void
 ;
 ; AVX512VL-LABEL: @gather_load_2(
-; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP1:%.*]], i64 0
-; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <4 x ptr> [[TMP3]], <4 x ptr> poison, <4 x i32> zeroinitializer
-; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <4 x ptr> [[TMP4]], <4 x i64> <i64 1, i64 10, i64 3, i64 5>
-; AVX512VL-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison), !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512VL-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4
+; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40
+; AVX512VL-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12
+; AVX512VL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20
+; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
+; AVX512VL-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1
+; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2
+; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3
+; AVX512VL-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
+; AVX512VL-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT: ret void
 ;
 %3 = getelementptr inbounds i32, ptr %1, i64 1

llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll

Lines changed: 14 additions & 6 deletions
@@ -180,12 +180,20 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado
 ; AVX512F-NEXT: ret void
 ;
 ; AVX512VL-LABEL: @gather_load_2(
-; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP1:%.*]], i64 0
-; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <4 x ptr> [[TMP3]], <4 x ptr> poison, <4 x i32> zeroinitializer
-; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <4 x ptr> [[TMP4]], <4 x i64> <i64 1, i64 10, i64 3, i64 5>
-; AVX512VL-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison), !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512VL-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4
+; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40
+; AVX512VL-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12
+; AVX512VL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20
+; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
+; AVX512VL-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1
+; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2
+; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3
+; AVX512VL-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
+; AVX512VL-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT: ret void
 ;
 %3 = getelementptr inbounds i32, ptr %1, i64 1

llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll

Lines changed: 12 additions & 8 deletions
@@ -8,19 +8,23 @@
 ; YAML-NEXT: Function: test
 ; YAML-NEXT: Args:
 ; YAML-NEXT: - String: 'Stores SLP vectorized with cost '
-; YAML-NEXT: - Cost: '-5'
+; YAML-NEXT: - Cost: '-7'
 ; YAML-NEXT: - String: ' and with tree size '
-; YAML-NEXT: - TreeSize: '7'
+; YAML-NEXT: - TreeSize: '5'

 define void @test(ptr noalias %p, ptr noalias %p1) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x ptr> poison, ptr [[P:%.*]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x ptr> [[TMP1]], <4 x i64> <i64 0, i64 32, i64 33, i64 34>
-; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[P]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[I:%.*]] = load i32, ptr [[P:%.*]], align 4
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr i32, ptr [[P]], i64 32
+; CHECK-NEXT: [[I2:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr i32, ptr [[P]], i64 33
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX11]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[I]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[I2]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP3]], <2 x i32> [[TMP0]], i64 2)
+; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP1]]
 ; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[P1:%.*]], align 4
 ; CHECK-NEXT: ret void
 ;
