Skip to content

Commit 37f7b31

Browse files
authored
Reland [VPlan] Handle WidenGEP in narrowToSingleScalars (#167880)
Changes: Fix a missed update to VPWidenGEPRecipe::usesFirstLaneOnly, and include a reduced test case that was previously hitting the new assert. The underlying cause was that VPWidenGEPRecipe::usesScalars was too weak, so a single-scalar WidenGEP was not narrowed by narrowToSingleScalarRecipes. This allows us to strip a special case in VPWidenGEPRecipe::execute.
1 parent 81f4ab8 commit 37f7b31

File tree

6 files changed

+107
-73
lines changed

6 files changed

+107
-73
lines changed

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1854,12 +1854,6 @@ class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags {
18541854
return getOperand(I + 1)->isDefinedOutsideLoopRegions();
18551855
}
18561856

1857-
bool areAllOperandsInvariant() const {
1858-
return all_of(operands(), [](VPValue *Op) {
1859-
return Op->isDefinedOutsideLoopRegions();
1860-
});
1861-
}
1862-
18631857
public:
18641858
VPWidenGEPRecipe(GetElementPtrInst *GEP, ArrayRef<VPValue *> Operands,
18651859
const VPIRFlags &Flags = {},
@@ -1898,14 +1892,7 @@ class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags {
18981892
}
18991893

19001894
/// Returns true if the recipe only uses the first lane of operand \p Op.
1901-
bool usesFirstLaneOnly(const VPValue *Op) const override {
1902-
assert(is_contained(operands(), Op) &&
1903-
"Op must be an operand of the recipe");
1904-
if (Op == getOperand(0))
1905-
return isPointerLoopInvariant();
1906-
else
1907-
return !isPointerLoopInvariant() && Op->isDefinedOutsideLoopRegions();
1908-
}
1895+
bool usesFirstLaneOnly(const VPValue *Op) const override;
19091896

19101897
protected:
19111898
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 31 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -2536,6 +2536,11 @@ void VPScalarIVStepsRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
25362536
}
25372537
#endif
25382538

2539+
bool VPWidenGEPRecipe::usesFirstLaneOnly(const VPValue *Op) const {
2540+
assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
2541+
return vputils::isSingleScalar(Op);
2542+
}
2543+
25392544
void VPWidenGEPRecipe::execute(VPTransformState &State) {
25402545
assert(State.VF.isVector() && "not widening");
25412546
// Construct a vector GEP by widening the operands of the scalar GEP as
@@ -2544,51 +2549,32 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {
25442549
// is vector-typed. Thus, to keep the representation compact, we only use
25452550
// vector-typed operands for loop-varying values.
25462551

2547-
if (areAllOperandsInvariant()) {
2548-
// If we are vectorizing, but the GEP has only loop-invariant operands,
2549-
// the GEP we build (by only using vector-typed operands for
2550-
// loop-varying values) would be a scalar pointer. Thus, to ensure we
2551-
// produce a vector of pointers, we need to either arbitrarily pick an
2552-
// operand to broadcast, or broadcast a clone of the original GEP.
2553-
// Here, we broadcast a clone of the original.
2554-
//
2555-
// TODO: If at some point we decide to scalarize instructions having
2556-
// loop-invariant operands, this special case will no longer be
2557-
// required. We would add the scalarization decision to
2558-
// collectLoopScalars() and teach getVectorValue() to broadcast
2559-
// the lane-zero scalar value.
2560-
SmallVector<Value *> Ops;
2561-
for (unsigned I = 0, E = getNumOperands(); I != E; I++)
2562-
Ops.push_back(State.get(getOperand(I), VPLane(0)));
2563-
2564-
auto *NewGEP =
2565-
State.Builder.CreateGEP(getSourceElementType(), Ops[0], drop_begin(Ops),
2566-
"", getGEPNoWrapFlags());
2567-
Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP);
2568-
State.set(this, Splat);
2569-
} else {
2570-
// If the GEP has at least one loop-varying operand, we are sure to
2571-
// produce a vector of pointers unless VF is scalar.
2572-
// The pointer operand of the new GEP. If it's loop-invariant, we
2573-
// won't broadcast it.
2574-
auto *Ptr = State.get(getOperand(0), isPointerLoopInvariant());
2575-
2576-
// Collect all the indices for the new GEP. If any index is
2577-
// loop-invariant, we won't broadcast it.
2578-
SmallVector<Value *, 4> Indices;
2579-
for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
2580-
VPValue *Operand = getOperand(I);
2581-
Indices.push_back(State.get(Operand, isIndexLoopInvariant(I - 1)));
2582-
}
2583-
2584-
// Create the new GEP. Note that this GEP may be a scalar if VF == 1,
2585-
// but it should be a vector, otherwise.
2586-
auto *NewGEP = State.Builder.CreateGEP(getSourceElementType(), Ptr, Indices,
2587-
"", getGEPNoWrapFlags());
2588-
assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
2589-
"NewGEP is not a pointer vector");
2590-
State.set(this, NewGEP);
2591-
}
2552+
assert(
2553+
any_of(operands(),
2554+
[](VPValue *Op) { return !Op->isDefinedOutsideLoopRegions(); }) &&
2555+
"Expected at least one loop-variant operand");
2556+
2557+
// If the GEP has at least one loop-varying operand, we are sure to
2558+
// produce a vector of pointers unless VF is scalar.
2559+
// The pointer operand of the new GEP. If it's loop-invariant, we
2560+
// won't broadcast it.
2561+
auto *Ptr = State.get(getOperand(0), isPointerLoopInvariant());
2562+
2563+
// Collect all the indices for the new GEP. If any index is
2564+
// loop-invariant, we won't broadcast it.
2565+
SmallVector<Value *, 4> Indices;
2566+
for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
2567+
VPValue *Operand = getOperand(I);
2568+
Indices.push_back(State.get(Operand, isIndexLoopInvariant(I - 1)));
2569+
}
2570+
2571+
// Create the new GEP. Note that this GEP may be a scalar if VF == 1,
2572+
// but it should be a vector, otherwise.
2573+
auto *NewGEP = State.Builder.CreateGEP(getSourceElementType(), Ptr, Indices,
2574+
"", getGEPNoWrapFlags());
2575+
assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
2576+
"NewGEP is not a pointer vector");
2577+
State.set(this, NewGEP);
25922578
}
25932579

25942580
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1451,7 +1451,8 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
14511451
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
14521452
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
14531453
for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1454-
if (!isa<VPWidenRecipe, VPWidenSelectRecipe, VPReplicateRecipe>(&R))
1454+
if (!isa<VPWidenRecipe, VPWidenSelectRecipe, VPWidenGEPRecipe,
1455+
VPReplicateRecipe>(&R))
14551456
continue;
14561457
auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
14571458
if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))

llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ define void @predicated_uniform_load(ptr %src, i32 %n, ptr %dst, i1 %cond) {
6363
; CHECK-NEXT: store i32 [[STORE]], ptr [[NBRBOXES]], align 4
6464
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
6565
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp sgt i32 [[IV]], [[IBOX]]
66-
; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
66+
; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
6767
; CHECK: exit:
6868
; CHECK-NEXT: ret void
6969
;
@@ -114,7 +114,7 @@ define void @predicated_strided_store(ptr %start) {
114114
; RVA23-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]]
115115
; RVA23-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
116116
; RVA23-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
117-
; RVA23-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
117+
; RVA23-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
118118
; RVA23: middle.block:
119119
; RVA23-NEXT: br label [[LOOP:%.*]]
120120
; RVA23: exit:
@@ -141,7 +141,7 @@ define void @predicated_strided_store(ptr %start) {
141141
; RVA23ZVL1024B-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]]
142142
; RVA23ZVL1024B-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
143143
; RVA23ZVL1024B-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
144-
; RVA23ZVL1024B-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
144+
; RVA23ZVL1024B-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
145145
; RVA23ZVL1024B: middle.block:
146146
; RVA23ZVL1024B-NEXT: br label [[LOOP:%.*]]
147147
; RVA23ZVL1024B: exit:
@@ -185,16 +185,16 @@ define void @store_to_addr_generated_from_invariant_addr(ptr noalias %p0, ptr no
185185
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[P1:%.*]], <vscale x 2 x i64> [[VEC_IND]]
186186
; CHECK-NEXT: call void @llvm.vp.scatter.nxv2p0.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT1]], <vscale x 2 x ptr> align 8 [[TMP5]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP3]])
187187
; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[P2:%.*]], align 4
188-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP6]], i64 0
189-
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
190-
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[P3:%.*]], <vscale x 2 x i64> [[BROADCAST_SPLAT2]]
188+
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[P3:%.*]], i64 [[TMP6]]
189+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[TMP8]], i64 0
190+
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
191191
; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x ptr> align 4 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP3]])
192192
; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x ptr> align 4 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP3]])
193193
; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i8.nxv2p0(<vscale x 2 x i8> zeroinitializer, <vscale x 2 x ptr> align 1 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP3]])
194194
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP4]]
195195
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
196196
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
197-
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
197+
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
198198
; CHECK: middle.block:
199199
; CHECK-NEXT: br label [[LOOP:%.*]]
200200
; CHECK: exit:
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph" --version 6
2+
; RUN: opt -p loop-vectorize -force-vector-width=2 \
3+
; RUN: -force-target-supports-scalable-vectors=true \
4+
; RUN: -scalable-vectorization=preferred -S %s | FileCheck %s
5+
6+
define void @widengep_narrow(ptr %in, ptr noalias %p) {
7+
; CHECK-LABEL: define void @widengep_narrow(
8+
; CHECK-SAME: ptr [[IN:%.*]], ptr noalias [[P:%.*]]) {
9+
; CHECK-NEXT: [[ENTRY:.*:]]
10+
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
11+
; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1
12+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
13+
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
14+
; CHECK: [[VECTOR_PH]]:
15+
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
16+
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
17+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
18+
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
19+
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[IN]], i64 8
20+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[TMP4]], i64 0
21+
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
22+
; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
23+
; CHECK-NEXT: [[TMP6:%.*]] = mul <vscale x 2 x i64> [[TMP5]], splat (i64 1)
24+
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP6]]
25+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP3]], i64 0
26+
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
27+
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
28+
; CHECK: [[VECTOR_BODY]]:
29+
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
30+
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
31+
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, <vscale x 2 x ptr> [[BROADCAST_SPLAT2]], <vscale x 2 x i64> [[VEC_IND]]
32+
; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vscale.i32()
33+
; CHECK-NEXT: [[TMP9:%.*]] = mul nuw i32 [[TMP8]], 2
34+
; CHECK-NEXT: [[TMP10:%.*]] = sub i32 [[TMP9]], 1
35+
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <vscale x 2 x ptr> [[TMP7]], i32 [[TMP10]]
36+
; CHECK-NEXT: store ptr [[TMP11]], ptr [[P]], align 8
37+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
38+
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
39+
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
40+
; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
41+
; CHECK: [[MIDDLE_BLOCK]]:
42+
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]]
43+
; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
44+
; CHECK: [[SCALAR_PH]]:
45+
;
46+
entry:
47+
br label %loop
48+
49+
loop:
50+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
51+
%gep.in.off = getelementptr i8, ptr %in, i64 8
52+
%gep.in.iv = getelementptr i32, ptr %gep.in.off, i64 %iv
53+
store ptr %gep.in.iv, ptr %p
54+
%iv.next = add i64 %iv, 1
55+
%ec = icmp eq i64 %iv, 1024
56+
br i1 %ec, label %exit, label %loop
57+
58+
exit:
59+
ret void
60+
}

llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,14 @@ define void @pr63340(ptr %A, ptr %B) {
88
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
99
; CHECK: vector.ph:
1010
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 1
11-
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i64 0
12-
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x ptr> [[DOTSPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer
11+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i64 0
12+
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer
1313
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
1414
; CHECK: vector.body:
1515
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1616
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i8
1717
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds ptr, ptr [[B]], i8 [[OFFSET_IDX]]
18-
; CHECK-NEXT: store <4 x ptr> [[DOTSPLAT]], ptr [[TMP1]], align 8
18+
; CHECK-NEXT: store <4 x ptr> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 8
1919
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
2020
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
2121
; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -55,11 +55,11 @@ define void @wide_gep_index_invariant(ptr noalias %dst, ptr noalias %src, i64 %n
5555
; CHECK: vector.body:
5656
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
5757
; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SRC]], align 8
58-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i64 0
58+
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, ptr [[TMP0]], i64 [[N]]
59+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP1]], i64 0
5960
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer
60-
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, <4 x ptr> [[BROADCAST_SPLAT]], i64 [[N]]
6161
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr ptr, ptr [[DST]], i64 [[INDEX]]
62-
; CHECK-NEXT: store <4 x ptr> [[TMP1]], ptr [[TMP2]], align 8
62+
; CHECK-NEXT: store <4 x ptr> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 8
6363
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
6464
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
6565
; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]

0 commit comments

Comments (0)