
Commit 3e898bc

[LV] Fix cost mismatch when a gather/scatter address is uniform. (#157387)
This patch fixes an assertion failure when `isUniform` (from the legacy cost model) and `isSingleScalar` (from the VPlan-based cost model) disagree. A simplified form of the loop that triggers the assertion:

```
loop:
  %loadA = load %a      ; %a is loop invariant
  %loadB = load %loadA
  ...
```

The legacy cost model cannot work out that the address of `%loadB` is uniform, but the VPlan-based cost model treats the addresses of both `%loadA` and `%loadB` as single scalars. Full test that caused the crash: https://llvm.godbolt.org/z/zEG8YKjqh.

---------

Co-authored-by: Luke Lau <[email protected]>
1 parent 9179d3f commit 3e898bc
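For concreteness, the simplified pattern from the commit message could be written as a small, self-contained loop like the sketch below. This is illustrative only: the function name, operand types, and the extra store are invented for the example and are not the test added by this commit.

```llvm
define void @chained_uniform_load(ptr %a, ptr %b, i64 %n) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  ; %a is loop-invariant, so %p holds the same pointer on every iteration.
  %p = load ptr, ptr %a, align 8
  ; The address of this load (%p) is therefore uniform as well, even though it
  ; is itself produced by a load. vputils::isSingleScalar sees this
  ; recursively; the legacy uniformity analysis does not, which caused the
  ; cost mismatch described above.
  %v = load i32, ptr %p, align 4
  %dst = getelementptr i32, ptr %b, i64 %iv
  store i32 %v, ptr %dst, align 4
  %iv.next = add i64 %iv, 1
  %done = icmp eq i64 %iv.next, %n
  br i1 %done, label %exit, label %loop

exit:
  ret void
}
```

The check added to planContainsAdditionalSimplifications (in the diff below) returns true in exactly this situation, marking the plan as containing a simplification the legacy model cannot see, so the legacy/VPlan cost comparison is skipped rather than asserting.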

2 files changed: +82 −0 lines changed


llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 10 additions & 0 deletions
@@ -6907,6 +6907,16 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
       if (isa<VPPartialReductionRecipe>(&R))
         return true;
 
+      // The VPlan-based cost model can analyze if recipes are scalar
+      // recursively, but the legacy cost model cannot.
+      if (auto *WidenMemR = dyn_cast<VPWidenMemoryRecipe>(&R)) {
+        auto *AddrI = dyn_cast<Instruction>(
+            getLoadStorePointerOperand(&WidenMemR->getIngredient()));
+        if (AddrI && vputils::isSingleScalar(WidenMemR->getAddr()) !=
+                         CostCtx.isLegacyUniformAfterVectorization(AddrI, VF))
+          return true;
+      }
+
       /// If a VPlan transform folded a recipe to one producing a single-scalar,
       /// but the original instruction wasn't uniform-after-vectorization in the
       /// legacy cost model, the legacy cost overestimates the actual cost.

llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll

Lines changed: 72 additions & 0 deletions
@@ -184,3 +184,75 @@ loop:
 exit:
   ret void
 }
+
+define void @store_to_addr_generated_from_invariant_addr(ptr noalias %p0, ptr noalias %p1, ptr noalias %p2, ptr %p3, i64 %N) {
+; CHECK-LABEL: @store_to_addr_generated_from_invariant_addr(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[P0:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT2]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; CHECK-NEXT: [[TMP2:%.*]] = mul <vscale x 2 x i64> [[TMP1]], splat (i64 1)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP2]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP0]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP4]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[P1:%.*]], <vscale x 2 x i64> [[VEC_IND]]
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv2p0.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT1]], <vscale x 2 x ptr> align 8 [[TMP5]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP3]])
+; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[P2:%.*]], align 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP6]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[P3:%.*]], <vscale x 2 x i64> [[BROADCAST_SPLAT2]]
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x ptr> align 4 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP3]])
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x ptr> align 4 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP3]])
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i8.nxv2p0(<vscale x 2 x i8> zeroinitializer, <vscale x 2 x ptr> align 1 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP3]])
+; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br label [[EXIT:%.*]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr i32, ptr [[P1]], i64 [[IV]]
+; CHECK-NEXT: store ptr [[P0]], ptr [[ARRAYIDX11]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr [[P2]], align 4
+; CHECK-NEXT: [[BITS_TO_GO:%.*]] = getelementptr i8, ptr [[P3]], i64 [[TMP10]]
+; CHECK-NEXT: store i32 0, ptr [[BITS_TO_GO]], align 4
+; CHECK-NEXT: store i32 0, ptr [[BITS_TO_GO]], align 4
+; CHECK-NEXT: store i8 0, ptr [[BITS_TO_GO]], align 1
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx11 = getelementptr i32, ptr %p1, i64 %iv
+  store ptr %p0, ptr %arrayidx11, align 8
+  %0 = load i64, ptr %p2, align 4
+  %bits_to_go = getelementptr i8, ptr %p3, i64 %0
+  store i32 0, ptr %bits_to_go, align 4
+  store i32 0, ptr %bits_to_go, align 4
+  store i8 0, ptr %bits_to_go, align 1
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv, %N
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
