Skip to content

Commit d67dba5

Browse files
committed
[VPlan] Check Def2LaneDefs first in cloneForLane. (NFC)
If Def2LaneDefs has an entry for an operand, we always have to use that entry. Move the Def2LaneDefs check so that it is performed first, before the other operand checks. Otherwise we may not pick the correct operand, e.g. if Op was a replicate recipe that became single-scalar after it was replicated. Fixes llvm#154330.
1 parent b20c291 commit d67dba5

File tree

2 files changed

+103
-16
lines changed

2 files changed

+103
-16
lines changed

llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -473,20 +473,20 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
473473
// Collect the operands at Lane, creating extracts as needed.
474474
SmallVector<VPValue *> NewOps;
475475
for (VPValue *Op : RepR->operands()) {
476-
if (vputils::isSingleScalar(Op)) {
477-
NewOps.push_back(Op);
476+
// If Op is a definition that has been unrolled, directly use the clone for
477+
// the corresponding lane.
478+
auto LaneDefs = Def2LaneDefs.find(Op);
479+
if (LaneDefs != Def2LaneDefs.end()) {
480+
NewOps.push_back(LaneDefs->second[Lane.getKnownLane()]);
478481
continue;
479482
}
480483
if (Lane.getKind() == VPLane::Kind::ScalableLast) {
481484
NewOps.push_back(
482485
Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}));
483486
continue;
484487
}
485-
// If Op is a definition that has been unrolled, directly use the clone for
486-
// the corresponding lane.
487-
auto LaneDefs = Def2LaneDefs.find(Op);
488-
if (LaneDefs != Def2LaneDefs.end()) {
489-
NewOps.push_back(LaneDefs->second[Lane.getKnownLane()]);
488+
if (vputils::isSingleScalar(Op)) {
489+
NewOps.push_back(Op);
490490
continue;
491491
}
492492

llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll

Lines changed: 96 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
22
; RUN: opt -p loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S %s | FileCheck %s
33

44
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
@@ -215,11 +215,98 @@ loop.latch:
215215
exit:
216216
ret void
217217
}
218-
;.
219-
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
220-
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
221-
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
222-
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
223-
; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
224-
; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
225-
;.
218+
219+
define float @uniform_load_replicating_select(ptr %A, ptr %B, i64 %1) {
220+
; CHECK-LABEL: define float @uniform_load_replicating_select(
221+
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[TMP0:%.*]]) {
222+
; CHECK-NEXT: [[ENTRY:.*]]:
223+
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], 1
224+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 8
225+
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
226+
; CHECK: [[VECTOR_PH]]:
227+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 8
228+
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
229+
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
230+
; CHECK: [[VECTOR_BODY]]:
231+
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
232+
; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 4
233+
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 5
234+
; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 6
235+
; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 7
236+
; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[A]], align 4
237+
; CHECK-NEXT: [[TMP7:%.*]] = fcmp ogt float [[TMP6]], 0.000000e+00
238+
; CHECK-NEXT: [[TMP8:%.*]] = fcmp ogt float [[TMP6]], 0.000000e+00
239+
; CHECK-NEXT: [[TMP9:%.*]] = fcmp ogt float [[TMP6]], 0.000000e+00
240+
; CHECK-NEXT: [[TMP10:%.*]] = fcmp ogt float [[TMP6]], 0.000000e+00
241+
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i1> poison, i1 [[TMP7]], i32 0
242+
; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i1> [[TMP11]], i1 [[TMP8]], i32 1
243+
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i1> [[TMP12]], i1 [[TMP9]], i32 2
244+
; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i1> [[TMP13]], i1 [[TMP10]], i32 3
245+
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP2]]
246+
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP3]]
247+
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]]
248+
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP5]]
249+
; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP7]], ptr [[A]], ptr [[TMP15]]
250+
; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP8]], ptr [[A]], ptr [[TMP16]]
251+
; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP9]], ptr [[A]], ptr [[TMP17]]
252+
; CHECK-NEXT: [[TMP22:%.*]] = select i1 [[TMP10]], ptr [[A]], ptr [[TMP18]]
253+
; CHECK-NEXT: [[TMP23:%.*]] = select <4 x i1> [[TMP14]], <4 x float> splat (float 1.000000e+01), <4 x float> splat (float 1.000000e+00)
254+
; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP19]], align 4
255+
; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP20]], align 4
256+
; CHECK-NEXT: [[TMP26:%.*]] = load float, ptr [[TMP21]], align 4
257+
; CHECK-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP22]], align 4
258+
; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x float> poison, float [[TMP24]], i32 0
259+
; CHECK-NEXT: [[TMP29:%.*]] = insertelement <4 x float> [[TMP28]], float [[TMP25]], i32 1
260+
; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[TMP26]], i32 2
261+
; CHECK-NEXT: [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[TMP27]], i32 3
262+
; CHECK-NEXT: [[TMP32:%.*]] = fdiv <4 x float> splat (float 4.000000e+00), [[TMP31]]
263+
; CHECK-NEXT: [[TMP33:%.*]] = call <4 x float> @llvm.pow.v4f32(<4 x float> [[TMP23]], <4 x float> [[TMP32]])
264+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
265+
; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
266+
; CHECK-NEXT: br i1 [[TMP34]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
267+
; CHECK: [[MIDDLE_BLOCK]]:
268+
; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x float> [[TMP33]], i32 3
269+
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
270+
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
271+
; CHECK: [[SCALAR_PH]]:
272+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
273+
; CHECK-NEXT: br label %[[LOOP:.*]]
274+
; CHECK: [[LOOP]]:
275+
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
276+
; CHECK-NEXT: [[L:%.*]] = load float, ptr [[A]], align 4
277+
; CHECK-NEXT: [[C:%.*]] = fcmp ogt float [[L]], 0.000000e+00
278+
; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
279+
; CHECK-NEXT: [[SEL_PTR:%.*]] = select i1 [[C]], ptr [[A]], ptr [[GEP_B]]
280+
; CHECK-NEXT: [[BASE:%.*]] = select i1 [[C]], float 1.000000e+01, float 1.000000e+00
281+
; CHECK-NEXT: [[L_2:%.*]] = load float, ptr [[SEL_PTR]], align 4
282+
; CHECK-NEXT: [[DIV:%.*]] = fdiv float 4.000000e+00, [[L_2]]
283+
; CHECK-NEXT: [[POW:%.*]] = tail call float @llvm.pow.f32(float [[BASE]], float [[DIV]])
284+
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
285+
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[TMP0]]
286+
; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
287+
; CHECK: [[EXIT]]:
288+
; CHECK-NEXT: [[POW_LCSSA:%.*]] = phi float [ [[POW]], %[[LOOP]] ], [ [[TMP35]], %[[MIDDLE_BLOCK]] ]
289+
; CHECK-NEXT: ret float [[POW_LCSSA]]
290+
;
291+
entry:
292+
br label %loop
293+
294+
loop:
295+
%iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
296+
%l = load float, ptr %A, align 4
297+
%c = fcmp ogt float %l, 0.000000e+00
298+
%gep.B = getelementptr inbounds float, ptr %B, i64 %iv
299+
%sel.ptr = select i1 %c, ptr %A, ptr %gep.B
300+
%base = select i1 %c, float 10.000000e+00, float 1.000000e+00
301+
%l.2 = load float, ptr %sel.ptr, align 4
302+
%div = fdiv float 4.000000e+00, %l.2
303+
%pow = tail call float @llvm.pow.f32(float %base, float %div)
304+
%iv.next = add i64 %iv, 1
305+
%ec = icmp eq i64 %iv, %1
306+
br i1 %ec, label %exit, label %loop
307+
308+
exit:
309+
ret float %pow
310+
}
311+
312+
declare float @llvm.pow.f32(float, float)

0 commit comments

Comments
 (0)