Skip to content

Commit 1c7ec06

Browse files
authored
[VPlan] Optimize LastActiveLane to EVL - 1 (llvm#169766)
With EVL tail folding, the LastActiveLane can be computed with EVL - 1. This removes the need for a header mask and vfirst.m for loops with live outs on RISC-V:

  # %bb.5:                                # %for.cond.cleanup7
- vsetvli zero, zero, e32, m2, ta, ma
- vmv.v.x v8, s1
- vmsleu.vv v10, v8, v22
- vfirst.m a0, v10
- srli a1, a0, 63
- czero.nez a0, a0, a1
- czero.eqz a1, s8, a1
- or a0, a0, a1
- addi a0, a0, -1
- vsetvli zero, zero, e64, m4, ta, ma
- vslidedown.vx v8, v12, a0
+ addi s1, s1, -1
+ vslidedown.vx v8, v12, s1
1 parent 6abbbca commit 1c7ec06

File tree

7 files changed

+18
-41
lines changed

7 files changed

+18
-41
lines changed

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2736,6 +2736,7 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
27362736
VPRecipeBase &CurRecipe,
27372737
VPTypeAnalysis &TypeInfo, VPValue &EVL) {
27382738
VPlan *Plan = CurRecipe.getParent()->getPlan();
2739+
DebugLoc DL = CurRecipe.getDebugLoc();
27392740
VPValue *Addr, *Mask, *EndPtr;
27402741

27412742
/// Adjust any end pointers so that they point to the end of EVL lanes not VF.
@@ -2787,13 +2788,21 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
27872788
m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
27882789
return new VPWidenIntrinsicRecipe(
27892790
Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
2790-
TypeInfo.inferScalarType(LHS), {}, {}, CurRecipe.getDebugLoc());
2791+
TypeInfo.inferScalarType(LHS), {}, {}, DL);
27912792

27922793
if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
27932794
m_VPValue(RHS))))
27942795
return new VPWidenIntrinsicRecipe(
27952796
Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
2796-
TypeInfo.inferScalarType(LHS), {}, {}, CurRecipe.getDebugLoc());
2797+
TypeInfo.inferScalarType(LHS), {}, {}, DL);
2798+
2799+
if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
2800+
Type *Ty = TypeInfo.inferScalarType(CurRecipe.getVPSingleValue());
2801+
VPValue *ZExt =
2802+
VPBuilder(&CurRecipe).createScalarCast(Instruction::ZExt, &EVL, Ty, DL);
2803+
return new VPInstruction(Instruction::Sub,
2804+
{ZExt, Plan->getConstantInt(Ty, 1)}, {}, {}, DL);
2805+
}
27972806

27982807
return nullptr;
27992808
}

llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -87,12 +87,9 @@ define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) {
8787
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
8888
; CHECK-NEXT: [[AVL:%.*]] = phi i32 [ 252, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
8989
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true)
90-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP2]], i64 0
91-
; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT2]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
9290
; CHECK-NEXT: [[TMP3:%.*]] = mul i32 4, [[TMP2]]
9391
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0
9492
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
95-
; CHECK-NEXT: [[TMP5:%.*]] = icmp uge <vscale x 16 x i32> [[TMP0]], [[BROADCAST_SPLAT3]]
9693
; CHECK-NEXT: [[TMP9:%.*]] = sext <vscale x 16 x i32> [[VEC_IND]] to <vscale x 16 x i64>
9794
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[SRC]], <vscale x 16 x i64> [[TMP9]]
9895
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP6]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP2]]), !alias.scope [[META3:![0-9]+]]
@@ -103,7 +100,7 @@ define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) {
103100
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
104101
; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
105102
; CHECK: [[MIDDLE_BLOCK]]:
106-
; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP5]], i1 false)
103+
; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP2]] to i64
107104
; CHECK-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], 1
108105
; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
109106
; CHECK-NEXT: [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 16

llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -656,13 +656,9 @@ define i32 @udiv_sdiv_with_invariant_divisors(i8 %x, i16 %y, i1 %c) {
656656
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 8 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
657657
; CHECK-NEXT: [[AVL:%.*]] = phi i32 [ 12, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
658658
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 8, i1 true)
659-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[TMP3]], i64 0
660-
; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT7]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
661659
; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
662660
; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 8 x i8> poison, i8 [[TMP4]], i64 0
663661
; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 8 x i8> [[BROADCAST_SPLATINSERT5]], <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
664-
; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x i32> @llvm.stepvector.nxv8i32()
665-
; CHECK-NEXT: [[TMP15:%.*]] = icmp uge <vscale x 8 x i32> [[TMP5]], [[BROADCAST_SPLAT8]]
666662
; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 8 x i8> @llvm.vp.merge.nxv8i8(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> [[BROADCAST_SPLAT2]], <vscale x 8 x i8> splat (i8 1), i32 [[TMP3]])
667663
; CHECK-NEXT: [[TMP9:%.*]] = udiv <vscale x 8 x i8> [[VEC_IND]], [[TMP8]]
668664
; CHECK-NEXT: [[TMP10:%.*]] = zext <vscale x 8 x i8> [[TMP9]] to <vscale x 8 x i16>
@@ -675,7 +671,7 @@ define i32 @udiv_sdiv_with_invariant_divisors(i8 %x, i16 %y, i1 %c) {
675671
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
676672
; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
677673
; CHECK: middle.block:
678-
; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> [[TMP15]], i1 false)
674+
; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP3]] to i64
679675
; CHECK-NEXT: [[TMP17:%.*]] = sub i64 [[TMP16]], 1
680676
; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
681677
; CHECK-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 8

llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,6 @@ define i64 @pr97452_scalable_vf1_for(ptr %src, ptr noalias %dst) #0 {
2424
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 23, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
2525
; CHECK-NEXT: [[PREV_EVL:%.*]] = phi i32 [ [[TMP2]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
2626
; CHECK-NEXT: [[TMP6]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
27-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP6]], i64 0
28-
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
29-
; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
30-
; CHECK-NEXT: [[TMP8:%.*]] = icmp uge <vscale x 2 x i32> [[TMP7]], [[BROADCAST_SPLAT]]
3127
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[EVL_BASED_IV]]
3228
; CHECK-NEXT: [[VP_OP_LOAD]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP6]])
3329
; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.splice.nxv2i64(<vscale x 2 x i64> [[VECTOR_RECUR]], <vscale x 2 x i64> [[VP_OP_LOAD]], i32 -1, <vscale x 2 x i1> splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP6]])
@@ -39,8 +35,7 @@ define i64 @pr97452_scalable_vf1_for(ptr %src, ptr noalias %dst) #0 {
3935
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
4036
; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
4137
; CHECK: [[MIDDLE_BLOCK]]:
42-
; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP8]], i1 false)
43-
; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[TMP14]], 1
38+
; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[TMP12]], 1
4439
; CHECK-NEXT: [[TMP16:%.*]] = sub i64 [[TMP15]], 1
4540
; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
4641
; CHECK-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 2

llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -231,10 +231,6 @@ define i64 @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64
231231
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
232232
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1025, [[ENTRY]] ], [ [[AVL_NEXT:%.*]], [[FOR_BODY]] ]
233233
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
234-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP0]], i64 0
235-
; CHECK-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
236-
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
237-
; CHECK-NEXT: [[TMP2:%.*]] = icmp uge <vscale x 2 x i32> [[TMP1]], [[BROADCAST_SPLAT1]]
238234
; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[B:%.*]], align 8
239235
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
240236
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
@@ -246,8 +242,7 @@ define i64 @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64
246242
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
247243
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
248244
; CHECK: middle.block:
249-
; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP2]], i1 false)
250-
; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
245+
; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP5]], 1
251246
; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
252247
; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 2
253248
; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 0

llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -417,10 +417,6 @@ define i32 @FOR_reduction(ptr noalias %A, ptr noalias %B, i64 %TC) {
417417
; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[TC]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
418418
; IF-EVL-NEXT: [[PREV_EVL:%.*]] = phi i32 [ [[TMP4]], %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
419419
; IF-EVL-NEXT: [[TMP9]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
420-
; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP9]], i64 0
421-
; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
422-
; IF-EVL-NEXT: [[TMP22:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
423-
; IF-EVL-NEXT: [[TMP23:%.*]] = icmp uge <vscale x 4 x i32> [[TMP22]], [[BROADCAST_SPLAT]]
424420
; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]]
425421
; IF-EVL-NEXT: [[WIDE_LOAD]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[ARRAYIDX]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
426422
; IF-EVL-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[WIDE_LOAD]], i32 -1, <vscale x 4 x i1> splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP9]])
@@ -433,8 +429,7 @@ define i32 @FOR_reduction(ptr noalias %A, ptr noalias %B, i64 %TC) {
433429
; IF-EVL-NEXT: [[TMP24:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
434430
; IF-EVL-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
435431
; IF-EVL: [[MIDDLE_BLOCK]]:
436-
; IF-EVL-NEXT: [[TMP27:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> [[TMP23]], i1 false)
437-
; IF-EVL-NEXT: [[TMP28:%.*]] = sub i64 [[TMP27]], 1
432+
; IF-EVL-NEXT: [[TMP28:%.*]] = sub i64 [[TMP13]], 1
438433
; IF-EVL-NEXT: [[TMP17:%.*]] = sub i64 [[TMP28]], 1
439434
; IF-EVL-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
440435
; IF-EVL-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 4

llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -117,10 +117,6 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
117117
; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
118118
; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1025, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
119119
; SCALABLE-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
120-
; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP0]], i64 0
121-
; SCALABLE-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
122-
; SCALABLE-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
123-
; SCALABLE-NEXT: [[TMP2:%.*]] = icmp uge <vscale x 2 x i32> [[TMP1]], [[BROADCAST_SPLAT1]]
124120
; SCALABLE-NEXT: [[TMP6:%.*]] = load i64, ptr [[B]], align 8
125121
; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP6]], i64 0
126122
; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
@@ -132,8 +128,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
132128
; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
133129
; SCALABLE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
134130
; SCALABLE: [[MIDDLE_BLOCK]]:
135-
; SCALABLE-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP2]], i1 false)
136-
; SCALABLE-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1
131+
; SCALABLE-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[TMP5]], 1
137132
; SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
138133
; SCALABLE-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP7]], 2
139134
; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP11]], 0
@@ -186,10 +181,6 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
186181
; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
187182
; TF-SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1025, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
188183
; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
189-
; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP0]], i64 0
190-
; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
191-
; TF-SCALABLE-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
192-
; TF-SCALABLE-NEXT: [[TMP2:%.*]] = icmp uge <vscale x 2 x i32> [[TMP1]], [[BROADCAST_SPLAT1]]
193184
; TF-SCALABLE-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 8
194185
; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
195186
; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
@@ -201,8 +192,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
201192
; TF-SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
202193
; TF-SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
203194
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
204-
; TF-SCALABLE-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP2]], i1 false)
205-
; TF-SCALABLE-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1
195+
; TF-SCALABLE-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[TMP5]], 1
206196
; TF-SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
207197
; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2
208198
; TF-SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 0

0 commit comments

Comments
 (0)