Commit d8abaa6

Author: git apple-llvm automerger
Merge commit '185ba025dadc' from llvm.org/main into next
2 parents: e07cfaf + 185ba02

File tree: 3 files changed, +266 −1 lines


llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp

Lines changed: 75 additions & 0 deletions
@@ -25,6 +25,7 @@
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/Local.h"
 
 using namespace llvm;
 
@@ -58,6 +59,7 @@ class RISCVCodeGenPrepare : public FunctionPass,
   bool visitAnd(BinaryOperator &BO);
   bool visitIntrinsicInst(IntrinsicInst &I);
   bool expandVPStrideLoad(IntrinsicInst &I);
+  bool widenVPMerge(IntrinsicInst &I);
 };
 
 } // end anonymous namespace
@@ -103,6 +105,76 @@ bool RISCVCodeGenPrepare::visitAnd(BinaryOperator &BO) {
   return true;
 }
 
+// With EVL tail folding, an AnyOf reduction will generate an i1 vp.merge like
+// the following:
+//
+// loop:
+//   %phi = phi <vscale x 4 x i1> [ zeroinitializer, %entry ], [ %rec, %loop ]
+//   %cmp = icmp ...
+//   %rec = call <vscale x 4 x i1> @llvm.vp.merge(%cmp, i1 true, %phi, %evl)
+//   ...
+// middle:
+//   %res = call i1 @llvm.vector.reduce.or(<vscale x 4 x i1> %rec)
+//
+// However, RVV doesn't have any tail-undisturbed mask instructions, so we
+// need a convoluted sequence of mask instructions to lower the i1 vp.merge:
+// see llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll.
+//
+// To avoid that, this widens the i1 vp.merge to an i8 vp.merge, which will
+// generate a single vmerge.vim:
+//
+// loop:
+//   %phi = phi <vscale x 4 x i8> [ zeroinitializer, %entry ], [ %rec, %loop ]
+//   %cmp = icmp ...
+//   %rec = call <vscale x 4 x i8> @llvm.vp.merge(%cmp, i8 true, %phi, %evl)
+//   %trunc = trunc <vscale x 4 x i8> %rec to <vscale x 4 x i1>
+//   ...
+// middle:
+//   %res = call i1 @llvm.vector.reduce.or(<vscale x 4 x i1> %trunc)
+//
+// The trunc will normally be sunk outside of the loop, but even if there are
+// users inside the loop it is still profitable.
+bool RISCVCodeGenPrepare::widenVPMerge(IntrinsicInst &II) {
+  if (!II.getType()->getScalarType()->isIntegerTy(1))
+    return false;
+
+  Value *Mask, *True, *PhiV, *EVL;
+  using namespace PatternMatch;
+  if (!match(&II,
+             m_Intrinsic<Intrinsic::vp_merge>(m_Value(Mask), m_Value(True),
+                                              m_Value(PhiV), m_Value(EVL))))
+    return false;
+
+  auto *Phi = dyn_cast<PHINode>(PhiV);
+  if (!Phi || !Phi->hasOneUse() || Phi->getNumIncomingValues() != 2 ||
+      !match(Phi->getIncomingValue(0), m_Zero()) ||
+      Phi->getIncomingValue(1) != &II)
+    return false;
+
+  Type *WideTy =
+      VectorType::get(IntegerType::getInt8Ty(II.getContext()),
+                      cast<VectorType>(II.getType())->getElementCount());
+
+  IRBuilder<> Builder(Phi);
+  PHINode *WidePhi = Builder.CreatePHI(WideTy, 2);
+  WidePhi->addIncoming(ConstantAggregateZero::get(WideTy),
+                       Phi->getIncomingBlock(0));
+  Builder.SetInsertPoint(&II);
+  Value *WideTrue = Builder.CreateZExt(True, WideTy);
+  Value *WideMerge = Builder.CreateIntrinsic(Intrinsic::vp_merge, {WideTy},
+                                             {Mask, WideTrue, WidePhi, EVL});
+  WidePhi->addIncoming(WideMerge, Phi->getIncomingBlock(1));
+  Value *Trunc = Builder.CreateTrunc(WideMerge, II.getType());
+
+  II.replaceAllUsesWith(Trunc);
+
+  // Break the use-def cycle between the phi and the vp.merge, then delete
+  // the now-dead chain.
+  Phi->setIncomingValue(1, Phi->getIncomingValue(0));
+  llvm::RecursivelyDeleteTriviallyDeadInstructions(&II);
+
+  return true;
+}
+
 // LLVM vector reduction intrinsics return a scalar result, but on RISC-V vector
 // reduction instructions write the result in the first element of a vector
 // register. So when a reduction in a loop uses a scalar phi, we end up with
@@ -138,6 +210,9 @@ bool RISCVCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
   if (expandVPStrideLoad(I))
     return true;
 
+  if (widenVPMerge(I))
+    return true;
+
   if (I.getIntrinsicID() != Intrinsic::vector_reduce_fadd &&
       !isa<VPReductionIntrinsic>(&I))
     return false;
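
Note that the guards at the top of widenVPMerge restrict the rewrite to the exact AnyOf shape in the comment: a two-incoming i1 phi that starts at zeroinitializer and whose only use is the vp.merge feeding back into it. As a minimal sketch of a loop the pass leaves alone (not part of the patch; the function name and the extra user %other are hypothetical), the second use of %phi below defeats the !Phi->hasOneUse() guard, so the i1 vp.merge stays narrow:

define i1 @no_widen_phi_two_uses(ptr %p, i64 %n) {
entry:
  br label %loop
loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %phi = phi <vscale x 4 x i1> [ zeroinitializer, %entry ], [ %rec, %loop ]
  %avl = sub i64 %n, %iv
  %evl = call i32 @llvm.experimental.get.vector.length(i64 %avl, i32 4, i1 true)
  %gep = getelementptr i32, ptr %p, i64 %iv
  %x = call <vscale x 4 x i32> @llvm.vp.load(ptr %gep, <vscale x 4 x i1> splat (i1 true), i32 %evl)
  %cmp = icmp ne <vscale x 4 x i32> %x, zeroinitializer
  ; Extra user of %phi: the phi now has two uses, so widenVPMerge bails out.
  %other = xor <vscale x 4 x i1> %phi, splat (i1 true)
  %rec = call <vscale x 4 x i1> @llvm.vp.merge(<vscale x 4 x i1> %cmp, <vscale x 4 x i1> %other, <vscale x 4 x i1> %phi, i32 %evl)
  %evl.zext = zext i32 %evl to i64
  %iv.next = add i64 %iv, %evl.zext
  %done = icmp sge i64 %iv.next, %n
  br i1 %done, label %exit, label %loop
exit:
  %res = call i1 @llvm.vector.reduce.or(<vscale x 4 x i1> %rec)
  ret i1 %res
}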

llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll

Lines changed: 99 additions & 1 deletion
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=riscv64 | FileCheck %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s
 
 
 ; Make sure we don't emit a pair of shift for the zext in the preheader. We
@@ -127,3 +127,101 @@ for.body: ; preds = %for.body, %for.body
   %niter.ncmp.1 = icmp eq i64 %niter.next.1, %unroll_iter
   br i1 %niter.ncmp.1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
 }
+
+define i1 @widen_anyof_rdx(ptr %p, i64 %n) {
+; CHECK-LABEL: widen_anyof_rdx:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    li a2, 0
+; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:  .LBB2_1: # %loop
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub a3, a1, a2
+; CHECK-NEXT:    slli a4, a2, 2
+; CHECK-NEXT:    vsetvli a3, a3, e32, m2, ta, ma
+; CHECK-NEXT:    add a4, a0, a4
+; CHECK-NEXT:    vle32.v v10, (a4)
+; CHECK-NEXT:    vmsne.vi v0, v10, 0
+; CHECK-NEXT:    add a2, a2, a3
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, tu, ma
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    blt a2, a1, .LBB2_1
+; CHECK-NEXT:  # %bb.2: # %exit
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vand.vi v8, v8, 1
+; CHECK-NEXT:    vmsne.vi v8, v8, 0
+; CHECK-NEXT:    vcpop.m a0, v8
+; CHECK-NEXT:    snez a0, a0
+; CHECK-NEXT:    ret
+entry:
+  br label %loop
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %phi = phi <vscale x 4 x i1> [ zeroinitializer, %entry ], [ %rec, %loop ]
+  %avl = sub i64 %n, %iv
+  %evl = call i32 @llvm.experimental.get.vector.length(i64 %avl, i32 4, i1 true)
+
+  %gep = getelementptr i32, ptr %p, i64 %iv
+  %x = call <vscale x 4 x i32> @llvm.vp.load(ptr %gep, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  %cmp = icmp ne <vscale x 4 x i32> %x, zeroinitializer
+  %rec = call <vscale x 4 x i1> @llvm.vp.merge(<vscale x 4 x i1> %cmp, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> %phi, i32 %evl)
+
+  %evl.zext = zext i32 %evl to i64
+  %iv.next = add i64 %iv, %evl.zext
+  %done = icmp sge i64 %iv.next, %n
+  br i1 %done, label %exit, label %loop
+exit:
+  %res = call i1 @llvm.vector.reduce.or(<vscale x 4 x i1> %rec)
+  ret i1 %res
+}
+
+
+define i1 @widen_anyof_rdx_use_in_loop(ptr %p, i64 %n) {
+; CHECK-LABEL: widen_anyof_rdx_use_in_loop:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    li a2, 0
+; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:  .LBB3_1: # %loop
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub a3, a1, a2
+; CHECK-NEXT:    slli a4, a2, 2
+; CHECK-NEXT:    vsetvli a3, a3, e32, m2, ta, ma
+; CHECK-NEXT:    add a4, a0, a4
+; CHECK-NEXT:    vle32.v v10, (a4)
+; CHECK-NEXT:    vmsne.vi v0, v10, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, tu, ma
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vand.vi v9, v8, 1
+; CHECK-NEXT:    vmsne.vi v9, v9, 0
+; CHECK-NEXT:    add a2, a2, a3
+; CHECK-NEXT:    vsm.v v9, (a4)
+; CHECK-NEXT:    blt a2, a1, .LBB3_1
+; CHECK-NEXT:  # %bb.2: # %exit
+; CHECK-NEXT:    vcpop.m a0, v9
+; CHECK-NEXT:    snez a0, a0
+; CHECK-NEXT:    ret
+entry:
+  br label %loop
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %phi = phi <vscale x 4 x i1> [ zeroinitializer, %entry ], [ %rec, %loop ]
+  %avl = sub i64 %n, %iv
+  %evl = call i32 @llvm.experimental.get.vector.length(i64 %avl, i32 4, i1 true)
+
+  %gep = getelementptr i32, ptr %p, i64 %iv
+  %x = call <vscale x 4 x i32> @llvm.vp.load(ptr %gep, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  %cmp = icmp ne <vscale x 4 x i32> %x, zeroinitializer
+  %rec = call <vscale x 4 x i1> @llvm.vp.merge(<vscale x 4 x i1> %cmp, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> %phi, i32 %evl)
+
+  store <vscale x 4 x i1> %rec, ptr %gep
+
+  %evl.zext = zext i32 %evl to i64
+  %iv.next = add i64 %iv, %evl.zext
+  %done = icmp sge i64 %iv.next, %n
+  br i1 %done, label %exit, label %loop
+exit:
+  %res = call i1 @llvm.vector.reduce.or(<vscale x 4 x i1> %rec)
+  ret i1 %res
+}
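
An aside on the checks above: the vand.vi/vmsne.vi pair at the reduction tail is how the trunc introduced by the widening lowers, since truncating i8 lanes to i1 keeps only the low bit of each lane. A minimal IR sketch of that equivalence (illustrative only, not part of the patch):

define <vscale x 4 x i1> @trunc_via_and_icmp(<vscale x 4 x i8> %v) {
  ; Computes the same result as: trunc <vscale x 4 x i8> %v to <vscale x 4 x i1>
  %low = and <vscale x 4 x i8> %v, splat (i8 1)
  %bit = icmp ne <vscale x 4 x i8> %low, zeroinitializer
  ret <vscale x 4 x i1> %bit
}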

llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll

Lines changed: 92 additions & 0 deletions
@@ -103,3 +103,95 @@ define i64 @bug(i32 %x) {
   %b = and i64 %a, 4294967295
   ret i64 %b
 }
+
+define i1 @widen_anyof_rdx(ptr %p, i64 %n) {
+; CHECK-LABEL: @widen_anyof_rdx(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <vscale x 4 x i8> [ zeroinitializer, [[ENTRY]] ], [ [[TMP1:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = sub i64 [[N:%.*]], [[IV]]
+; CHECK-NEXT:    [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[IV]]
+; CHECK-NEXT:    [[X:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[GEP]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne <vscale x 4 x i32> [[X]], zeroinitializer
+; CHECK-NEXT:    [[TMP1]] = call <vscale x 4 x i8> @llvm.vp.merge.nxv4i8(<vscale x 4 x i1> [[CMP]], <vscale x 4 x i8> splat (i8 1), <vscale x 4 x i8> [[TMP0]], i32 [[EVL]])
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc <vscale x 4 x i8> [[TMP1]] to <vscale x 4 x i1>
+; CHECK-NEXT:    [[EVL_ZEXT:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], [[EVL_ZEXT]]
+; CHECK-NEXT:    [[DONE:%.*]] = icmp sge i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[RES:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP4]])
+; CHECK-NEXT:    ret i1 [[RES]]
+;
+entry:
+  br label %loop
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %phi = phi <vscale x 4 x i1> [ zeroinitializer, %entry ], [ %rec, %loop ]
+  %avl = sub i64 %n, %iv
+  %evl = call i32 @llvm.experimental.get.vector.length(i64 %avl, i32 4, i1 true)
+
+  %gep = getelementptr i32, ptr %p, i64 %iv
+  %x = call <vscale x 4 x i32> @llvm.vp.load(ptr %gep, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  %cmp = icmp ne <vscale x 4 x i32> %x, zeroinitializer
+  %rec = call <vscale x 4 x i1> @llvm.vp.merge(<vscale x 4 x i1> %cmp, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> %phi, i32 %evl)
+
+  %evl.zext = zext i32 %evl to i64
+  %iv.next = add i64 %iv, %evl.zext
+  %done = icmp sge i64 %iv.next, %n
+  br i1 %done, label %exit, label %loop
+exit:
+  %res = call i1 @llvm.vector.reduce.or(<vscale x 4 x i1> %rec)
+  ret i1 %res
+}
+
+
+define i1 @widen_anyof_rdx_use_in_loop(ptr %p, i64 %n) {
+; CHECK-LABEL: @widen_anyof_rdx_use_in_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <vscale x 4 x i8> [ zeroinitializer, [[ENTRY]] ], [ [[TMP1:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = sub i64 [[N:%.*]], [[IV]]
+; CHECK-NEXT:    [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[IV]]
+; CHECK-NEXT:    [[X:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[GEP]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne <vscale x 4 x i32> [[X]], zeroinitializer
+; CHECK-NEXT:    [[TMP1]] = call <vscale x 4 x i8> @llvm.vp.merge.nxv4i8(<vscale x 4 x i1> [[CMP]], <vscale x 4 x i8> splat (i8 1), <vscale x 4 x i8> [[TMP0]], i32 [[EVL]])
+; CHECK-NEXT:    [[REC:%.*]] = trunc <vscale x 4 x i8> [[TMP1]] to <vscale x 4 x i1>
+; CHECK-NEXT:    store <vscale x 4 x i1> [[REC]], ptr [[GEP]], align 1
+; CHECK-NEXT:    [[EVL_ZEXT:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], [[EVL_ZEXT]]
+; CHECK-NEXT:    [[DONE:%.*]] = icmp sge i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[RES:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[REC]])
+; CHECK-NEXT:    ret i1 [[RES]]
+;
+entry:
+  br label %loop
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %phi = phi <vscale x 4 x i1> [ zeroinitializer, %entry ], [ %rec, %loop ]
+  %avl = sub i64 %n, %iv
+  %evl = call i32 @llvm.experimental.get.vector.length(i64 %avl, i32 4, i1 true)
+
+  %gep = getelementptr i32, ptr %p, i64 %iv
+  %x = call <vscale x 4 x i32> @llvm.vp.load(ptr %gep, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  %cmp = icmp ne <vscale x 4 x i32> %x, zeroinitializer
+  %rec = call <vscale x 4 x i1> @llvm.vp.merge(<vscale x 4 x i1> %cmp, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> %phi, i32 %evl)
+
+  store <vscale x 4 x i1> %rec, ptr %gep
+
+  %evl.zext = zext i32 %evl to i64
+  %iv.next = add i64 %iv, %evl.zext
+  %done = icmp sge i64 %iv.next, %n
+  br i1 %done, label %exit, label %loop
+exit:
+  %res = call i1 @llvm.vector.reduce.or(<vscale x 4 x i1> %rec)
+  ret i1 %res
+}
