Skip to content

Commit 8185a93

Browse files
committed
[RISCV] Add DAG combine for forming VAADDU_VL from VP intrinsics.
This adds a VP version of an existing DAG combine. I've put it in RISCV since we would need to add an ISD::VP_AVGCEIL opcode otherwise. This pattern appears in 525.x264_r.
1 parent a06c893 commit 8185a93

File tree

2 files changed

+276
-12
lines changed

2 files changed

+276
-12
lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 107 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1526,18 +1526,16 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
15261526
setTargetDAGCombine({ISD::ZERO_EXTEND, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
15271527
ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT});
15281528
if (Subtarget.hasVInstructions())
1529-
setTargetDAGCombine({ISD::FCOPYSIGN, ISD::MGATHER,
1530-
ISD::MSCATTER, ISD::VP_GATHER,
1531-
ISD::VP_SCATTER, ISD::SRA,
1532-
ISD::SRL, ISD::SHL,
1533-
ISD::STORE, ISD::SPLAT_VECTOR,
1534-
ISD::BUILD_VECTOR, ISD::CONCAT_VECTORS,
1535-
ISD::VP_STORE, ISD::EXPERIMENTAL_VP_REVERSE,
1536-
ISD::MUL, ISD::SDIV,
1537-
ISD::UDIV, ISD::SREM,
1538-
ISD::UREM, ISD::INSERT_VECTOR_ELT,
1539-
ISD::ABS, ISD::CTPOP,
1540-
ISD::VECTOR_SHUFFLE, ISD::VSELECT});
1529+
setTargetDAGCombine(
1530+
{ISD::FCOPYSIGN, ISD::MGATHER, ISD::MSCATTER,
1531+
ISD::VP_GATHER, ISD::VP_SCATTER, ISD::SRA,
1532+
ISD::SRL, ISD::SHL, ISD::STORE,
1533+
ISD::SPLAT_VECTOR, ISD::BUILD_VECTOR, ISD::CONCAT_VECTORS,
1534+
ISD::VP_STORE, ISD::VP_TRUNCATE, ISD::EXPERIMENTAL_VP_REVERSE,
1535+
ISD::MUL, ISD::SDIV, ISD::UDIV,
1536+
ISD::SREM, ISD::UREM, ISD::INSERT_VECTOR_ELT,
1537+
ISD::ABS, ISD::CTPOP, ISD::VECTOR_SHUFFLE,
1538+
ISD::VSELECT});
15411539

15421540
if (Subtarget.hasVendorXTHeadMemPair())
15431541
setTargetDAGCombine({ISD::LOAD, ISD::STORE});
@@ -16373,6 +16371,101 @@ static SDValue performVP_STORECombine(SDNode *N, SelectionDAG &DAG,
1637316371
VPStore->isTruncatingStore(), VPStore->isCompressingStore());
1637416372
}
1637516373

16374+
// Peephole avgceil pattern: fold a VP_TRUNCATE of
//   (vp_srl (vp_add (vp_add (vp_zext a), (vp_zext b)), splat 1), splat 1)
// into RISCVISD::AVGCEILU_VL (vaaddu with RNU rounding), when all VP nodes
// share the same mask and VL. In IR terms:
//   %1 = zext <N x i8> %a to <N x i32>
//   %2 = zext <N x i8> %b to <N x i32>
//   %3 = add nuw nsw <N x i32> %1, splat (i32 1)
//   %4 = add nuw nsw <N x i32> %3, %2
//   %5 = lshr <N x i32> %4, splat (i32 1)
//   %6 = trunc <N x i32> %5 to <N x i8>
static SDValue performVP_TRUNCATECombine(SDNode *N, SelectionDAG &DAG,
                                         const RISCVSubtarget &Subtarget) {
  EVT VT = N->getValueType(0);

  // Ignore fixed vectors.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!VT.isScalableVector() || !TLI.isTypeLegal(VT))
    return SDValue();

  SDValue In = N->getOperand(0);
  SDValue Mask = N->getOperand(1);
  SDValue VL = N->getOperand(2);

  // Input should be a vp_srl with same mask and VL.
  if (In.getOpcode() != ISD::VP_SRL || In.getOperand(2) != Mask ||
      In.getOperand(3) != VL)
    return SDValue();

  // Shift amount should be 1.
  if (!isOneOrOneSplat(In.getOperand(1)))
    return SDValue();

  // Shifted value should be a vp_add with same mask and VL.
  SDValue LHS = In.getOperand(0);
  if (LHS.getOpcode() != ISD::VP_ADD || LHS.getOperand(2) != Mask ||
      LHS.getOperand(3) != VL)
    return SDValue();

  SDValue Operands[3];
  Operands[0] = LHS.getOperand(0);
  Operands[1] = LHS.getOperand(1);

  // Matches another VP_ADD with same VL and Mask; on success returns its two
  // addends through Op0/Op1.
  auto FindAdd = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
    if (V.getOpcode() != ISD::VP_ADD || V.getOperand(2) != Mask ||
        V.getOperand(3) != VL)
      return false;

    Op0 = V.getOperand(0);
    Op1 = V.getOperand(1);
    return true;
  };

  // We need to find another VP_ADD in one of the operands. After this,
  // Operands holds the three leaf addends of the two chained additions.
  SDValue Op0, Op1;
  if (FindAdd(Operands[0], Op0, Op1))
    Operands[0] = Operands[1];
  else if (!FindAdd(Operands[1], Op0, Op1))
    return SDValue();
  Operands[2] = Op0;
  Operands[1] = Op1;

  // Now we have three operands of two additions. Check that one of them is a
  // constant vector with ones.
  auto I = llvm::find_if(Operands,
                         [](const SDValue &Op) { return isOneOrOneSplat(Op); });
  if (I == std::end(Operands))
    return SDValue();
  // We found a vector with ones, move it to the end of the Operands array.
  std::swap(Operands[I - std::begin(Operands)], Operands[2]);

  // Make sure the other 2 operands can be promoted from the result type: both
  // must be VP_ZERO_EXTENDs with the same mask/VL.
  for (int i = 0; i < 2; ++i) {
    if (Operands[i].getOpcode() != ISD::VP_ZERO_EXTEND ||
        Operands[i].getOperand(1) != Mask || Operands[i].getOperand(2) != VL)
      return SDValue();
    // The zext input must be no wider than our result (equal width is fine;
    // the rebuild step below is then skipped).
    if (Operands[i].getOperand(0).getScalarValueSizeInBits() >
        VT.getScalarSizeInBits())
      return SDValue();
  }

  // Pattern is detected.
  Op0 = Operands[0].getOperand(0);
  Op1 = Operands[1].getOperand(0);
  // Rebuild the zero extends if the inputs are smaller than our result.
  if (Op0.getValueType() != VT)
    Op0 =
        DAG.getNode(ISD::VP_ZERO_EXTEND, SDLoc(Operands[0]), VT, Op0, Mask, VL);
  if (Op1.getValueType() != VT)
    Op1 =
        DAG.getNode(ISD::VP_ZERO_EXTEND, SDLoc(Operands[1]), VT, Op1, Mask, VL);
  // Build a VAADDU with RNU rounding mode (round-to-nearest-up gives avgceil).
  SDLoc DL(N);
  return DAG.getNode(RISCVISD::AVGCEILU_VL, DL, VT,
                     {Op0, Op1, DAG.getUNDEF(VT), Mask, VL});
}
16468+
1637616469
// Convert from one FMA opcode to another based on whether we are negating the
1637716470
// multiply result and/or the accumulator.
1637816471
// NOTE: Only supports RVV operations with VL.
@@ -17930,6 +18023,8 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
1793018023
if (SDValue V = combineTruncOfSraSext(N, DAG))
1793118024
return V;
1793218025
return combineTruncToVnclip(N, DAG, Subtarget);
18026+
case ISD::VP_TRUNCATE:
18027+
return performVP_TRUNCATECombine(N, DAG, Subtarget);
1793318028
case ISD::TRUNCATE:
1793418029
return performTRUNCATECombine(N, DAG, Subtarget);
1793518030
case ISD::SELECT:
Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2+
; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s
3+
4+
declare <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i1>, i32)
5+
declare <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i1>, i32)
6+
declare <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i1>, i32)
7+
declare <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i1>, i32)
8+
declare <vscale x 2 x i16> @llvm.vp.trunc.nxv2i16.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i1>, i32)
9+
declare <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i1>, i32)
10+
declare <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
11+
declare <vscale x 2 x i16> @llvm.vp.lshr.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
12+
declare <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
13+
declare <vscale x 2 x i32> @llvm.vp.lshr.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
14+
15+
define <vscale x 2 x i8> @vaaddu_1(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
16+
; CHECK-LABEL: vaaddu_1:
17+
; CHECK: # %bb.0:
18+
; CHECK-NEXT: csrwi vxrm, 0
19+
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
20+
; CHECK-NEXT: vaaddu.vv v8, v8, v9, v0.t
21+
; CHECK-NEXT: ret
22+
%xz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
23+
%yz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
24+
%a = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %xz, <vscale x 2 x i16> %yz, <vscale x 2 x i1> %m, i32 %vl)
25+
%b = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %m, i32 %vl)
26+
%c = call <vscale x 2 x i16> @llvm.vp.lshr.nxv2i16(<vscale x 2 x i16> %b, <vscale x 2 x i16> shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %m, i32 %vl)
27+
%d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> %c, <vscale x 2 x i1> %m, i32 %vl)
28+
ret <vscale x 2 x i8> %d
29+
}
30+
31+
define <vscale x 2 x i8> @vaaddu_2(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
32+
; CHECK-LABEL: vaaddu_2:
33+
; CHECK: # %bb.0:
34+
; CHECK-NEXT: csrwi vxrm, 0
35+
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
36+
; CHECK-NEXT: vaaddu.vv v8, v8, v9, v0.t
37+
; CHECK-NEXT: ret
38+
%xz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
39+
%yz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
40+
%a = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %xz, <vscale x 2 x i16> %yz, <vscale x 2 x i1> %m, i32 %vl)
41+
%b = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i16> %a, <vscale x 2 x i1> %m, i32 %vl)
42+
%c = call <vscale x 2 x i16> @llvm.vp.lshr.nxv2i16(<vscale x 2 x i16> %b, <vscale x 2 x i16> shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %m, i32 %vl)
43+
%d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> %c, <vscale x 2 x i1> %m, i32 %vl)
44+
ret <vscale x 2 x i8> %d
45+
}
46+
47+
define <vscale x 2 x i8> @vaaddu_3(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
48+
; CHECK-LABEL: vaaddu_3:
49+
; CHECK: # %bb.0:
50+
; CHECK-NEXT: csrwi vxrm, 0
51+
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
52+
; CHECK-NEXT: vaaddu.vv v8, v9, v8, v0.t
53+
; CHECK-NEXT: ret
54+
%xz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
55+
%yz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
56+
%a = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %xz, <vscale x 2 x i16> shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %m, i32 %vl)
57+
%b = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %yz, <vscale x 2 x i1> %m, i32 %vl)
58+
%c = call <vscale x 2 x i16> @llvm.vp.lshr.nxv2i16(<vscale x 2 x i16> %b, <vscale x 2 x i16> shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %m, i32 %vl)
59+
%d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> %c, <vscale x 2 x i1> %m, i32 %vl)
60+
ret <vscale x 2 x i8> %d
61+
}
62+
63+
define <vscale x 2 x i8> @vaaddu_4(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
64+
; CHECK-LABEL: vaaddu_4:
65+
; CHECK: # %bb.0:
66+
; CHECK-NEXT: csrwi vxrm, 0
67+
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
68+
; CHECK-NEXT: vaaddu.vv v8, v9, v8, v0.t
69+
; CHECK-NEXT: ret
70+
%xz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
71+
%yz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
72+
%a = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %xz, <vscale x 2 x i16> shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %m, i32 %vl)
73+
%b = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %yz, <vscale x 2 x i16> %a, <vscale x 2 x i1> %m, i32 %vl)
74+
%c = call <vscale x 2 x i16> @llvm.vp.lshr.nxv2i16(<vscale x 2 x i16> %b, <vscale x 2 x i16> shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %m, i32 %vl)
75+
%d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> %c, <vscale x 2 x i1> %m, i32 %vl)
76+
ret <vscale x 2 x i8> %d
77+
}
78+
79+
define <vscale x 2 x i8> @vaaddu_5(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
80+
; CHECK-LABEL: vaaddu_5:
81+
; CHECK: # %bb.0:
82+
; CHECK-NEXT: csrwi vxrm, 0
83+
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
84+
; CHECK-NEXT: vaaddu.vv v8, v9, v8, v0.t
85+
; CHECK-NEXT: ret
86+
%xz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
87+
%yz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
88+
%a = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i16> %xz, <vscale x 2 x i1> %m, i32 %vl)
89+
%b = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %yz, <vscale x 2 x i1> %m, i32 %vl)
90+
%c = call <vscale x 2 x i16> @llvm.vp.lshr.nxv2i16(<vscale x 2 x i16> %b, <vscale x 2 x i16> shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %m, i32 %vl)
91+
%d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> %c, <vscale x 2 x i1> %m, i32 %vl)
92+
ret <vscale x 2 x i8> %d
93+
}
94+
95+
define <vscale x 2 x i8> @vaaddu_6(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
96+
; CHECK-LABEL: vaaddu_6:
97+
; CHECK: # %bb.0:
98+
; CHECK-NEXT: csrwi vxrm, 0
99+
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
100+
; CHECK-NEXT: vaaddu.vv v8, v9, v8, v0.t
101+
; CHECK-NEXT: ret
102+
%xz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
103+
%yz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
104+
%a = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i16> %xz, <vscale x 2 x i1> %m, i32 %vl)
105+
%b = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %yz, <vscale x 2 x i16> %a, <vscale x 2 x i1> %m, i32 %vl)
106+
%c = call <vscale x 2 x i16> @llvm.vp.lshr.nxv2i16(<vscale x 2 x i16> %b, <vscale x 2 x i16> shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %m, i32 %vl)
107+
%d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> %c, <vscale x 2 x i1> %m, i32 %vl)
108+
ret <vscale x 2 x i8> %d
109+
}
110+
111+
; Test where the size is reduced by 4x instead of 2x.
112+
define <vscale x 2 x i8> @vaaddu_7(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
113+
; CHECK-LABEL: vaaddu_7:
114+
; CHECK: # %bb.0:
115+
; CHECK-NEXT: csrwi vxrm, 0
116+
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
117+
; CHECK-NEXT: vaaddu.vv v8, v8, v9, v0.t
118+
; CHECK-NEXT: ret
119+
%xz = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
120+
%yz = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
121+
%a = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %xz, <vscale x 2 x i32> %yz, <vscale x 2 x i1> %m, i32 %vl)
122+
%b = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %m, i32 %vl)
123+
%c = call <vscale x 2 x i32> @llvm.vp.lshr.nxv2i32(<vscale x 2 x i32> %b, <vscale x 2 x i32> shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %m, i32 %vl)
124+
%d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i32(<vscale x 2 x i32> %c, <vscale x 2 x i1> %m, i32 %vl)
125+
ret <vscale x 2 x i8> %d
126+
}
127+
128+
; Test where the zext can't be completely removed.
129+
define <vscale x 2 x i16> @vaaddu_8(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
130+
; CHECK-LABEL: vaaddu_8:
131+
; CHECK: # %bb.0:
132+
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
133+
; CHECK-NEXT: vzext.vf2 v10, v8, v0.t
134+
; CHECK-NEXT: csrwi vxrm, 0
135+
; CHECK-NEXT: vzext.vf2 v8, v9, v0.t
136+
; CHECK-NEXT: vaaddu.vv v8, v10, v8, v0.t
137+
; CHECK-NEXT: ret
138+
%xz = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
139+
%yz = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
140+
%a = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %xz, <vscale x 2 x i32> %yz, <vscale x 2 x i1> %m, i32 %vl)
141+
%b = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %m, i32 %vl)
142+
%c = call <vscale x 2 x i32> @llvm.vp.lshr.nxv2i32(<vscale x 2 x i32> %b, <vscale x 2 x i32> shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %m, i32 %vl)
143+
%d = call <vscale x 2 x i16> @llvm.vp.trunc.nxv2i16.nxv2i32(<vscale x 2 x i32> %c, <vscale x 2 x i1> %m, i32 %vl)
144+
ret <vscale x 2 x i16> %d
145+
}
146+
147+
; Negative test. The truncate has a smaller type than the zero extend.
148+
; TODO: Could still handle this by truncating after an i16 vaaddu.
149+
define <vscale x 2 x i8> @vaaddu_9(<vscale x 2 x i16> %x, <vscale x 2 x i16> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
150+
; CHECK-LABEL: vaaddu_9:
151+
; CHECK: # %bb.0:
152+
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
153+
; CHECK-NEXT: vwaddu.vv v10, v8, v9, v0.t
154+
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
155+
; CHECK-NEXT: vadd.vi v8, v10, 1, v0.t
156+
; CHECK-NEXT: vsrl.vi v8, v8, 1, v0.t
157+
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
158+
; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t
159+
; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
160+
; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t
161+
; CHECK-NEXT: ret
162+
%xz = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i16(<vscale x 2 x i16> %x, <vscale x 2 x i1> %m, i32 %vl)
163+
%yz = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i16(<vscale x 2 x i16> %y, <vscale x 2 x i1> %m, i32 %vl)
164+
%a = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %xz, <vscale x 2 x i32> %yz, <vscale x 2 x i1> %m, i32 %vl)
165+
%b = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %m, i32 %vl)
166+
%c = call <vscale x 2 x i32> @llvm.vp.lshr.nxv2i32(<vscale x 2 x i32> %b, <vscale x 2 x i32> shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %m, i32 %vl)
167+
%d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i32(<vscale x 2 x i32> %c, <vscale x 2 x i1> %m, i32 %vl)
168+
ret <vscale x 2 x i8> %d
169+
}

0 commit comments

Comments
 (0)