
Commit 8fac33d

Check for non-legal types for the wide add instructions
Check for non-legal types in the wide add lowering so that operations whose types require promotion or splitting fall through to the default case.
1 parent 9cb5482 commit 8fac33d
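
The check added below accepts only three input/accumulator pairings: nxv4i32 into nxv2i64, nxv8i16 into nxv4i32, and nxv16i8 into nxv8i16. As a hypothetical illustration (written in the style of the tests below, not part of this commit), a partial reduction that uses one of those pairings still qualifies for the wide-add lowering:

define <vscale x 4 x i32> @legal_wide_add_nxv8i16(<vscale x 4 x i32> %acc, <vscale x 8 x i16> %input) {
entry:
  ; An nxv8i16 input accumulated into nxv4i32: one of the three pairings
  ; the new check accepts, so no promotion or splitting is required.
  %input.wide = sext <vscale x 8 x i16> %input to <vscale x 8 x i32>
  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32(<vscale x 4 x i32> %acc, <vscale x 8 x i32> %input.wide)
  ret <vscale x 4 x i32> %partial.reduce
}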

File tree: 2 files changed, +72 −0 lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 5 additions & 0 deletions
@@ -21818,6 +21818,11 @@ SDValue tryLowerPartialReductionToWideAdd(SDNode *N,
   auto Input = ExtInput->getOperand(0);
   EVT InputVT = Input.getValueType();
 
+  if (!(InputVT == MVT::nxv4i32 && AccVT == MVT::nxv2i64) &&
+      !(InputVT == MVT::nxv8i16 && AccVT == MVT::nxv4i32) &&
+      !(InputVT == MVT::nxv16i8 && AccVT == MVT::nxv8i16))
+    return SDValue();
+
   // To do this transformation, output element size needs to be double input
   // element size, and output number of elements needs to be half the input
   // number of elements
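
The three accepted pairings are exactly the extending wide adds whose input and accumulator are both already legal SVE vector types, with the accumulator element twice as wide and the element count halved. A minimal sketch of the same rule as a standalone predicate (a hypothetical helper for illustration; the commit inlines the condition):

// Hypothetical helper mirroring the inlined guard above: accept only
// pairings of legal SVE types where the accumulator element is double
// the input element width and the element count is halved.
static bool isWideAddLegalPairing(EVT InputVT, EVT AccVT) {
  return (InputVT == MVT::nxv4i32 && AccVT == MVT::nxv2i64) ||
         (InputVT == MVT::nxv8i16 && AccVT == MVT::nxv4i32) ||
         (InputVT == MVT::nxv16i8 && AccVT == MVT::nxv8i16);
}

Any other pairing, for example nxv4i16 inputs (which need promotion) or nxv8i32 inputs (which need splitting), now returns SDValue() so the combine falls through to the default lowering exercised by the tests below.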

llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll

Lines changed: 67 additions & 0 deletions
@@ -72,3 +72,70 @@ entry:
   %partial.reduce = tail call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i16(<vscale x 8 x i16> %acc, <vscale x 16 x i16> %input.wide)
   ret <vscale x 8 x i16> %partial.reduce
 }
+
+define <vscale x 2 x i32> @signed_wide_add_nxv4i16(<vscale x 2 x i32> %acc, <vscale x 4 x i16> %input){
+; CHECK-LABEL: signed_wide_add_nxv4i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    sxth z1.s, p0/m, z1.s
+; CHECK-NEXT:    uunpklo z2.d, z1.s
+; CHECK-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-NEXT:    add z0.d, z0.d, z2.d
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ret
+entry:
+  %input.wide = sext <vscale x 4 x i16> %input to <vscale x 4 x i32>
+  %partial.reduce = tail call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv4i32(<vscale x 2 x i32> %acc, <vscale x 4 x i32> %input.wide)
+  ret <vscale x 2 x i32> %partial.reduce
+}
+
+define <vscale x 2 x i32> @unsigned_wide_add_nxv4i16(<vscale x 2 x i32> %acc, <vscale x 4 x i16> %input){
+; CHECK-LABEL: unsigned_wide_add_nxv4i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and z1.s, z1.s, #0xffff
+; CHECK-NEXT:    uunpklo z2.d, z1.s
+; CHECK-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-NEXT:    add z0.d, z0.d, z2.d
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ret
+entry:
+  %input.wide = zext <vscale x 4 x i16> %input to <vscale x 4 x i32>
+  %partial.reduce = tail call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv4i32(<vscale x 2 x i32> %acc, <vscale x 4 x i32> %input.wide)
+  ret <vscale x 2 x i32> %partial.reduce
+}
+
+define <vscale x 4 x i64> @signed_wide_add_nxv8i32(<vscale x 4 x i64> %acc, <vscale x 8 x i32> %input){
+; CHECK-LABEL: signed_wide_add_nxv8i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sunpkhi z4.d, z2.s
+; CHECK-NEXT:    sunpklo z2.d, z2.s
+; CHECK-NEXT:    sunpkhi z5.d, z3.s
+; CHECK-NEXT:    sunpklo z3.d, z3.s
+; CHECK-NEXT:    add z0.d, z0.d, z2.d
+; CHECK-NEXT:    add z1.d, z1.d, z4.d
+; CHECK-NEXT:    add z0.d, z3.d, z0.d
+; CHECK-NEXT:    add z1.d, z5.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %input.wide = sext <vscale x 8 x i32> %input to <vscale x 8 x i64>
+  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv8i64(<vscale x 4 x i64> %acc, <vscale x 8 x i64> %input.wide)
+  ret <vscale x 4 x i64> %partial.reduce
+}
+
+define <vscale x 4 x i64> @unsigned_wide_add_nxv8i32(<vscale x 4 x i64> %acc, <vscale x 8 x i32> %input){
+; CHECK-LABEL: unsigned_wide_add_nxv8i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uunpkhi z4.d, z2.s
+; CHECK-NEXT:    uunpklo z2.d, z2.s
+; CHECK-NEXT:    uunpkhi z5.d, z3.s
+; CHECK-NEXT:    uunpklo z3.d, z3.s
+; CHECK-NEXT:    add z0.d, z0.d, z2.d
+; CHECK-NEXT:    add z1.d, z1.d, z4.d
+; CHECK-NEXT:    add z0.d, z3.d, z0.d
+; CHECK-NEXT:    add z1.d, z5.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %input.wide = zext <vscale x 8 x i32> %input to <vscale x 8 x i64>
+  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv8i64(<vscale x 4 x i64> %acc, <vscale x 8 x i64> %input.wide)
+  ret <vscale x 4 x i64> %partial.reduce
+}
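
Each new test uses a pairing the guard rejects: nxv4i16 inputs with an nxv2i32 accumulator (types that need promotion) and nxv8i32 inputs with an nxv4i64 accumulator (types that need splitting). The CHECK lines pin down the generic lowering of extend, unpack, and add, confirming these cases now take the default path instead of the wide-add combine.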
