Skip to content

Commit 7c9aa14

Browse files
authored
[AArch64] Use SVE instructions for fixed-width smulh/umulh. (#166168)
Like v2i64 mul and operations like divide, we should be able to use the SVE umulh and smulh instructions with 128-bit vectors, provided that we have SVE/SVE2. There are a number of other instructions that should presumably be treated the same way.
1 parent a38f847 commit 7c9aa14

File tree

3 files changed

+148
-127
lines changed

3 files changed

+148
-127
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1842,11 +1842,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
18421842
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
18431843
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
18441844

1845-
// NEON doesn't support integer divides, but SVE does
1845+
// A number of operations like MULH and integer divides are not supported by
1846+
// NEON but are available in SVE.
18461847
for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
18471848
MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
18481849
setOperationAction(ISD::SDIV, VT, Custom);
18491850
setOperationAction(ISD::UDIV, VT, Custom);
1851+
setOperationAction(ISD::MULHS, VT, Custom);
1852+
setOperationAction(ISD::MULHU, VT, Custom);
18501853
}
18511854

18521855
// NEON doesn't support 64-bit vector integer muls, but SVE does.
@@ -1883,10 +1886,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
18831886
setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
18841887
setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
18851888
setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1886-
setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1887-
setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1888-
setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1889-
setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
18901889
setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
18911890
setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
18921891
setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
@@ -1908,8 +1907,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
19081907
setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
19091908
setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
19101909
setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1911-
setOperationAction(ISD::MULHS, VT, Custom);
1912-
setOperationAction(ISD::MULHU, VT, Custom);
19131910
}
19141911

19151912
// Use SVE for vectors with more than 2 elements.

llvm/test/CodeGen/AArch64/sve-int-mulh-pred.ll

Lines changed: 80 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -127,9 +127,11 @@ define <vscale x 2 x i64> @umulh_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %
127127
define <16 x i8> @smulh_v16i8(<16 x i8> %a, <16 x i8> %b) {
128128
; CHECK-LABEL: smulh_v16i8:
129129
; CHECK: // %bb.0:
130-
; CHECK-NEXT: smull2 v2.8h, v0.16b, v1.16b
131-
; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
132-
; CHECK-NEXT: uzp2 v0.16b, v0.16b, v2.16b
130+
; CHECK-NEXT: ptrue p0.b, vl16
131+
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
132+
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
133+
; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b
134+
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
133135
; CHECK-NEXT: ret
134136
%1 = sext <16 x i8> %a to <16 x i16>
135137
%2 = sext <16 x i8> %b to <16 x i16>
@@ -142,9 +144,11 @@ define <16 x i8> @smulh_v16i8(<16 x i8> %a, <16 x i8> %b) {
142144
define <8 x i16> @smulh_v8i16(<8 x i16> %a, <8 x i16> %b) {
143145
; CHECK-LABEL: smulh_v8i16:
144146
; CHECK: // %bb.0:
145-
; CHECK-NEXT: smull2 v2.4s, v0.8h, v1.8h
146-
; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
147-
; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h
147+
; CHECK-NEXT: ptrue p0.h, vl8
148+
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
149+
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
150+
; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h
151+
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
148152
; CHECK-NEXT: ret
149153
%1 = sext <8 x i16> %a to <8 x i32>
150154
%2 = sext <8 x i16> %b to <8 x i32>
@@ -157,9 +161,11 @@ define <8 x i16> @smulh_v8i16(<8 x i16> %a, <8 x i16> %b) {
157161
define <4 x i32> @smulh_v4i32(<4 x i32> %a, <4 x i32> %b) {
158162
; CHECK-LABEL: smulh_v4i32:
159163
; CHECK: // %bb.0:
160-
; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s
161-
; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
162-
; CHECK-NEXT: uzp2 v0.4s, v0.4s, v2.4s
164+
; CHECK-NEXT: ptrue p0.s, vl4
165+
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
166+
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
167+
; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s
168+
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
163169
; CHECK-NEXT: ret
164170
%1 = sext <4 x i32> %a to <4 x i64>
165171
%2 = sext <4 x i32> %b to <4 x i64>
@@ -172,15 +178,11 @@ define <4 x i32> @smulh_v4i32(<4 x i32> %a, <4 x i32> %b) {
172178
define <2 x i64> @smulh_v2i64(<2 x i64> %a, <2 x i64> %b) {
173179
; CHECK-LABEL: smulh_v2i64:
174180
; CHECK: // %bb.0:
175-
; CHECK-NEXT: mov x8, v0.d[1]
176-
; CHECK-NEXT: mov x9, v1.d[1]
177-
; CHECK-NEXT: fmov x10, d0
178-
; CHECK-NEXT: fmov x11, d1
179-
; CHECK-NEXT: smulh x10, x10, x11
180-
; CHECK-NEXT: smulh x8, x8, x9
181-
; CHECK-NEXT: fmov d0, x10
182-
; CHECK-NEXT: fmov d1, x8
183-
; CHECK-NEXT: mov v0.d[1], v1.d[0]
181+
; CHECK-NEXT: ptrue p0.d, vl2
182+
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
183+
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
184+
; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d
185+
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
184186
; CHECK-NEXT: ret
185187
%1 = sext <2 x i64> %a to <2 x i128>
186188
%2 = sext <2 x i64> %b to <2 x i128>
@@ -193,9 +195,11 @@ define <2 x i64> @smulh_v2i64(<2 x i64> %a, <2 x i64> %b) {
193195
define <16 x i8> @umulh_v16i8(<16 x i8> %a, <16 x i8> %b) {
194196
; CHECK-LABEL: umulh_v16i8:
195197
; CHECK: // %bb.0:
196-
; CHECK-NEXT: umull2 v2.8h, v0.16b, v1.16b
197-
; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
198-
; CHECK-NEXT: uzp2 v0.16b, v0.16b, v2.16b
198+
; CHECK-NEXT: ptrue p0.b, vl16
199+
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
200+
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
201+
; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b
202+
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
199203
; CHECK-NEXT: ret
200204
%1 = zext <16 x i8> %a to <16 x i16>
201205
%2 = zext <16 x i8> %b to <16 x i16>
@@ -208,9 +212,11 @@ define <16 x i8> @umulh_v16i8(<16 x i8> %a, <16 x i8> %b) {
208212
define <8 x i16> @umulh_v8i16(<8 x i16> %a, <8 x i16> %b) {
209213
; CHECK-LABEL: umulh_v8i16:
210214
; CHECK: // %bb.0:
211-
; CHECK-NEXT: umull2 v2.4s, v0.8h, v1.8h
212-
; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
213-
; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h
215+
; CHECK-NEXT: ptrue p0.h, vl8
216+
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
217+
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
218+
; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h
219+
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
214220
; CHECK-NEXT: ret
215221
%1 = zext <8 x i16> %a to <8 x i32>
216222
%2 = zext <8 x i16> %b to <8 x i32>
@@ -223,9 +229,11 @@ define <8 x i16> @umulh_v8i16(<8 x i16> %a, <8 x i16> %b) {
223229
define <4 x i32> @umulh_v4i32(<4 x i32> %a, <4 x i32> %b) {
224230
; CHECK-LABEL: umulh_v4i32:
225231
; CHECK: // %bb.0:
226-
; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s
227-
; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
228-
; CHECK-NEXT: uzp2 v0.4s, v0.4s, v2.4s
232+
; CHECK-NEXT: ptrue p0.s, vl4
233+
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
234+
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
235+
; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s
236+
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
229237
; CHECK-NEXT: ret
230238
%1 = zext <4 x i32> %a to <4 x i64>
231239
%2 = zext <4 x i32> %b to <4 x i64>
@@ -238,15 +246,11 @@ define <4 x i32> @umulh_v4i32(<4 x i32> %a, <4 x i32> %b) {
238246
define <2 x i64> @umulh_v2i64(<2 x i64> %a, <2 x i64> %b) {
239247
; CHECK-LABEL: umulh_v2i64:
240248
; CHECK: // %bb.0:
241-
; CHECK-NEXT: mov x8, v0.d[1]
242-
; CHECK-NEXT: mov x9, v1.d[1]
243-
; CHECK-NEXT: fmov x10, d0
244-
; CHECK-NEXT: fmov x11, d1
245-
; CHECK-NEXT: umulh x10, x10, x11
246-
; CHECK-NEXT: umulh x8, x8, x9
247-
; CHECK-NEXT: fmov d0, x10
248-
; CHECK-NEXT: fmov d1, x8
249-
; CHECK-NEXT: mov v0.d[1], v1.d[0]
249+
; CHECK-NEXT: ptrue p0.d, vl2
250+
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
251+
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
252+
; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d
253+
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
250254
; CHECK-NEXT: ret
251255
%1 = zext <2 x i64> %a to <2 x i128>
252256
%2 = zext <2 x i64> %b to <2 x i128>
@@ -263,8 +267,11 @@ define <2 x i64> @umulh_v2i64(<2 x i64> %a, <2 x i64> %b) {
263267
define <8 x i8> @smulh_v8i8(<8 x i8> %a, <8 x i8> %b) {
264268
; CHECK-LABEL: smulh_v8i8:
265269
; CHECK: // %bb.0:
266-
; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
267-
; CHECK-NEXT: shrn v0.8b, v0.8h, #8
270+
; CHECK-NEXT: ptrue p0.b, vl8
271+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
272+
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
273+
; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b
274+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
268275
; CHECK-NEXT: ret
269276
%1 = sext <8 x i8> %a to <8 x i16>
270277
%2 = sext <8 x i8> %b to <8 x i16>
@@ -277,8 +284,11 @@ define <8 x i8> @smulh_v8i8(<8 x i8> %a, <8 x i8> %b) {
277284
define <4 x i16> @smulh_v4i16(<4 x i16> %a, <4 x i16> %b) {
278285
; CHECK-LABEL: smulh_v4i16:
279286
; CHECK: // %bb.0:
280-
; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
281-
; CHECK-NEXT: shrn v0.4h, v0.4s, #16
287+
; CHECK-NEXT: ptrue p0.h, vl4
288+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
289+
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
290+
; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h
291+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
282292
; CHECK-NEXT: ret
283293
%1 = sext <4 x i16> %a to <4 x i32>
284294
%2 = sext <4 x i16> %b to <4 x i32>
@@ -291,8 +301,11 @@ define <4 x i16> @smulh_v4i16(<4 x i16> %a, <4 x i16> %b) {
291301
define <2 x i32> @smulh_v2i32(<2 x i32> %a, <2 x i32> %b) {
292302
; CHECK-LABEL: smulh_v2i32:
293303
; CHECK: // %bb.0:
294-
; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
295-
; CHECK-NEXT: shrn v0.2s, v0.2d, #32
304+
; CHECK-NEXT: ptrue p0.s, vl2
305+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
306+
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
307+
; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s
308+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
296309
; CHECK-NEXT: ret
297310
%1 = sext <2 x i32> %a to <2 x i64>
298311
%2 = sext <2 x i32> %b to <2 x i64>
@@ -305,12 +318,11 @@ define <2 x i32> @smulh_v2i32(<2 x i32> %a, <2 x i32> %b) {
305318
define <1 x i64> @smulh_v1i64(<1 x i64> %a, <1 x i64> %b) {
306319
; CHECK-LABEL: smulh_v1i64:
307320
; CHECK: // %bb.0:
308-
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
309-
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
310-
; CHECK-NEXT: fmov x8, d0
311-
; CHECK-NEXT: fmov x9, d1
312-
; CHECK-NEXT: smulh x8, x8, x9
313-
; CHECK-NEXT: fmov d0, x8
321+
; CHECK-NEXT: ptrue p0.d, vl1
322+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
323+
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
324+
; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d
325+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
314326
; CHECK-NEXT: ret
315327
%1 = sext <1 x i64> %a to <1 x i128>
316328
%2 = sext <1 x i64> %b to <1 x i128>
@@ -323,8 +335,11 @@ define <1 x i64> @smulh_v1i64(<1 x i64> %a, <1 x i64> %b) {
323335
define <8 x i8> @umulh_v8i8(<8 x i8> %a, <8 x i8> %b) {
324336
; CHECK-LABEL: umulh_v8i8:
325337
; CHECK: // %bb.0:
326-
; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
327-
; CHECK-NEXT: shrn v0.8b, v0.8h, #8
338+
; CHECK-NEXT: ptrue p0.b, vl8
339+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
340+
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
341+
; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b
342+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
328343
; CHECK-NEXT: ret
329344
%1 = zext <8 x i8> %a to <8 x i16>
330345
%2 = zext <8 x i8> %b to <8 x i16>
@@ -337,8 +352,11 @@ define <8 x i8> @umulh_v8i8(<8 x i8> %a, <8 x i8> %b) {
337352
define <4 x i16> @umulh_v4i16(<4 x i16> %a, <4 x i16> %b) {
338353
; CHECK-LABEL: umulh_v4i16:
339354
; CHECK: // %bb.0:
340-
; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
341-
; CHECK-NEXT: shrn v0.4h, v0.4s, #16
355+
; CHECK-NEXT: ptrue p0.h, vl4
356+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
357+
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
358+
; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h
359+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
342360
; CHECK-NEXT: ret
343361
%1 = zext <4 x i16> %a to <4 x i32>
344362
%2 = zext <4 x i16> %b to <4 x i32>
@@ -351,8 +369,11 @@ define <4 x i16> @umulh_v4i16(<4 x i16> %a, <4 x i16> %b) {
351369
define <2 x i32> @umulh_v2i32(<2 x i32> %a, <2 x i32> %b) {
352370
; CHECK-LABEL: umulh_v2i32:
353371
; CHECK: // %bb.0:
354-
; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
355-
; CHECK-NEXT: shrn v0.2s, v0.2d, #32
372+
; CHECK-NEXT: ptrue p0.s, vl2
373+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
374+
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
375+
; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s
376+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
356377
; CHECK-NEXT: ret
357378
%1 = zext <2 x i32> %a to <2 x i64>
358379
%2 = zext <2 x i32> %b to <2 x i64>
@@ -365,12 +386,11 @@ define <2 x i32> @umulh_v2i32(<2 x i32> %a, <2 x i32> %b) {
365386
define <1 x i64> @umulh_v1i64(<1 x i64> %a, <1 x i64> %b) {
366387
; CHECK-LABEL: umulh_v1i64:
367388
; CHECK: // %bb.0:
368-
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
369-
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
370-
; CHECK-NEXT: fmov x8, d0
371-
; CHECK-NEXT: fmov x9, d1
372-
; CHECK-NEXT: umulh x8, x8, x9
373-
; CHECK-NEXT: fmov d0, x8
389+
; CHECK-NEXT: ptrue p0.d, vl1
390+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
391+
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
392+
; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d
393+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
374394
; CHECK-NEXT: ret
375395
%1 = zext <1 x i64> %a to <1 x i128>
376396
%2 = zext <1 x i64> %b to <1 x i128>

0 commit comments

Comments
 (0)