; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s

;
; SMULH
;

8- define <vscale x 16 x i8 > @smulh_i8 (<vscale x 16 x i8 > %a , <vscale x 16 x i8 > %b ) # 0 {
8+ define <vscale x 16 x i8 > @smulh_i8 (<vscale x 16 x i8 > %a , <vscale x 16 x i8 > %b ) {
99; CHECK-LABEL: smulh_i8:
1010; CHECK: // %bb.0:
1111; CHECK-NEXT: ptrue p0.b
@@ -19,7 +19,7 @@ define <vscale x 16 x i8> @smulh_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b
1919 ret <vscale x 16 x i8 > %tr
2020}
2121
22- define <vscale x 8 x i16 > @smulh_i16 (<vscale x 8 x i16 > %a , <vscale x 8 x i16 > %b ) # 0 {
22+ define <vscale x 8 x i16 > @smulh_i16 (<vscale x 8 x i16 > %a , <vscale x 8 x i16 > %b ) {
2323; CHECK-LABEL: smulh_i16:
2424; CHECK: // %bb.0:
2525; CHECK-NEXT: ptrue p0.h
@@ -33,7 +33,7 @@ define <vscale x 8 x i16> @smulh_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %
3333 ret <vscale x 8 x i16 > %tr
3434}
3535
36- define <vscale x 4 x i32 > @smulh_i32 (<vscale x 4 x i32 > %a , <vscale x 4 x i32 > %b ) # 0 {
36+ define <vscale x 4 x i32 > @smulh_i32 (<vscale x 4 x i32 > %a , <vscale x 4 x i32 > %b ) {
3737; CHECK-LABEL: smulh_i32:
3838; CHECK: // %bb.0:
3939; CHECK-NEXT: ptrue p0.s
@@ -47,7 +47,7 @@ define <vscale x 4 x i32> @smulh_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %
4747 ret <vscale x 4 x i32 > %tr
4848}
4949
50- define <vscale x 2 x i64 > @smulh_i64 (<vscale x 2 x i64 > %a , <vscale x 2 x i64 > %b ) # 0 {
50+ define <vscale x 2 x i64 > @smulh_i64 (<vscale x 2 x i64 > %a , <vscale x 2 x i64 > %b ) {
5151; CHECK-LABEL: smulh_i64:
5252; CHECK: // %bb.0:
5353; CHECK-NEXT: ptrue p0.d
@@ -65,7 +65,7 @@ define <vscale x 2 x i64> @smulh_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %
; UMULH
;

68- define <vscale x 16 x i8 > @umulh_i8 (<vscale x 16 x i8 > %a , <vscale x 16 x i8 > %b ) # 0 {
68+ define <vscale x 16 x i8 > @umulh_i8 (<vscale x 16 x i8 > %a , <vscale x 16 x i8 > %b ) {
6969; CHECK-LABEL: umulh_i8:
7070; CHECK: // %bb.0:
7171; CHECK-NEXT: ptrue p0.b
@@ -79,7 +79,7 @@ define <vscale x 16 x i8> @umulh_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b
7979 ret <vscale x 16 x i8 > %tr
8080}
8181
82- define <vscale x 8 x i16 > @umulh_i16 (<vscale x 8 x i16 > %a , <vscale x 8 x i16 > %b ) # 0 {
82+ define <vscale x 8 x i16 > @umulh_i16 (<vscale x 8 x i16 > %a , <vscale x 8 x i16 > %b ) {
8383; CHECK-LABEL: umulh_i16:
8484; CHECK: // %bb.0:
8585; CHECK-NEXT: ptrue p0.h
@@ -93,7 +93,7 @@ define <vscale x 8 x i16> @umulh_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %
9393 ret <vscale x 8 x i16 > %tr
9494}
9595
96- define <vscale x 4 x i32 > @umulh_i32 (<vscale x 4 x i32 > %a , <vscale x 4 x i32 > %b ) # 0 {
96+ define <vscale x 4 x i32 > @umulh_i32 (<vscale x 4 x i32 > %a , <vscale x 4 x i32 > %b ) {
9797; CHECK-LABEL: umulh_i32:
9898; CHECK: // %bb.0:
9999; CHECK-NEXT: ptrue p0.s
@@ -107,7 +107,7 @@ define <vscale x 4 x i32> @umulh_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %
107107 ret <vscale x 4 x i32 > %tr
108108}
109109
110- define <vscale x 2 x i64 > @umulh_i64 (<vscale x 2 x i64 > %a , <vscale x 2 x i64 > %b ) # 0 {
110+ define <vscale x 2 x i64 > @umulh_i64 (<vscale x 2 x i64 > %a , <vscale x 2 x i64 > %b ) {
111111; CHECK-LABEL: umulh_i64:
112112; CHECK: // %bb.0:
113113; CHECK-NEXT: ptrue p0.d
@@ -121,4 +121,262 @@ define <vscale x 2 x i64> @umulh_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %
121121 ret <vscale x 2 x i64 > %tr
122122}
123123

; Fixed-length 128bits

define <16 x i8> @smulh_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: smulh_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smull2 v2.8h, v0.16b, v1.16b
; CHECK-NEXT:    smull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    uzp2 v0.16b, v0.16b, v2.16b
; CHECK-NEXT:    ret
  %1 = sext <16 x i8> %a to <16 x i16>
  %2 = sext <16 x i8> %b to <16 x i16>
  %mul = mul <16 x i16> %1, %2
  %shr = lshr <16 x i16> %mul, splat (i16 8)
  %tr = trunc <16 x i16> %shr to <16 x i8>
  ret <16 x i8> %tr
}
141+
define <8 x i16> @smulh_v8i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: smulh_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smull2 v2.4s, v0.8h, v1.8h
; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
; CHECK-NEXT:    ret
  %1 = sext <8 x i16> %a to <8 x i32>
  %2 = sext <8 x i16> %b to <8 x i32>
  %mul = mul <8 x i32> %1, %2
  %shr = lshr <8 x i32> %mul, splat (i32 16)
  %tr = trunc <8 x i32> %shr to <8 x i16>
  ret <8 x i16> %tr
}
156+
define <4 x i32> @smulh_v4i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: smulh_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smull2 v2.2d, v0.4s, v1.4s
; CHECK-NEXT:    smull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    uzp2 v0.4s, v0.4s, v2.4s
; CHECK-NEXT:    ret
  %1 = sext <4 x i32> %a to <4 x i64>
  %2 = sext <4 x i32> %b to <4 x i64>
  %mul = mul <4 x i64> %1, %2
  %shr = lshr <4 x i64> %mul, splat (i64 32)
  %tr = trunc <4 x i64> %shr to <4 x i32>
  ret <4 x i32> %tr
}
171+
define <2 x i64> @smulh_v2i64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: smulh_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov x8, v0.d[1]
; CHECK-NEXT:    mov x9, v1.d[1]
; CHECK-NEXT:    fmov x10, d0
; CHECK-NEXT:    fmov x11, d1
; CHECK-NEXT:    smulh x10, x10, x11
; CHECK-NEXT:    smulh x8, x8, x9
; CHECK-NEXT:    fmov d0, x10
; CHECK-NEXT:    fmov d1, x8
; CHECK-NEXT:    mov v0.d[1], v1.d[0]
; CHECK-NEXT:    ret
  %1 = sext <2 x i64> %a to <2 x i128>
  %2 = sext <2 x i64> %b to <2 x i128>
  %mul = mul <2 x i128> %1, %2
  %shr = lshr <2 x i128> %mul, splat (i128 64)
  %tr = trunc <2 x i128> %shr to <2 x i64>
  ret <2 x i64> %tr
}
192+
define <16 x i8> @umulh_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: umulh_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umull2 v2.8h, v0.16b, v1.16b
; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    uzp2 v0.16b, v0.16b, v2.16b
; CHECK-NEXT:    ret
  %1 = zext <16 x i8> %a to <16 x i16>
  %2 = zext <16 x i8> %b to <16 x i16>
  %mul = mul <16 x i16> %1, %2
  %shr = lshr <16 x i16> %mul, splat (i16 8)
  %tr = trunc <16 x i16> %shr to <16 x i8>
  ret <16 x i8> %tr
}
207+
define <8 x i16> @umulh_v8i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: umulh_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umull2 v2.4s, v0.8h, v1.8h
; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
; CHECK-NEXT:    ret
  %1 = zext <8 x i16> %a to <8 x i32>
  %2 = zext <8 x i16> %b to <8 x i32>
  %mul = mul <8 x i32> %1, %2
  %shr = lshr <8 x i32> %mul, splat (i32 16)
  %tr = trunc <8 x i32> %shr to <8 x i16>
  ret <8 x i16> %tr
}
222+
define <4 x i32> @umulh_v4i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: umulh_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umull2 v2.2d, v0.4s, v1.4s
; CHECK-NEXT:    umull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    uzp2 v0.4s, v0.4s, v2.4s
; CHECK-NEXT:    ret
  %1 = zext <4 x i32> %a to <4 x i64>
  %2 = zext <4 x i32> %b to <4 x i64>
  %mul = mul <4 x i64> %1, %2
  %shr = lshr <4 x i64> %mul, splat (i64 32)
  %tr = trunc <4 x i64> %shr to <4 x i32>
  ret <4 x i32> %tr
}
237+
define <2 x i64> @umulh_v2i64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: umulh_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov x8, v0.d[1]
; CHECK-NEXT:    mov x9, v1.d[1]
; CHECK-NEXT:    fmov x10, d0
; CHECK-NEXT:    fmov x11, d1
; CHECK-NEXT:    umulh x10, x10, x11
; CHECK-NEXT:    umulh x8, x8, x9
; CHECK-NEXT:    fmov d0, x10
; CHECK-NEXT:    fmov d1, x8
; CHECK-NEXT:    mov v0.d[1], v1.d[0]
; CHECK-NEXT:    ret
  %1 = zext <2 x i64> %a to <2 x i128>
  %2 = zext <2 x i64> %b to <2 x i128>
  %mul = mul <2 x i128> %1, %2
  %shr = lshr <2 x i128> %mul, splat (i128 64)
  %tr = trunc <2 x i128> %shr to <2 x i64>
  ret <2 x i64> %tr
}


; Fixed-length 64bits

define <8 x i8> @smulh_v8i8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: smulh_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    shrn v0.8b, v0.8h, #8
; CHECK-NEXT:    ret
  %1 = sext <8 x i8> %a to <8 x i16>
  %2 = sext <8 x i8> %b to <8 x i16>
  %mul = mul <8 x i16> %1, %2
  %shr = lshr <8 x i16> %mul, splat (i16 8)
  %tr = trunc <8 x i16> %shr to <8 x i8>
  ret <8 x i8> %tr
}
276+
define <4 x i16> @smulh_v4i16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: smulh_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    shrn v0.4h, v0.4s, #16
; CHECK-NEXT:    ret
  %1 = sext <4 x i16> %a to <4 x i32>
  %2 = sext <4 x i16> %b to <4 x i32>
  %mul = mul <4 x i32> %1, %2
  %shr = lshr <4 x i32> %mul, splat (i32 16)
  %tr = trunc <4 x i32> %shr to <4 x i16>
  ret <4 x i16> %tr
}
290+
define <2 x i32> @smulh_v2i32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: smulh_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    shrn v0.2s, v0.2d, #32
; CHECK-NEXT:    ret
  %1 = sext <2 x i32> %a to <2 x i64>
  %2 = sext <2 x i32> %b to <2 x i64>
  %mul = mul <2 x i64> %1, %2
  %shr = lshr <2 x i64> %mul, splat (i64 32)
  %tr = trunc <2 x i64> %shr to <2 x i32>
  ret <2 x i32> %tr
}
304+
define <1 x i64> @smulh_v1i64(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: smulh_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    fmov x8, d0
; CHECK-NEXT:    fmov x9, d1
; CHECK-NEXT:    smulh x8, x8, x9
; CHECK-NEXT:    fmov d0, x8
; CHECK-NEXT:    ret
  %1 = sext <1 x i64> %a to <1 x i128>
  %2 = sext <1 x i64> %b to <1 x i128>
  %mul = mul <1 x i128> %1, %2
  %shr = lshr <1 x i128> %mul, splat (i128 64)
  %tr = trunc <1 x i128> %shr to <1 x i64>
  ret <1 x i64> %tr
}
322+
define <8 x i8> @umulh_v8i8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: umulh_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    shrn v0.8b, v0.8h, #8
; CHECK-NEXT:    ret
  %1 = zext <8 x i8> %a to <8 x i16>
  %2 = zext <8 x i8> %b to <8 x i16>
  %mul = mul <8 x i16> %1, %2
  %shr = lshr <8 x i16> %mul, splat (i16 8)
  %tr = trunc <8 x i16> %shr to <8 x i8>
  ret <8 x i8> %tr
}
336+
define <4 x i16> @umulh_v4i16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: umulh_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    shrn v0.4h, v0.4s, #16
; CHECK-NEXT:    ret
  %1 = zext <4 x i16> %a to <4 x i32>
  %2 = zext <4 x i16> %b to <4 x i32>
  %mul = mul <4 x i32> %1, %2
  %shr = lshr <4 x i32> %mul, splat (i32 16)
  %tr = trunc <4 x i32> %shr to <4 x i16>
  ret <4 x i16> %tr
}
350+
define <2 x i32> @umulh_v2i32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: umulh_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    shrn v0.2s, v0.2d, #32
; CHECK-NEXT:    ret
  %1 = zext <2 x i32> %a to <2 x i64>
  %2 = zext <2 x i32> %b to <2 x i64>
  %mul = mul <2 x i64> %1, %2
  %shr = lshr <2 x i64> %mul, splat (i64 32)
  %tr = trunc <2 x i64> %shr to <2 x i32>
  ret <2 x i32> %tr
}
364+
define <1 x i64> @umulh_v1i64(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: umulh_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    fmov x8, d0
; CHECK-NEXT:    fmov x9, d1
; CHECK-NEXT:    umulh x8, x8, x9
; CHECK-NEXT:    fmov d0, x8
; CHECK-NEXT:    ret
  %1 = zext <1 x i64> %a to <1 x i128>
  %2 = zext <1 x i64> %b to <1 x i128>
  %mul = mul <1 x i128> %1, %2
  %shr = lshr <1 x i128> %mul, splat (i128 64)
  %tr = trunc <1 x i128> %shr to <1 x i64>
  ret <1 x i64> %tr
}