1+ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
12; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
23
34define <8 x i8 > @vtrni8 (ptr %A , ptr %B ) nounwind {
@@ -20,11 +21,11 @@ define <8 x i8> @vtrni8(ptr %A, ptr %B) nounwind {
2021define <16 x i8 > @vtrni8_Qres (ptr %A , ptr %B ) nounwind {
2122; CHECK-LABEL: vtrni8_Qres:
2223; CHECK: @ %bb.0:
23- ; CHECK-NEXT: vldr [[LDR1:d[0-9]+]] , [r1]
24- ; CHECK-NEXT: vldr [[LDR0:d[0-9]+]] , [r0]
25- ; CHECK-NEXT: vtrn.8 [[LDR0]], [[LDR1]]
26- ; CHECK-NEXT: vmov r0, r1, [[LDR0]]
27- ; CHECK-NEXT: vmov r2, r3, [[LDR1]]
24+ ; CHECK-NEXT: vldr d16 , [r1]
25+ ; CHECK-NEXT: vldr d17 , [r0]
26+ ; CHECK-NEXT: vtrn.8 d17, d16
27+ ; CHECK-NEXT: vmov r0, r1, d17
28+ ; CHECK-NEXT: vmov r2, r3, d16
2829; CHECK-NEXT: mov pc, lr
2930 %tmp1 = load <8 x i8 >, ptr %A
3031 %tmp2 = load <8 x i8 >, ptr %B
@@ -52,11 +53,11 @@ define <4 x i16> @vtrni16(ptr %A, ptr %B) nounwind {
5253define <8 x i16 > @vtrni16_Qres (ptr %A , ptr %B ) nounwind {
5354; CHECK-LABEL: vtrni16_Qres:
5455; CHECK: @ %bb.0:
55- ; CHECK-NEXT: vldr [[LDR1:d[0-9]+]] , [r1]
56- ; CHECK-NEXT: vldr [[LDR0:d[0-9]+]] , [r0]
57- ; CHECK-NEXT: vtrn.16 [[LDR0]], [[LDR1]]
58- ; CHECK-NEXT: vmov r0, r1, [[LDR0]]
59- ; CHECK-NEXT: vmov r2, r3, [[LDR1]]
56+ ; CHECK-NEXT: vldr d16 , [r1]
57+ ; CHECK-NEXT: vldr d17 , [r0]
58+ ; CHECK-NEXT: vtrn.16 d17, d16
59+ ; CHECK-NEXT: vmov r0, r1, d17
60+ ; CHECK-NEXT: vmov r2, r3, d16
6061; CHECK-NEXT: mov pc, lr
6162 %tmp1 = load <4 x i16 >, ptr %A
6263 %tmp2 = load <4 x i16 >, ptr %B
@@ -84,11 +85,11 @@ define <2 x i32> @vtrni32(ptr %A, ptr %B) nounwind {
8485define <4 x i32 > @vtrni32_Qres (ptr %A , ptr %B ) nounwind {
8586; CHECK-LABEL: vtrni32_Qres:
8687; CHECK: @ %bb.0:
87- ; CHECK-NEXT: vldr [[LDR1:d[0-9]+]] , [r1]
88- ; CHECK-NEXT: vldr [[LDR0:d[0-9]+]] , [r0]
89- ; CHECK-NEXT: vtrn.32 [[LDR0]], [[LDR1]]
90- ; CHECK-NEXT: vmov r0, r1, [[LDR0]]
91- ; CHECK-NEXT: vmov r2, r3, [[LDR1]]
88+ ; CHECK-NEXT: vldr d16 , [r1]
89+ ; CHECK-NEXT: vldr d17 , [r0]
90+ ; CHECK-NEXT: vtrn.32 d17, d16
91+ ; CHECK-NEXT: vmov r0, r1, d17
92+ ; CHECK-NEXT: vmov r2, r3, d16
9293; CHECK-NEXT: mov pc, lr
9394 %tmp1 = load <2 x i32 >, ptr %A
9495 %tmp2 = load <2 x i32 >, ptr %B
@@ -116,11 +117,11 @@ define <2 x float> @vtrnf(ptr %A, ptr %B) nounwind {
116117define <4 x float > @vtrnf_Qres (ptr %A , ptr %B ) nounwind {
117118; CHECK-LABEL: vtrnf_Qres:
118119; CHECK: @ %bb.0:
119- ; CHECK-NEXT: vldr [[LDR1:d[0-9]+]] , [r1]
120- ; CHECK-NEXT: vldr [[LDR0:d[0-9]+]] , [r0]
121- ; CHECK-NEXT: vtrn.32 [[LDR0]], [[LDR1]]
122- ; CHECK-NEXT: vmov r0, r1, [[LDR0]]
123- ; CHECK-NEXT: vmov r2, r3, [[LDR1]]
120+ ; CHECK-NEXT: vldr d16 , [r1]
121+ ; CHECK-NEXT: vldr d17 , [r0]
122+ ; CHECK-NEXT: vtrn.32 d17, d16
123+ ; CHECK-NEXT: vmov r0, r1, d17
124+ ; CHECK-NEXT: vmov r2, r3, d16
124125; CHECK-NEXT: mov pc, lr
125126 %tmp1 = load <2 x float >, ptr %A
126127 %tmp2 = load <2 x float >, ptr %B
@@ -281,11 +282,11 @@ define <8 x i8> @vtrni8_undef(ptr %A, ptr %B) nounwind {
281282define <16 x i8 > @vtrni8_undef_Qres (ptr %A , ptr %B ) nounwind {
282283; CHECK-LABEL: vtrni8_undef_Qres:
283284; CHECK: @ %bb.0:
284- ; CHECK-NEXT: vldr [[LDR1:d[0-9]+]] , [r1]
285- ; CHECK-NEXT: vldr [[LDR0:d[0-9]+]] , [r0]
286- ; CHECK-NEXT: vtrn.8 [[LDR0]], [[LDR1]]
287- ; CHECK-NEXT: vmov r0, r1, [[LDR0]]
288- ; CHECK-NEXT: vmov r2, r3, [[LDR1]]
285+ ; CHECK-NEXT: vldr d16 , [r1]
286+ ; CHECK-NEXT: vldr d17 , [r0]
287+ ; CHECK-NEXT: vtrn.8 d17, d16
288+ ; CHECK-NEXT: vmov r0, r1, d17
289+ ; CHECK-NEXT: vmov r2, r3, d16
289290; CHECK-NEXT: mov pc, lr
290291 %tmp1 = load <8 x i8 >, ptr %A
291292 %tmp2 = load <8 x i8 >, ptr %B
@@ -327,9 +328,15 @@ define <16 x i16> @vtrnQi16_undef_QQres(ptr %A, ptr %B) nounwind {
327328}
328329
329330define <8 x i16 > @vtrn_lower_shufflemask_undef (ptr %A , ptr %B ) {
331+ ; CHECK-LABEL: vtrn_lower_shufflemask_undef:
332+ ; CHECK: @ %bb.0: @ %entry
333+ ; CHECK-NEXT: vldr d16, [r1]
334+ ; CHECK-NEXT: vldr d17, [r0]
335+ ; CHECK-NEXT: vtrn.16 d17, d16
336+ ; CHECK-NEXT: vmov r0, r1, d16
337+ ; CHECK-NEXT: vmov r2, r3, d16
338+ ; CHECK-NEXT: mov pc, lr
330339entry:
331- ; CHECK-LABEL: vtrn_lower_shufflemask_undef
332- ; CHECK: vtrn
333340 %tmp1 = load <4 x i16 >, ptr %A
334341 %tmp2 = load <4 x i16 >, ptr %B
335342 %0 = shufflevector <4 x i16 > %tmp1 , <4 x i16 > %tmp2 , <8 x i32 > <i32 undef , i32 undef , i32 undef , i32 undef , i32 1 , i32 5 , i32 3 , i32 7 >
@@ -340,12 +347,26 @@ entry:
340347; values do modify the type. However, we get different input types, as some of
341348; them get truncated from i32 to i8 (from comparing cmp0 with cmp1) and some of
342349; them get truncated from i16 to i8 (from comparing cmp2 with cmp3).
343- define <8 x i8 > @vtrn_mismatched_builvector0 (<8 x i8 > %tr0 , <8 x i8 > %tr1 ,
344- <4 x i32 > %cmp0 , <4 x i32 > %cmp1 ,
345- <4 x i16 > %cmp2 , <4 x i16 > %cmp3 ) {
346- ; CHECK-LABEL: vtrn_mismatched_builvector0:
347- ; CHECK: vmovn.i32
348- ; CHECK: vbsl
350+ define <8 x i8 > @vtrn_mismatched_builvector0 (<8 x i8 > %tr0 , <8 x i8 > %tr1 , <4 x i32 > %cmp0 , <4 x i32 > %cmp1 , <4 x i16 > %cmp2 , <4 x i16 > %cmp3 ) {
351+ ; CHECK-LABEL: vtrn_mismatched_builvector0:
352+ ; CHECK: @ %bb.0:
353+ ; CHECK-NEXT: mov r12, sp
354+ ; CHECK-NEXT: vld1.64 {d16, d17}, [r12]
355+ ; CHECK-NEXT: add r12, sp, #16
356+ ; CHECK-NEXT: vld1.64 {d18, d19}, [r12]
357+ ; CHECK-NEXT: vcgt.u32 q8, q9, q8
358+ ; CHECK-NEXT: vldr d20, [sp, #32]
359+ ; CHECK-NEXT: vldr d18, [sp, #40]
360+ ; CHECK-NEXT: vcgt.u16 d18, d18, d20
361+ ; CHECK-NEXT: vmovn.i32 d16, q8
362+ ; CHECK-NEXT: vmov d17, r2, r3
363+ ; CHECK-NEXT: vtrn.8 d16, d18
364+ ; CHECK-NEXT: vmov d18, r0, r1
365+ ; CHECK-NEXT: vshl.i8 d16, d16, #7
366+ ; CHECK-NEXT: vshr.s8 d16, d16, #7
367+ ; CHECK-NEXT: vbsl d16, d18, d17
368+ ; CHECK-NEXT: vmov r0, r1, d16
369+ ; CHECK-NEXT: mov pc, lr
349370 %c0 = icmp ult <4 x i32 > %cmp0 , %cmp1
350371 %c1 = icmp ult <4 x i16 > %cmp2 , %cmp3
351372 %c = shufflevector <4 x i1 > %c0 , <4 x i1 > %c1 , <8 x i32 > <i32 0 , i32 4 , i32 1 , i32 5 , i32 2 , i32 6 , i32 3 , i32 7 >
@@ -356,12 +377,30 @@ define <8 x i8> @vtrn_mismatched_builvector0(<8 x i8> %tr0, <8 x i8> %tr1,
356377; Here we get a build_vector node, where half the incoming extract_element
357378; values do not modify the type (the values from cmp2), but half of them do
358379; (from the icmp operation).
359- define <8 x i8 > @vtrn_mismatched_builvector1 (<8 x i8 > %tr0 , <8 x i8 > %tr1 ,
360- <4 x i32 > %cmp0 , <4 x i32 > %cmp1 , ptr %cmp2_ptr ) {
361- ; CHECK-LABEL: vtrn_mismatched_builvector1:
362- ; We need to extend the 4 x i8 to 4 x i16 in order to perform the vtrn
363- ; CHECK: vmovl
364- ; CHECK: vbsl
380+ ; We need to extend the 4 x i8 to 4 x i16 in order to perform the vtrn
381+ define <8 x i8 > @vtrn_mismatched_builvector1 (<8 x i8 > %tr0 , <8 x i8 > %tr1 , <4 x i32 > %cmp0 , <4 x i32 > %cmp1 , ptr %cmp2_ptr ) {
382+ ; CHECK-LABEL: vtrn_mismatched_builvector1:
383+ ; CHECK: @ %bb.0:
384+ ; CHECK-NEXT: .save {r11, lr}
385+ ; CHECK-NEXT: push {r11, lr}
386+ ; CHECK-NEXT: add r12, sp, #8
387+ ; CHECK-NEXT: add lr, sp, #24
388+ ; CHECK-NEXT: vld1.64 {d16, d17}, [r12]
389+ ; CHECK-NEXT: ldr r12, [sp, #40]
390+ ; CHECK-NEXT: vld1.64 {d18, d19}, [lr]
391+ ; CHECK-NEXT: vcgt.u32 q8, q9, q8
392+ ; CHECK-NEXT: vld1.32 {d18[0]}, [r12:32]
393+ ; CHECK-NEXT: vmovl.u8 q9, d18
394+ ; CHECK-NEXT: vmovn.i32 d16, q8
395+ ; CHECK-NEXT: vmov d17, r2, r3
396+ ; CHECK-NEXT: vtrn.8 d16, d18
397+ ; CHECK-NEXT: vmov d18, r0, r1
398+ ; CHECK-NEXT: vshl.i8 d16, d16, #7
399+ ; CHECK-NEXT: vshr.s8 d16, d16, #7
400+ ; CHECK-NEXT: vbsl d16, d18, d17
401+ ; CHECK-NEXT: vmov r0, r1, d16
402+ ; CHECK-NEXT: pop {r11, lr}
403+ ; CHECK-NEXT: mov pc, lr
365404 %cmp2_load = load <4 x i8 >, ptr %cmp2_ptr , align 4
366405 %cmp2 = trunc <4 x i8 > %cmp2_load to <4 x i1 >
367406 %c0 = icmp ult <4 x i32 > %cmp0 , %cmp1
@@ -373,15 +412,15 @@ define <8 x i8> @vtrn_mismatched_builvector1(<8 x i8> %tr0, <8 x i8> %tr1,
373412; The shuffle mask is half a vtrn; we duplicate the half to produce the
374413; full result.
375414define void @lower_twice_no_vtrn (ptr %A , ptr %B , ptr %C ) {
415+ ; CHECK-LABEL: lower_twice_no_vtrn:
416+ ; CHECK: @ %bb.0: @ %entry
417+ ; CHECK-NEXT: vldr d16, [r1]
418+ ; CHECK-NEXT: vldr d18, [r0]
419+ ; CHECK-NEXT: vtrn.16 d18, d16
420+ ; CHECK-NEXT: vorr d17, d16, d16
421+ ; CHECK-NEXT: vst1.64 {d16, d17}, [r2]
422+ ; CHECK-NEXT: mov pc, lr
376423entry:
377- ; CHECK-LABEL: lower_twice_no_vtrn:
378- ; CHECK: @ %bb.0:
379- ; CHECK-NEXT: vldr d16, [r1]
380- ; CHECK-NEXT: vldr d18, [r0]
381- ; CHECK-NEXT: vtrn.16 d18, d16
382- ; CHECK-NEXT: vorr d17, d16, d16
383- ; CHECK-NEXT: vst1.64 {d16, d17}, [r2]
384- ; CHECK-NEXT: mov pc, lr
385424 %tmp1 = load <4 x i16 >, ptr %A
386425 %tmp2 = load <4 x i16 >, ptr %B
387426 %0 = shufflevector <4 x i16 > %tmp1 , <4 x i16 > %tmp2 , <8 x i32 > <i32 undef , i32 5 , i32 3 , i32 7 , i32 1 , i32 5 , i32 3 , i32 7 >
@@ -392,18 +431,49 @@ entry:
392431; The shuffle mask is half a vtrn; we duplicate the half to produce the
393432; full result.
394433define void @upper_twice_no_vtrn (ptr %A , ptr %B , ptr %C ) {
434+ ; CHECK-LABEL: upper_twice_no_vtrn:
435+ ; CHECK: @ %bb.0: @ %entry
436+ ; CHECK-NEXT: vldr d16, [r1]
437+ ; CHECK-NEXT: vldr d18, [r0]
438+ ; CHECK-NEXT: vtrn.16 d18, d16
439+ ; CHECK-NEXT: vorr d19, d18, d18
440+ ; CHECK-NEXT: vst1.64 {d18, d19}, [r2]
441+ ; CHECK-NEXT: mov pc, lr
395442entry:
396- ; CHECK-LABEL: upper_twice_no_vtrn:
397- ; CHECK: @ %bb.0:
398- ; CHECK-NEXT: vldr d16, [r1]
399- ; CHECK-NEXT: vldr d18, [r0]
400- ; CHECK-NEXT: vtrn.16 d18, d16
401- ; CHECK-NEXT: vorr d19, d18, d18
402- ; CHECK-NEXT: vst1.64 {d18, d19}, [r2]
403- ; CHECK-NEXT: mov pc, lr
404443 %tmp1 = load <4 x i16 >, ptr %A
405444 %tmp2 = load <4 x i16 >, ptr %B
406445 %0 = shufflevector <4 x i16 > %tmp1 , <4 x i16 > %tmp2 , <8 x i32 > <i32 0 , i32 undef , i32 2 , i32 6 , i32 0 , i32 4 , i32 2 , i32 6 >
407446 store <8 x i16 > %0 , ptr %C
408447 ret void
409448}
449+
; Loads two <15 x i16> vectors from %next.gep (byte offsets 2 and 6),
; interleaves their even lanes into a <16 x i16> value and stores it to
; %next.gep13. The expected code performs the shuffle with vtrn.16.
; NOTE(review): presumably a regression test for vtrn mask matching when the
; shuffle inputs have a non-power-of-2 element count - confirm against the
; originating commit.
450+ define void @test_15xi16 (ptr %next.gep , ptr %next.gep13 ) {
451+ ; CHECK-LABEL: test_15xi16:
452+ ; CHECK: @ %bb.0:
453+ ; CHECK-NEXT: add r2, r0, #2
454+ ; CHECK-NEXT: add r3, r0, #6
455+ ; CHECK-NEXT: vld1.16 {d16, d17}, [r2]!
456+ ; CHECK-NEXT: vld1.16 {d18}, [r2]!
457+ ; CHECK-NEXT: vld1.16 {d20, d21}, [r3]!
458+ ; CHECK-NEXT: ldr r2, [r2]
459+ ; CHECK-NEXT: vld1.16 {d22}, [r3]!
460+ ; CHECK-NEXT: vmov.16 d19[0], r2
461+ ; CHECK-NEXT: ldr r3, [r3]
462+ ; CHECK-NEXT: add r2, r0, #30
463+ ; CHECK-NEXT: add r0, r0, #34
464+ ; CHECK-NEXT: vmov.16 d19[1], r3
465+ ; CHECK-NEXT: vld1.16 {d19[2]}, [r2:16]
466+ ; CHECK-NEXT: vtrn.16 q8, q10
467+ ; CHECK-NEXT: vld1.16 {d19[3]}, [r0:16]
468+ ; CHECK-NEXT: vtrn.16 d18, d22
469+ ; CHECK-NEXT: vst1.16 {d16, d17}, [r1]!
470+ ; CHECK-NEXT: vst1.16 {d18, d19}, [r1]
471+ ; CHECK-NEXT: mov pc, lr
472+ %a = getelementptr inbounds nuw i8 , ptr %next.gep , i32 2
473+ %b = load <15 x i16 >, ptr %a , align 2
474+ %c = getelementptr inbounds nuw i8 , ptr %next.gep , i32 6
475+ %d = load <15 x i16 >, ptr %c , align 2
; Mask indices 15 and above select from %d (index 15 is %d lane 0), so the
; result is %b[0], %d[0], %b[2], %d[2], ..., %b[14], %d[14].
476+ %interleaved.vec = shufflevector <15 x i16 > %b , <15 x i16 > %d , <16 x i32 > <i32 0 , i32 15 , i32 2 , i32 17 , i32 4 , i32 19 , i32 6 , i32 21 , i32 8 , i32 23 , i32 10 , i32 25 , i32 12 , i32 27 , i32 14 , i32 29 >
477+ store <16 x i16 > %interleaved.vec , ptr %next.gep13 , align 2
478+ ret void
479+ }
0 commit comments