1
+ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
1
2
; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
2
3
3
4
define <8 x i8 > @vtrni8 (ptr %A , ptr %B ) nounwind {
@@ -20,11 +21,11 @@ define <8 x i8> @vtrni8(ptr %A, ptr %B) nounwind {
20
21
define <16 x i8 > @vtrni8_Qres (ptr %A , ptr %B ) nounwind {
21
22
; CHECK-LABEL: vtrni8_Qres:
22
23
; CHECK: @ %bb.0:
23
- ; CHECK-NEXT: vldr [[LDR1:d[0-9]+]] , [r1]
24
- ; CHECK-NEXT: vldr [[LDR0:d[0-9]+]] , [r0]
25
- ; CHECK-NEXT: vtrn.8 [[LDR0]], [[LDR1]]
26
- ; CHECK-NEXT: vmov r0, r1, [[LDR0]]
27
- ; CHECK-NEXT: vmov r2, r3, [[LDR1]]
24
+ ; CHECK-NEXT: vldr d16 , [r1]
25
+ ; CHECK-NEXT: vldr d17 , [r0]
26
+ ; CHECK-NEXT: vtrn.8 d17, d16
27
+ ; CHECK-NEXT: vmov r0, r1, d17
28
+ ; CHECK-NEXT: vmov r2, r3, d16
28
29
; CHECK-NEXT: mov pc, lr
29
30
%tmp1 = load <8 x i8 >, ptr %A
30
31
%tmp2 = load <8 x i8 >, ptr %B
@@ -52,11 +53,11 @@ define <4 x i16> @vtrni16(ptr %A, ptr %B) nounwind {
52
53
define <8 x i16 > @vtrni16_Qres (ptr %A , ptr %B ) nounwind {
53
54
; CHECK-LABEL: vtrni16_Qres:
54
55
; CHECK: @ %bb.0:
55
- ; CHECK-NEXT: vldr [[LDR1:d[0-9]+]] , [r1]
56
- ; CHECK-NEXT: vldr [[LDR0:d[0-9]+]] , [r0]
57
- ; CHECK-NEXT: vtrn.16 [[LDR0]], [[LDR1]]
58
- ; CHECK-NEXT: vmov r0, r1, [[LDR0]]
59
- ; CHECK-NEXT: vmov r2, r3, [[LDR1]]
56
+ ; CHECK-NEXT: vldr d16 , [r1]
57
+ ; CHECK-NEXT: vldr d17 , [r0]
58
+ ; CHECK-NEXT: vtrn.16 d17, d16
59
+ ; CHECK-NEXT: vmov r0, r1, d17
60
+ ; CHECK-NEXT: vmov r2, r3, d16
60
61
; CHECK-NEXT: mov pc, lr
61
62
%tmp1 = load <4 x i16 >, ptr %A
62
63
%tmp2 = load <4 x i16 >, ptr %B
@@ -84,11 +85,11 @@ define <2 x i32> @vtrni32(ptr %A, ptr %B) nounwind {
84
85
define <4 x i32 > @vtrni32_Qres (ptr %A , ptr %B ) nounwind {
85
86
; CHECK-LABEL: vtrni32_Qres:
86
87
; CHECK: @ %bb.0:
87
- ; CHECK-NEXT: vldr [[LDR1:d[0-9]+]] , [r1]
88
- ; CHECK-NEXT: vldr [[LDR0:d[0-9]+]] , [r0]
89
- ; CHECK-NEXT: vtrn.32 [[LDR0]], [[LDR1]]
90
- ; CHECK-NEXT: vmov r0, r1, [[LDR0]]
91
- ; CHECK-NEXT: vmov r2, r3, [[LDR1]]
88
+ ; CHECK-NEXT: vldr d16 , [r1]
89
+ ; CHECK-NEXT: vldr d17 , [r0]
90
+ ; CHECK-NEXT: vtrn.32 d17, d16
91
+ ; CHECK-NEXT: vmov r0, r1, d17
92
+ ; CHECK-NEXT: vmov r2, r3, d16
92
93
; CHECK-NEXT: mov pc, lr
93
94
%tmp1 = load <2 x i32 >, ptr %A
94
95
%tmp2 = load <2 x i32 >, ptr %B
@@ -116,11 +117,11 @@ define <2 x float> @vtrnf(ptr %A, ptr %B) nounwind {
116
117
define <4 x float > @vtrnf_Qres (ptr %A , ptr %B ) nounwind {
117
118
; CHECK-LABEL: vtrnf_Qres:
118
119
; CHECK: @ %bb.0:
119
- ; CHECK-NEXT: vldr [[LDR1:d[0-9]+]] , [r1]
120
- ; CHECK-NEXT: vldr [[LDR0:d[0-9]+]] , [r0]
121
- ; CHECK-NEXT: vtrn.32 [[LDR0]], [[LDR1]]
122
- ; CHECK-NEXT: vmov r0, r1, [[LDR0]]
123
- ; CHECK-NEXT: vmov r2, r3, [[LDR1]]
120
+ ; CHECK-NEXT: vldr d16 , [r1]
121
+ ; CHECK-NEXT: vldr d17 , [r0]
122
+ ; CHECK-NEXT: vtrn.32 d17, d16
123
+ ; CHECK-NEXT: vmov r0, r1, d17
124
+ ; CHECK-NEXT: vmov r2, r3, d16
124
125
; CHECK-NEXT: mov pc, lr
125
126
%tmp1 = load <2 x float >, ptr %A
126
127
%tmp2 = load <2 x float >, ptr %B
@@ -281,11 +282,11 @@ define <8 x i8> @vtrni8_undef(ptr %A, ptr %B) nounwind {
281
282
define <16 x i8 > @vtrni8_undef_Qres (ptr %A , ptr %B ) nounwind {
282
283
; CHECK-LABEL: vtrni8_undef_Qres:
283
284
; CHECK: @ %bb.0:
284
- ; CHECK-NEXT: vldr [[LDR1:d[0-9]+]] , [r1]
285
- ; CHECK-NEXT: vldr [[LDR0:d[0-9]+]] , [r0]
286
- ; CHECK-NEXT: vtrn.8 [[LDR0]], [[LDR1]]
287
- ; CHECK-NEXT: vmov r0, r1, [[LDR0]]
288
- ; CHECK-NEXT: vmov r2, r3, [[LDR1]]
285
+ ; CHECK-NEXT: vldr d16 , [r1]
286
+ ; CHECK-NEXT: vldr d17 , [r0]
287
+ ; CHECK-NEXT: vtrn.8 d17, d16
288
+ ; CHECK-NEXT: vmov r0, r1, d17
289
+ ; CHECK-NEXT: vmov r2, r3, d16
289
290
; CHECK-NEXT: mov pc, lr
290
291
%tmp1 = load <8 x i8 >, ptr %A
291
292
%tmp2 = load <8 x i8 >, ptr %B
@@ -327,9 +328,15 @@ define <16 x i16> @vtrnQi16_undef_QQres(ptr %A, ptr %B) nounwind {
327
328
}
328
329
329
330
define <8 x i16 > @vtrn_lower_shufflemask_undef (ptr %A , ptr %B ) {
331
+ ; CHECK-LABEL: vtrn_lower_shufflemask_undef:
332
+ ; CHECK: @ %bb.0: @ %entry
333
+ ; CHECK-NEXT: vldr d16, [r1]
334
+ ; CHECK-NEXT: vldr d17, [r0]
335
+ ; CHECK-NEXT: vtrn.16 d17, d16
336
+ ; CHECK-NEXT: vmov r0, r1, d16
337
+ ; CHECK-NEXT: vmov r2, r3, d16
338
+ ; CHECK-NEXT: mov pc, lr
330
339
entry:
331
- ; CHECK-LABEL: vtrn_lower_shufflemask_undef
332
- ; CHECK: vtrn
333
340
%tmp1 = load <4 x i16 >, ptr %A
334
341
%tmp2 = load <4 x i16 >, ptr %B
335
342
%0 = shufflevector <4 x i16 > %tmp1 , <4 x i16 > %tmp2 , <8 x i32 > <i32 undef , i32 undef , i32 undef , i32 undef , i32 1 , i32 5 , i32 3 , i32 7 >
@@ -340,12 +347,26 @@ entry:
340
347
; values do modify the type. However, we get different input types, as some of
341
348
; them get truncated from i32 to i8 (from comparing cmp0 with cmp1) and some of
342
349
; them get truncated from i16 to i8 (from comparing cmp2 with cmp3).
343
- define <8 x i8 > @vtrn_mismatched_builvector0 (<8 x i8 > %tr0 , <8 x i8 > %tr1 ,
344
- <4 x i32 > %cmp0 , <4 x i32 > %cmp1 ,
345
- <4 x i16 > %cmp2 , <4 x i16 > %cmp3 ) {
346
- ; CHECK-LABEL: vtrn_mismatched_builvector0:
347
- ; CHECK: vmovn.i32
348
- ; CHECK: vbsl
350
+ define <8 x i8 > @vtrn_mismatched_builvector0 (<8 x i8 > %tr0 , <8 x i8 > %tr1 , <4 x i32 > %cmp0 , <4 x i32 > %cmp1 , <4 x i16 > %cmp2 , <4 x i16 > %cmp3 ) {
351
+ ; CHECK-LABEL: vtrn_mismatched_builvector0:
352
+ ; CHECK: @ %bb.0:
353
+ ; CHECK-NEXT: mov r12, sp
354
+ ; CHECK-NEXT: vld1.64 {d16, d17}, [r12]
355
+ ; CHECK-NEXT: add r12, sp, #16
356
+ ; CHECK-NEXT: vld1.64 {d18, d19}, [r12]
357
+ ; CHECK-NEXT: vcgt.u32 q8, q9, q8
358
+ ; CHECK-NEXT: vldr d20, [sp, #32]
359
+ ; CHECK-NEXT: vldr d18, [sp, #40]
360
+ ; CHECK-NEXT: vcgt.u16 d18, d18, d20
361
+ ; CHECK-NEXT: vmovn.i32 d16, q8
362
+ ; CHECK-NEXT: vmov d17, r2, r3
363
+ ; CHECK-NEXT: vtrn.8 d16, d18
364
+ ; CHECK-NEXT: vmov d18, r0, r1
365
+ ; CHECK-NEXT: vshl.i8 d16, d16, #7
366
+ ; CHECK-NEXT: vshr.s8 d16, d16, #7
367
+ ; CHECK-NEXT: vbsl d16, d18, d17
368
+ ; CHECK-NEXT: vmov r0, r1, d16
369
+ ; CHECK-NEXT: mov pc, lr
349
370
%c0 = icmp ult <4 x i32 > %cmp0 , %cmp1
350
371
%c1 = icmp ult <4 x i16 > %cmp2 , %cmp3
351
372
%c = shufflevector <4 x i1 > %c0 , <4 x i1 > %c1 , <8 x i32 > <i32 0 , i32 4 , i32 1 , i32 5 , i32 2 , i32 6 , i32 3 , i32 7 >
@@ -356,12 +377,30 @@ define <8 x i8> @vtrn_mismatched_builvector0(<8 x i8> %tr0, <8 x i8> %tr1,
356
377
; Here we get a build_vector node, where half the incoming extract_element
357
378
; values do not modify the type (the values from cmp2), but half of them do
358
379
; (from the icmp operation).
359
- define <8 x i8 > @vtrn_mismatched_builvector1 (<8 x i8 > %tr0 , <8 x i8 > %tr1 ,
360
- <4 x i32 > %cmp0 , <4 x i32 > %cmp1 , ptr %cmp2_ptr ) {
361
- ; CHECK-LABEL: vtrn_mismatched_builvector1:
362
- ; We need to extend the 4 x i8 to 4 x i16 in order to perform the vtrn
363
- ; CHECK: vmovl
364
- ; CHECK: vbsl
380
+ ; We need to extend the 4 x i8 to 4 x i16 in order to perform the vtrn
381
+ define <8 x i8 > @vtrn_mismatched_builvector1 (<8 x i8 > %tr0 , <8 x i8 > %tr1 , <4 x i32 > %cmp0 , <4 x i32 > %cmp1 , ptr %cmp2_ptr ) {
382
+ ; CHECK-LABEL: vtrn_mismatched_builvector1:
383
+ ; CHECK: @ %bb.0:
384
+ ; CHECK-NEXT: .save {r11, lr}
385
+ ; CHECK-NEXT: push {r11, lr}
386
+ ; CHECK-NEXT: add r12, sp, #8
387
+ ; CHECK-NEXT: add lr, sp, #24
388
+ ; CHECK-NEXT: vld1.64 {d16, d17}, [r12]
389
+ ; CHECK-NEXT: ldr r12, [sp, #40]
390
+ ; CHECK-NEXT: vld1.64 {d18, d19}, [lr]
391
+ ; CHECK-NEXT: vcgt.u32 q8, q9, q8
392
+ ; CHECK-NEXT: vld1.32 {d18[0]}, [r12:32]
393
+ ; CHECK-NEXT: vmovl.u8 q9, d18
394
+ ; CHECK-NEXT: vmovn.i32 d16, q8
395
+ ; CHECK-NEXT: vmov d17, r2, r3
396
+ ; CHECK-NEXT: vtrn.8 d16, d18
397
+ ; CHECK-NEXT: vmov d18, r0, r1
398
+ ; CHECK-NEXT: vshl.i8 d16, d16, #7
399
+ ; CHECK-NEXT: vshr.s8 d16, d16, #7
400
+ ; CHECK-NEXT: vbsl d16, d18, d17
401
+ ; CHECK-NEXT: vmov r0, r1, d16
402
+ ; CHECK-NEXT: pop {r11, lr}
403
+ ; CHECK-NEXT: mov pc, lr
365
404
%cmp2_load = load <4 x i8 >, ptr %cmp2_ptr , align 4
366
405
%cmp2 = trunc <4 x i8 > %cmp2_load to <4 x i1 >
367
406
%c0 = icmp ult <4 x i32 > %cmp0 , %cmp1
@@ -373,15 +412,15 @@ define <8 x i8> @vtrn_mismatched_builvector1(<8 x i8> %tr0, <8 x i8> %tr1,
373
412
; The shuffle mask is half a vtrn; we duplicate the half to produce the
374
413
; full result.
375
414
define void @lower_twice_no_vtrn (ptr %A , ptr %B , ptr %C ) {
415
+ ; CHECK-LABEL: lower_twice_no_vtrn:
416
+ ; CHECK: @ %bb.0: @ %entry
417
+ ; CHECK-NEXT: vldr d16, [r1]
418
+ ; CHECK-NEXT: vldr d18, [r0]
419
+ ; CHECK-NEXT: vtrn.16 d18, d16
420
+ ; CHECK-NEXT: vorr d17, d16, d16
421
+ ; CHECK-NEXT: vst1.64 {d16, d17}, [r2]
422
+ ; CHECK-NEXT: mov pc, lr
376
423
entry:
377
- ; CHECK-LABEL: lower_twice_no_vtrn:
378
- ; CHECK: @ %bb.0:
379
- ; CHECK-NEXT: vldr d16, [r1]
380
- ; CHECK-NEXT: vldr d18, [r0]
381
- ; CHECK-NEXT: vtrn.16 d18, d16
382
- ; CHECK-NEXT: vorr d17, d16, d16
383
- ; CHECK-NEXT: vst1.64 {d16, d17}, [r2]
384
- ; CHECK-NEXT: mov pc, lr
385
424
%tmp1 = load <4 x i16 >, ptr %A
386
425
%tmp2 = load <4 x i16 >, ptr %B
387
426
%0 = shufflevector <4 x i16 > %tmp1 , <4 x i16 > %tmp2 , <8 x i32 > <i32 undef , i32 5 , i32 3 , i32 7 , i32 1 , i32 5 , i32 3 , i32 7 >
@@ -392,18 +431,49 @@ entry:
392
431
; The shuffle mask is half a vtrn; we duplicate the half to produce the
393
432
; full result.
394
433
define void @upper_twice_no_vtrn (ptr %A , ptr %B , ptr %C ) {
434
+ ; CHECK-LABEL: upper_twice_no_vtrn:
435
+ ; CHECK: @ %bb.0: @ %entry
436
+ ; CHECK-NEXT: vldr d16, [r1]
437
+ ; CHECK-NEXT: vldr d18, [r0]
438
+ ; CHECK-NEXT: vtrn.16 d18, d16
439
+ ; CHECK-NEXT: vorr d19, d18, d18
440
+ ; CHECK-NEXT: vst1.64 {d18, d19}, [r2]
441
+ ; CHECK-NEXT: mov pc, lr
395
442
entry:
396
- ; CHECK-LABEL: upper_twice_no_vtrn:
397
- ; CHECK: @ %bb.0:
398
- ; CHECK-NEXT: vldr d16, [r1]
399
- ; CHECK-NEXT: vldr d18, [r0]
400
- ; CHECK-NEXT: vtrn.16 d18, d16
401
- ; CHECK-NEXT: vorr d19, d18, d18
402
- ; CHECK-NEXT: vst1.64 {d18, d19}, [r2]
403
- ; CHECK-NEXT: mov pc, lr
404
443
%tmp1 = load <4 x i16 >, ptr %A
405
444
%tmp2 = load <4 x i16 >, ptr %B
406
445
%0 = shufflevector <4 x i16 > %tmp1 , <4 x i16 > %tmp2 , <8 x i32 > <i32 0 , i32 undef , i32 2 , i32 6 , i32 0 , i32 4 , i32 2 , i32 6 >
407
446
store <8 x i16 > %0 , ptr %C
408
447
ret void
409
448
}
449
+
450
; Loads two <15 x i16> vectors from %next.gep at byte offsets 2 and 6, then
; builds a <16 x i16> result that alternates the even-numbered lanes of the
; first load with the even-numbered lanes of the second (b0, d0, b2, d2, ...,
; b14, d14) and stores it to %next.gep13. The assertions below expect this
; shuffle to be emitted with vtrn.16.
+ define void @test_15xi16 (ptr %next.gep , ptr %next.gep13 ) {
451
+ ; CHECK-LABEL: test_15xi16:
452
+ ; CHECK: @ %bb.0:
453
+ ; CHECK-NEXT: add r2, r0, #2
454
+ ; CHECK-NEXT: add r3, r0, #6
455
+ ; CHECK-NEXT: vld1.16 {d16, d17}, [r2]!
456
+ ; CHECK-NEXT: vld1.16 {d18}, [r2]!
457
+ ; CHECK-NEXT: vld1.16 {d20, d21}, [r3]!
458
+ ; CHECK-NEXT: ldr r2, [r2]
459
+ ; CHECK-NEXT: vld1.16 {d22}, [r3]!
460
+ ; CHECK-NEXT: vmov.16 d19[0], r2
461
+ ; CHECK-NEXT: ldr r3, [r3]
462
+ ; CHECK-NEXT: add r2, r0, #30
463
+ ; CHECK-NEXT: add r0, r0, #34
464
+ ; CHECK-NEXT: vmov.16 d19[1], r3
465
+ ; CHECK-NEXT: vld1.16 {d19[2]}, [r2:16]
466
+ ; CHECK-NEXT: vtrn.16 q8, q10
467
+ ; CHECK-NEXT: vld1.16 {d19[3]}, [r0:16]
468
+ ; CHECK-NEXT: vtrn.16 d18, d22
469
+ ; CHECK-NEXT: vst1.16 {d16, d17}, [r1]!
470
+ ; CHECK-NEXT: vst1.16 {d18, d19}, [r1]
471
+ ; CHECK-NEXT: mov pc, lr
472
+ %a = getelementptr inbounds nuw i8 , ptr %next.gep , i32 2
473
+ %b = load <15 x i16 >, ptr %a , align 2
474
+ %c = getelementptr inbounds nuw i8 , ptr %next.gep , i32 6
475
+ %d = load <15 x i16 >, ptr %c , align 2
476
+ %interleaved.vec = shufflevector <15 x i16 > %b , <15 x i16 > %d , <16 x i32 > <i32 0 , i32 15 , i32 2 , i32 17 , i32 4 , i32 19 , i32 6 , i32 21 , i32 8 , i32 23 , i32 10 , i32 25 , i32 12 , i32 27 , i32 14 , i32 29 >
477
+ store <16 x i16 > %interleaved.vec , ptr %next.gep13 , align 2
478
+ ret void
479
+ }
0 commit comments