Skip to content

Commit 056b694

Browse files
committed
Change register allocation order for ARM VFP and NEON registers to put the callee-saved registers at the end of the lists.
Also prefer to avoid using the low registers that are in register subclasses required by certain instructions, so that those registers will more likely be available when needed. This change makes a huge improvement in spilling in some cases. Thanks to Jakob for helping me realize the problem. Most of this patch is fixing the testsuite. There are quite a few places where we're checking for specific registers. I changed those to wildcards in places where that doesn't weaken the tests. The spill-q.ll and thumb2-spill-q.ll tests stopped spilling with this change, so I added a bunch of live values to force spills on those tests. llvm-svn: 116055
1 parent 3e210eb commit 056b694

20 files changed

+235
-111
lines changed

llvm/lib/Target/ARM/ARMRegisterInfo.td

Lines changed: 72 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -387,16 +387,18 @@ def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
387387
ARM::D4, ARM::D5, ARM::D6, ARM::D7,
388388
ARM::D8, ARM::D9, ARM::D10, ARM::D11,
389389
ARM::D12, ARM::D13, ARM::D14, ARM::D15 };
390-
// VFP3
390+
// VFP3: D8-D15 are callee saved and should be allocated last.
391+
// Save other low registers for use as DPR_VFP2 and DPR_8 classes.
391392
static const unsigned ARM_DPR_VFP3[] = {
392-
ARM::D0, ARM::D1, ARM::D2, ARM::D3,
393-
ARM::D4, ARM::D5, ARM::D6, ARM::D7,
394-
ARM::D8, ARM::D9, ARM::D10, ARM::D11,
395-
ARM::D12, ARM::D13, ARM::D14, ARM::D15,
396393
ARM::D16, ARM::D17, ARM::D18, ARM::D19,
397394
ARM::D20, ARM::D21, ARM::D22, ARM::D23,
398395
ARM::D24, ARM::D25, ARM::D26, ARM::D27,
399-
ARM::D28, ARM::D29, ARM::D30, ARM::D31 };
396+
ARM::D28, ARM::D29, ARM::D30, ARM::D31,
397+
ARM::D0, ARM::D1, ARM::D2, ARM::D3,
398+
ARM::D4, ARM::D5, ARM::D6, ARM::D7,
399+
ARM::D8, ARM::D9, ARM::D10, ARM::D11,
400+
ARM::D12, ARM::D13, ARM::D14, ARM::D15 };
401+
400402
DPRClass::iterator
401403
DPRClass::allocation_order_begin(const MachineFunction &MF) const {
402404
const TargetMachine &TM = MF.getTarget();
@@ -438,6 +440,29 @@ def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128,
438440
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7,
439441
Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15]> {
440442
let SubRegClasses = [(DPR dsub_0, dsub_1)];
443+
let MethodProtos = [{
444+
iterator allocation_order_begin(const MachineFunction &MF) const;
445+
iterator allocation_order_end(const MachineFunction &MF) const;
446+
}];
447+
let MethodBodies = [{
448+
// Q4-Q7 are callee saved and should be allocated last.
449+
// Save other low registers for use as QPR_VFP2 and QPR_8 classes.
450+
static const unsigned ARM_QPR[] = {
451+
ARM::Q8, ARM::Q9, ARM::Q10, ARM::Q11,
452+
ARM::Q12, ARM::Q13, ARM::Q14, ARM::Q15,
453+
ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3,
454+
ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7 };
455+
456+
QPRClass::iterator
457+
QPRClass::allocation_order_begin(const MachineFunction &MF) const {
458+
return ARM_QPR;
459+
}
460+
461+
QPRClass::iterator
462+
QPRClass::allocation_order_end(const MachineFunction &MF) const {
463+
return ARM_QPR + (sizeof(ARM_QPR)/sizeof(unsigned));
464+
}
465+
}];
441466
}
442467

443468
// Subset of QPR that have 32-bit SPR subregs.
@@ -463,6 +488,27 @@ def QQPR : RegisterClass<"ARM", [v4i64],
463488
[QQ0, QQ1, QQ2, QQ3, QQ4, QQ5, QQ6, QQ7]> {
464489
let SubRegClasses = [(DPR dsub_0, dsub_1, dsub_2, dsub_3),
465490
(QPR qsub_0, qsub_1)];
491+
let MethodProtos = [{
492+
iterator allocation_order_begin(const MachineFunction &MF) const;
493+
iterator allocation_order_end(const MachineFunction &MF) const;
494+
}];
495+
let MethodBodies = [{
496+
// QQ2-QQ3 are callee saved and should be allocated last.
497+
// Save other low registers for use as QPR_VFP2 and QPR_8 classes.
498+
static const unsigned ARM_QQPR[] = {
499+
ARM::QQ4, ARM::QQ5, ARM::QQ6, ARM::QQ7,
500+
ARM::QQ0, ARM::QQ1, ARM::QQ2, ARM::QQ3 };
501+
502+
QQPRClass::iterator
503+
QQPRClass::allocation_order_begin(const MachineFunction &MF) const {
504+
return ARM_QQPR;
505+
}
506+
507+
QQPRClass::iterator
508+
QQPRClass::allocation_order_end(const MachineFunction &MF) const {
509+
return ARM_QQPR + (sizeof(ARM_QQPR)/sizeof(unsigned));
510+
}
511+
}];
466512
}
467513

468514
// Subset of QQPR that have 32-bit SPR subregs.
@@ -483,6 +529,26 @@ def QQQQPR : RegisterClass<"ARM", [v8i64],
483529
let SubRegClasses = [(DPR dsub_0, dsub_1, dsub_2, dsub_3,
484530
dsub_4, dsub_5, dsub_6, dsub_7),
485531
(QPR qsub_0, qsub_1, qsub_2, qsub_3)];
532+
let MethodProtos = [{
533+
iterator allocation_order_begin(const MachineFunction &MF) const;
534+
iterator allocation_order_end(const MachineFunction &MF) const;
535+
}];
536+
let MethodBodies = [{
537+
// QQQQ1 is callee saved and should be allocated last.
538+
// Save QQQQ0 for use as QPR_VFP2 and QPR_8 classes.
539+
static const unsigned ARM_QQQQPR[] = {
540+
ARM::QQQQ2, ARM::QQQQ3, ARM::QQQQ0, ARM::QQQQ1 };
541+
542+
QQQQPRClass::iterator
543+
QQQQPRClass::allocation_order_begin(const MachineFunction &MF) const {
544+
return ARM_QQQQPR;
545+
}
546+
547+
QQQQPRClass::iterator
548+
QQQQPRClass::allocation_order_end(const MachineFunction &MF) const {
549+
return ARM_QQQQPR + (sizeof(ARM_QQQQPR)/sizeof(unsigned));
550+
}
551+
}];
486552
}
487553

488554
// Condition code registers.

llvm/test/CodeGen/ARM/2010-06-29-PartialRedefFastAlloc.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@ target triple = "thumbv7-apple-darwin10"
1010
; %reg1028 gets allocated %Q0, and if %reg1030 is reloaded for the partial
1111
; redef, it cannot also get %Q0.
1212

13-
; CHECK: vld1.64 {d0, d1}, [r{{.}}]
14-
; CHECK-NOT: vld1.64 {d0, d1}
15-
; CHECK: vmov.f64 d3, d0
13+
; CHECK: vld1.64 {d16, d17}, [r{{.}}]
14+
; CHECK-NOT: vld1.64 {d16, d17}
15+
; CHECK: vmov.f64 d19, d16
1616

1717
define i32 @test(i8* %arg) nounwind {
1818
entry:

llvm/test/CodeGen/ARM/fpconsts.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,15 @@ entry:
1111
define double @t2(double %x) nounwind readnone optsize {
1212
entry:
1313
; CHECK: t2:
14-
; CHECK: vmov.f64 d1, #3.000000e+00
14+
; CHECK: vmov.f64 d{{.*}}, #3.000000e+00
1515
%0 = fadd double %x, 3.000000e+00
1616
ret double %0
1717
}
1818

1919
define double @t3(double %x) nounwind readnone optsize {
2020
entry:
2121
; CHECK: t3:
22-
; CHECK: vmov.f64 d1, #-1.300000e+01
22+
; CHECK: vmov.f64 d{{.*}}, #-1.300000e+01
2323
%0 = fmul double %x, -1.300000e+01
2424
ret double %0
2525
}

llvm/test/CodeGen/ARM/inlineasm3.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ define void @t() nounwind {
77
entry:
88
; CHECK: vmov.I64 q15, #0
99
; CHECK: vmov.32 d30[0], r0
10-
; CHECK: vmov q0, q15
10+
; CHECK: vmov q8, q15
1111
%tmp = alloca %struct.int32x4_t, align 16
1212
call void asm sideeffect "vmov.I64 q15, #0\0Avmov.32 d30[0], $1\0Avmov ${0:q}, q15\0A", "=*w,r,~{d31},~{d30}"(%struct.int32x4_t* %tmp, i32 8192) nounwind
1313
ret void
@@ -18,7 +18,7 @@ entry:
1818

1919
define void @t2() nounwind {
2020
entry:
21-
; CHECK: vmov d30, d0
21+
; CHECK: vmov d30, d16
2222
; CHECK: vmov.32 r0, d30[0]
2323
%asmtmp2 = tail call i32 asm sideeffect "vmov d30, $1\0Avmov.32 $0, d30[0]\0A", "=r,w,~{d30}"(<2 x i32> undef) nounwind
2424
ret void

llvm/test/CodeGen/ARM/reg_sequence.ll

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -122,9 +122,9 @@ return1:
122122
return2:
123123
; CHECK: %return2
124124
; CHECK: vadd.i32
125-
; CHECK: vmov q1, q3
125+
; CHECK: vmov q9, q11
126126
; CHECK-NOT: vmov
127-
; CHECK: vst2.32 {d0, d1, d2, d3}
127+
; CHECK: vst2.32 {d16, d17, d18, d19}
128128
%tmp100 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0 ; <<4 x i32>> [#uses=1]
129129
%tmp101 = extractvalue %struct.__neon_int32x4x2_t %tmp5, 1 ; <<4 x i32>> [#uses=1]
130130
%tmp102 = add <4 x i32> %tmp100, %tmp101 ; <<4 x i32>> [#uses=1]
@@ -136,9 +136,9 @@ return2:
136136
define <8 x i16> @t5(i16* %A, <8 x i16>* %B) nounwind {
137137
; CHECK: t5:
138138
; CHECK: vldmia
139-
; CHECK: vmov q1, q0
139+
; CHECK: vmov q9, q8
140140
; CHECK-NOT: vmov
141-
; CHECK: vld2.16 {d0[1], d2[1]}, [r0]
141+
; CHECK: vld2.16 {d16[1], d18[1]}, [r0]
142142
; CHECK-NOT: vmov
143143
; CHECK: vadd.i16
144144
%tmp0 = bitcast i16* %A to i8* ; <i8*> [#uses=1]
@@ -153,8 +153,8 @@ define <8 x i16> @t5(i16* %A, <8 x i16>* %B) nounwind {
153153
define <8 x i8> @t6(i8* %A, <8 x i8>* %B) nounwind {
154154
; CHECK: t6:
155155
; CHECK: vldr.64
156-
; CHECK: vmov d1, d0
157-
; CHECK-NEXT: vld2.8 {d0[1], d1[1]}
156+
; CHECK: vmov d17, d16
157+
; CHECK-NEXT: vld2.8 {d16[1], d17[1]}
158158
%tmp1 = load <8 x i8>* %B ; <<8 x i8>> [#uses=2]
159159
%tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1) ; <%struct.__neon_int8x8x2_t> [#uses=2]
160160
%tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0 ; <<8 x i8>> [#uses=1]
@@ -168,10 +168,10 @@ entry:
168168
; CHECK: t7:
169169
; CHECK: vld2.32
170170
; CHECK: vst2.32
171-
; CHECK: vld1.32 {d0, d1},
172-
; CHECK: vmov q1, q0
171+
; CHECK: vld1.32 {d16, d17},
172+
; CHECK: vmov q9, q8
173173
; CHECK-NOT: vmov
174-
; CHECK: vuzp.32 q0, q1
174+
; CHECK: vuzp.32 q8, q9
175175
; CHECK: vst1.32
176176
%0 = bitcast i32* %iptr to i8* ; <i8*> [#uses=2]
177177
%1 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %0, i32 1) ; <%struct.__neon_int32x4x2_t> [#uses=2]
@@ -188,7 +188,7 @@ entry:
188188
; PR7156
189189
define arm_aapcs_vfpcc i32 @t8() nounwind {
190190
; CHECK: t8:
191-
; CHECK: vrsqrte.f32 q0, q0
191+
; CHECK: vrsqrte.f32 q8, q8
192192
bb.nph55.bb.nph55.split_crit_edge:
193193
br label %bb3
194194

@@ -238,10 +238,10 @@ bb14: ; preds = %bb6
238238
define arm_aapcs_vfpcc float @t9(%0* nocapture, %3* nocapture) nounwind {
239239
; CHECK: t9:
240240
; CHECK: vldr.64
241-
; CHECK-NOT: vmov d{{.*}}, d0
242-
; CHECK: vmov.i32 d1
243-
; CHECK-NEXT: vstmia r0, {d0, d1}
244-
; CHECK-NEXT: vstmia r0, {d0, d1}
241+
; CHECK-NOT: vmov d{{.*}}, d16
242+
; CHECK: vmov.i32 d17
243+
; CHECK-NEXT: vstmia r0, {d16, d17}
244+
; CHECK-NEXT: vstmia r0, {d16, d17}
245245
%3 = bitcast double 0.000000e+00 to <2 x float> ; <<2 x float>> [#uses=2]
246246
%4 = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; <<4 x float>> [#uses=1]
247247
store <4 x float> %4, <4 x float>* undef, align 16
@@ -269,9 +269,9 @@ define arm_aapcs_vfpcc float @t9(%0* nocapture, %3* nocapture) nounwind {
269269
define arm_aapcs_vfpcc i32 @t10() nounwind {
270270
entry:
271271
; CHECK: t10:
272-
; CHECK: vmov.i32 q1, #0x3F000000
273-
; CHECK: vmov d0, d1
274-
; CHECK: vmla.f32 q0, q0, d0[0]
272+
; CHECK: vmov.i32 q9, #0x3F000000
273+
; CHECK: vmov d0, d17
274+
; CHECK: vmla.f32 q8, q8, d0[0]
275275
%0 = shufflevector <4 x float> zeroinitializer, <4 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
276276
%1 = insertelement <4 x float> %0, float undef, i32 1 ; <<4 x float>> [#uses=1]
277277
%2 = insertelement <4 x float> %1, float undef, i32 2 ; <<4 x float>> [#uses=1]

llvm/test/CodeGen/ARM/spill-q.ll

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,26 @@ entry:
2020
%1 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1]
2121
store float 0.000000e+00, float* undef, align 4
2222
%2 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1]
23+
%ld3 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
24+
store float 0.000000e+00, float* undef, align 4
25+
%ld4 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
26+
store float 0.000000e+00, float* undef, align 4
27+
%ld5 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
28+
store float 0.000000e+00, float* undef, align 4
29+
%ld6 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
30+
store float 0.000000e+00, float* undef, align 4
31+
%ld7 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
32+
store float 0.000000e+00, float* undef, align 4
33+
%ld8 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
34+
store float 0.000000e+00, float* undef, align 4
35+
%ld9 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
36+
store float 0.000000e+00, float* undef, align 4
37+
%ld10 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
38+
store float 0.000000e+00, float* undef, align 4
39+
%ld11 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
40+
store float 0.000000e+00, float* undef, align 4
41+
%ld12 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
42+
store float 0.000000e+00, float* undef, align 4
2343
%val173 = load <4 x float>* undef ; <<4 x float>> [#uses=1]
2444
br label %bb4
2545

@@ -44,7 +64,16 @@ bb4: ; preds = %bb193, %entry
4464
%18 = fmul <4 x float> %17, %val173 ; <<4 x float>> [#uses=1]
4565
%19 = shufflevector <4 x float> %18, <4 x float> undef, <2 x i32> <i32 2, i32 3> ; <<2 x float>> [#uses=1]
4666
%20 = shufflevector <2 x float> %19, <2 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
47-
%21 = fadd <4 x float> zeroinitializer, %20 ; <<4 x float>> [#uses=2]
67+
%tmp1 = fadd <4 x float> %20, %ld3
68+
%tmp2 = fadd <4 x float> %tmp1, %ld4
69+
%tmp3 = fadd <4 x float> %tmp2, %ld5
70+
%tmp4 = fadd <4 x float> %tmp3, %ld6
71+
%tmp5 = fadd <4 x float> %tmp4, %ld7
72+
%tmp6 = fadd <4 x float> %tmp5, %ld8
73+
%tmp7 = fadd <4 x float> %tmp6, %ld9
74+
%tmp8 = fadd <4 x float> %tmp7, %ld10
75+
%tmp9 = fadd <4 x float> %tmp8, %ld11
76+
%21 = fadd <4 x float> %tmp9, %ld12
4877
%22 = fcmp ogt <4 x float> %besterror.0.2264, %21 ; <<4 x i1>> [#uses=0]
4978
%tmp = extractelement <4 x i1> %22, i32 0
5079
br i1 %tmp, label %bb193, label %bb186

llvm/test/CodeGen/ARM/vcgt.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -161,9 +161,9 @@ define <4 x i32> @vacgtQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
161161
; rdar://7923010
162162
define <4 x i32> @vcgt_zext(<4 x float>* %A, <4 x float>* %B) nounwind {
163163
;CHECK: vcgt_zext:
164-
;CHECK: vcgt.f32 q0
165-
;CHECK: vmov.i32 q1, #0x1
166-
;CHECK: vand q0, q0, q1
164+
;CHECK: vcgt.f32 q8
165+
;CHECK: vmov.i32 q9, #0x1
166+
;CHECK: vand q8, q8, q9
167167
%tmp1 = load <4 x float>* %A
168168
%tmp2 = load <4 x float>* %B
169169
%tmp3 = fcmp ogt <4 x float> %tmp1, %tmp2

llvm/test/CodeGen/ARM/vget_lane.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ define i32 @vgetQ_lanei32(<4 x i32>* %A) nounwind {
9696

9797
define arm_aapcs_vfpcc void @test_vget_laneu16() nounwind {
9898
entry:
99-
; CHECK: vmov.u16 r0, d0[1]
99+
; CHECK: vmov.u16 r0, d{{.*}}[1]
100100
%arg0_uint16x4_t = alloca <4 x i16> ; <<4 x i16>*> [#uses=1]
101101
%out_uint16_t = alloca i16 ; <i16*> [#uses=1]
102102
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
@@ -111,7 +111,7 @@ return: ; preds = %entry
111111

112112
define arm_aapcs_vfpcc void @test_vget_laneu8() nounwind {
113113
entry:
114-
; CHECK: vmov.u8 r0, d0[1]
114+
; CHECK: vmov.u8 r0, d{{.*}}[1]
115115
%arg0_uint8x8_t = alloca <8 x i8> ; <<8 x i8>*> [#uses=1]
116116
%out_uint8_t = alloca i8 ; <i8*> [#uses=1]
117117
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
@@ -126,7 +126,7 @@ return: ; preds = %entry
126126

127127
define arm_aapcs_vfpcc void @test_vgetQ_laneu16() nounwind {
128128
entry:
129-
; CHECK: vmov.u16 r0, d0[1]
129+
; CHECK: vmov.u16 r0, d{{.*}}[1]
130130
%arg0_uint16x8_t = alloca <8 x i16> ; <<8 x i16>*> [#uses=1]
131131
%out_uint16_t = alloca i16 ; <i16*> [#uses=1]
132132
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
@@ -141,7 +141,7 @@ return: ; preds = %entry
141141

142142
define arm_aapcs_vfpcc void @test_vgetQ_laneu8() nounwind {
143143
entry:
144-
; CHECK: vmov.u8 r0, d0[1]
144+
; CHECK: vmov.u8 r0, d{{.*}}[1]
145145
%arg0_uint8x16_t = alloca <16 x i8> ; <<16 x i8>*> [#uses=1]
146146
%out_uint8_t = alloca i8 ; <i8*> [#uses=1]
147147
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]

llvm/test/CodeGen/ARM/vld1.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
define <8 x i8> @vld1i8(i8* %A) nounwind {
44
;CHECK: vld1i8:
55
;Check the alignment value. Max for this instruction is 64 bits:
6-
;CHECK: vld1.8 {d0}, [r0, :64]
6+
;CHECK: vld1.8 {d16}, [r0, :64]
77
%tmp1 = call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %A, i32 16)
88
ret <8 x i8> %tmp1
99
}
@@ -43,15 +43,15 @@ define <1 x i64> @vld1i64(i64* %A) nounwind {
4343
define <16 x i8> @vld1Qi8(i8* %A) nounwind {
4444
;CHECK: vld1Qi8:
4545
;Check the alignment value. Max for this instruction is 128 bits:
46-
;CHECK: vld1.8 {d0, d1}, [r0, :64]
46+
;CHECK: vld1.8 {d16, d17}, [r0, :64]
4747
%tmp1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %A, i32 8)
4848
ret <16 x i8> %tmp1
4949
}
5050

5151
define <8 x i16> @vld1Qi16(i16* %A) nounwind {
5252
;CHECK: vld1Qi16:
5353
;Check the alignment value. Max for this instruction is 128 bits:
54-
;CHECK: vld1.16 {d0, d1}, [r0, :128]
54+
;CHECK: vld1.16 {d16, d17}, [r0, :128]
5555
%tmp0 = bitcast i16* %A to i8*
5656
%tmp1 = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %tmp0, i32 32)
5757
ret <8 x i16> %tmp1

0 commit comments

Comments
 (0)