44; Test optimization of DUP with extended narrow loads
55; This should avoid GPR->SIMD transfers by loading directly into vector registers
66
7- define <4 x i32 > @test_dup_zextload_i8_v4i32 (ptr %p ) {
8- ; CHECK-LABEL: test_dup_zextload_i8_v4i32 :
7+ define <4 x i16 > @test_dup_zextload_i8_v4i16 (ptr %p ) {  ; splat a zero-extended i8 load across all four i16 lanes
8+ ; CHECK-LABEL: test_dup_zextload_i8_v4i16 :
99; CHECK: // %bb.0:
1010; CHECK-NEXT: ldr b0, [x0]
11- ; CHECK-NEXT: dup v0.4s , v0.s [0]
11+ ; CHECK-NEXT: dup v0.4h , v0.h [0]
1212; CHECK-NEXT: ret
1313 %load = load i8 , ptr %p , align 1  ; scalar byte read from %p; shared by the old and new variant
14- %ext = zext i8 %load to i32
15- %vec = insertelement <4 x i32 > poison, i32 %ext , i32 0
16- %dup = shufflevector <4 x i32 > %vec , <4 x i32 > poison, <4 x i32 > zeroinitializer
17- ret <4 x i32 > %dup
14+ %ext = zext i8 %load to i16  ; widen to the i16 element type with zero fill
15+ %vec = insertelement <4 x i16 > poison, i16 %ext , i32 0  ; only lane 0 is defined, the other lanes stay poison
16+ %dup = shufflevector <4 x i16 > %vec , <4 x i16 > poison, <4 x i32 > zeroinitializer  ; all-zero mask copies lane 0 into every lane
17+ ret <4 x i16 > %dup
1818}
1919
20- define <4 x i32 > @test_dup_zextload_i16_v4i32 (ptr %p ) {
21- ; CHECK-LABEL: test_dup_zextload_i16_v4i32 :
20+ define <8 x i16 > @test_dup_zextload_i8_v8i16 (ptr %p ) {  ; broadcast a zero-extended i8 load into eight i16 lanes
21+ ; CHECK-LABEL: test_dup_zextload_i8_v8i16 :
2222; CHECK: // %bb.0:
23- ; CHECK-NEXT: ldr h0 , [x0]
24- ; CHECK-NEXT: dup v0.4s , v0.s [0]
23+ ; CHECK-NEXT: ldr b0 , [x0]
24+ ; CHECK-NEXT: dup v0.8h , v0.h [0]
2525; CHECK-NEXT: ret
26- %load = load i16 , ptr %p , align 2
27- %ext = zext i16 %load to i32
28- %vec = insertelement <4 x i32 > poison, i32 %ext , i32 0
29- %dup = shufflevector <4 x i32 > %vec , <4 x i32 > poison, <4 x i32 > zeroinitializer
30- ret <4 x i32 > %dup
26+ %load = load i8 , ptr %p , align 1  ; one byte read directly from the pointer argument
27+ %ext = zext i8 %load to i16  ; i8 -> i16, upper bits zero
28+ %vec = insertelement <8 x i16 > poison, i16 %ext , i32 0  ; value goes into lane 0 of an otherwise poison vector
29+ %dup = shufflevector <8 x i16 > %vec , <8 x i16 > poison, <8 x i32 > zeroinitializer  ; eight-entry zero mask: every result lane selects lane 0
30+ ret <8 x i16 > %dup
3131}
3232
3333define <2 x i32 > @test_dup_zextload_i8_v2i32 (ptr %p ) {
@@ -43,43 +43,17 @@ define <2 x i32> @test_dup_zextload_i8_v2i32(ptr %p) {
4343 ret <2 x i32 > %dup
4444}
4545
46- define <2 x i32 > @test_dup_zextload_i16_v2i32 (ptr %p ) {  ; deleted at this position; a function with the same name is added further down with align 1 on the load
47- ; CHECK-LABEL: test_dup_zextload_i16_v2i32:
48- ; CHECK: // %bb.0:
49- ; CHECK-NEXT: ldr h0, [x0]
50- ; CHECK-NEXT: dup v0.2s, v0.s[0]
51- ; CHECK-NEXT: ret
52- %load = load i16 , ptr %p , align 2  ; the old variant declared 2-byte alignment
53- %ext = zext i16 %load to i32
54- %vec = insertelement <2 x i32 > poison, i32 %ext , i32 0
55- %dup = shufflevector <2 x i32 > %vec , <2 x i32 > poison, <2 x i32 > zeroinitializer
56- ret <2 x i32 > %dup
57- }
58-
59- define <8 x i16 > @test_dup_zextload_i8_v8i16 (ptr %p ) {  ; deleted at this position; a function with the same name is added earlier in this diff
60- ; CHECK-LABEL: test_dup_zextload_i8_v8i16:
61- ; CHECK: // %bb.0:
62- ; CHECK-NEXT: ldr b0, [x0]
63- ; CHECK-NEXT: dup v0.8h, v0.h[0]
64- ; CHECK-NEXT: ret
65- %load = load i8 , ptr %p , align 1
66- %ext = zext i8 %load to i16
67- %vec = insertelement <8 x i16 > poison, i16 %ext , i32 0
68- %dup = shufflevector <8 x i16 > %vec , <8 x i16 > poison, <8 x i32 > zeroinitializer  ; zero mask = splat of lane 0
69- ret <8 x i16 > %dup
70- }
71-
72- define <4 x i16 > @test_dup_zextload_i8_v4i16 (ptr %p ) {
73- ; CHECK-LABEL: test_dup_zextload_i8_v4i16:
46+ define <4 x i32 > @test_dup_zextload_i8_v4i32 (ptr %p ) {  ; i8 load, zero-extended to i32, replicated into <4 x i32>
47+ ; CHECK-LABEL: test_dup_zextload_i8_v4i32:
7448; CHECK: // %bb.0:
7549; CHECK-NEXT: ldr b0, [x0]
76- ; CHECK-NEXT: dup v0.4h , v0.h [0]
50+ ; CHECK-NEXT: dup v0.4s , v0.s [0]
7751; CHECK-NEXT: ret
7852 %load = load i8 , ptr %p , align 1  ; byte load kept as-is between the old and new variant
79- %ext = zext i8 %load to i16
80- %vec = insertelement <4 x i16 > poison, i16 %ext , i32 0
81- %dup = shufflevector <4 x i16 > %vec , <4 x i16 > poison, <4 x i32 > zeroinitializer
82- ret <4 x i16 > %dup
53+ %ext = zext i8 %load to i32  ; element type is now i32 instead of i16
54+ %vec = insertelement <4 x i32 > poison, i32 %ext , i32 0  ; seed lane 0; remaining lanes are poison
55+ %dup = shufflevector <4 x i32 > %vec , <4 x i32 > poison, <4 x i32 > zeroinitializer  ; mask of zeros selects lane 0 for all four results
56+ ret <4 x i32 > %dup
8357}
8458
8559define <4 x i32 > @test_dup_zextload_i8_v4i32_offset (ptr %p ) {
@@ -96,29 +70,68 @@ define <4 x i32> @test_dup_zextload_i8_v4i32_offset(ptr %p) {
9670 ret <4 x i32 > %dup
9771}
9872
99- define <4 x i32 > @test_dup_zextload_i16_v4i32_offset (ptr %p ) {
100- ; CHECK-LABEL: test_dup_zextload_i16_v4i32_offset :
73+ define <4 x i32 > @test_dup_zextload_i8_v4i32_reg_offset (ptr %p , i64 %offset ) {  ; splat of a zero-extended i8 loaded from %p plus a run-time offset
74+ ; CHECK-LABEL: test_dup_zextload_i8_v4i32_reg_offset :
10175; CHECK: // %bb.0:
102- ; CHECK-NEXT: ldr h0 , [x0, #8 ]
76+ ; CHECK-NEXT: ldr b0 , [x0, x1 ]
10377; CHECK-NEXT: dup v0.4s, v0.s[0]
10478; CHECK-NEXT: ret
105- %addr = getelementptr inbounds i16 , ptr %p , i64 4
106- %load = load i16 , ptr %addr , align 2
79+ %addr = getelementptr inbounds i8 , ptr %p , i64 %offset  ; i8 element type, so %offset is a byte offset from %p
80+ %load = load i8 , ptr %addr , align 1  ; byte read at the computed address
81+ %ext = zext i8 %load to i32  ; zero-extend to the i32 lane width
82+ %vec = insertelement <4 x i32 > poison, i32 %ext , i32 0  ; lane 0 carries the value, other lanes are poison
83+ %dup = shufflevector <4 x i32 > %vec , <4 x i32 > poison, <4 x i32 > zeroinitializer  ; zero mask broadcasts lane 0
84+ ret <4 x i32 > %dup
85+ }
86+
87+ define <2 x i64 > @test_dup_zextload_i8_v2i64 (ptr %p ) {  ; widest case for a byte: i8 load zero-extended to i64 and copied into both lanes
88+ ; CHECK-LABEL: test_dup_zextload_i8_v2i64:
89+ ; CHECK: // %bb.0:
90+ ; CHECK-NEXT: ldr b0, [x0]
91+ ; CHECK-NEXT: dup v0.2d, v0.d[0]
92+ ; CHECK-NEXT: ret
93+ %load = load i8 , ptr %p , align 1  ; single byte read from %p
94+ %ext = zext i8 %load to i64  ; i8 -> i64 with zero fill
95+ %vec = insertelement <2 x i64 > poison, i64 %ext , i32 0  ; lane 0 defined, lane 1 poison
96+ %dup = shufflevector <2 x i64 > %vec , <2 x i64 > poison, <2 x i32 > zeroinitializer  ; both result lanes take lane 0
97+ ret <2 x i64 > %dup
98+ }
99+
100+ define <2 x i32 > @test_dup_zextload_i16_v2i32 (ptr %p ) {  ; i16 load zero-extended to i32 and duplicated into a 2-lane vector
101+ ; CHECK-LABEL: test_dup_zextload_i16_v2i32:
102+ ; CHECK: // %bb.0:
103+ ; CHECK-NEXT: ldr h0, [x0]
104+ ; CHECK-NEXT: dup v0.2s, v0.s[0]
105+ ; CHECK-NEXT: ret
106+ %load = load i16 , ptr %p , align 1  ; declared alignment is 1, i.e. below the 2-byte size of i16
107+ %ext = zext i16 %load to i32  ; i16 -> i32 with zero fill
108+ %vec = insertelement <2 x i32 > poison, i32 %ext , i32 0  ; lane 0 defined, lane 1 poison
109+ %dup = shufflevector <2 x i32 > %vec , <2 x i32 > poison, <2 x i32 > zeroinitializer  ; zero mask: both lanes read lane 0
110+ ret <2 x i32 > %dup
111+ }
112+
113+ define <4 x i32 > @test_dup_zextload_i16_v4i32 (ptr %p ) {  ; i16 load zero-extended to i32 and splatted across four lanes
114+ ; CHECK-LABEL: test_dup_zextload_i16_v4i32:
115+ ; CHECK: // %bb.0:
116+ ; CHECK-NEXT: ldr h0, [x0]
117+ ; CHECK-NEXT: dup v0.4s, v0.s[0]
118+ ; CHECK-NEXT: ret
119+ %load = load i16 , ptr %p , align 1  ; halfword read with a declared alignment of only 1
107120 %ext = zext i16 %load to i32  ; unchanged context line: widen to the i32 lane type
108121 %vec = insertelement <4 x i32 > poison, i32 %ext , i32 0  ; put the scalar in lane 0 of a poison vector
109122 %dup = shufflevector <4 x i32 > %vec , <4 x i32 > poison, <4 x i32 > zeroinitializer  ; all-zero mask replicates lane 0
110123 ret <4 x i32 > %dup
111124}
112125
113- define <4 x i32 > @test_dup_zextload_i8_v4i32_reg_offset (ptr %p , i64 %offset ) {
114- ; CHECK-LABEL: test_dup_zextload_i8_v4i32_reg_offset :
126+ define <4 x i32 > @test_dup_zextload_i16_v4i32_offset (ptr %p ) {
127+ ; CHECK-LABEL: test_dup_zextload_i16_v4i32_offset :
115128; CHECK: // %bb.0:
116- ; CHECK-NEXT: ldr b0 , [x0, x1 ]
129+ ; CHECK-NEXT: ldr h0 , [x0, #8 ]
117130; CHECK-NEXT: dup v0.4s, v0.s[0]
118131; CHECK-NEXT: ret
119- %addr = getelementptr inbounds i8 , ptr %p , i64 %offset
120- %load = load i8 , ptr %addr , align 1
121- %ext = zext i8 %load to i32
132+ %addr = getelementptr inbounds i16 , ptr %p , i64 4
133+ %load = load i16 , ptr %addr , align 1
134+ %ext = zext i16 %load to i32
122135 %vec = insertelement <4 x i32 > poison, i32 %ext , i32 0
123136 %dup = shufflevector <4 x i32 > %vec , <4 x i32 > poison, <4 x i32 > zeroinitializer
124137 ret <4 x i32 > %dup
@@ -131,9 +144,35 @@ define <4 x i32> @test_dup_zextload_i16_v4i32_reg_offset(ptr %p, i64 %offset) {
131144; CHECK-NEXT: dup v0.4s, v0.s[0]
132145; CHECK-NEXT: ret
133146 %addr = getelementptr inbounds i16 , ptr %p , i64 %offset
134- %load = load i16 , ptr %addr , align 2
147+ %load = load i16 , ptr %addr , align 1
135148 %ext = zext i16 %load to i32
136149 %vec = insertelement <4 x i32 > poison, i32 %ext , i32 0
137150 %dup = shufflevector <4 x i32 > %vec , <4 x i32 > poison, <4 x i32 > zeroinitializer
138151 ret <4 x i32 > %dup
139152}
153+
154+ define <2 x i64 > @test_dup_zextload_i16_v2i64 (ptr %p ) {  ; i16 load zero-extended to i64 and copied into both i64 lanes
155+ ; CHECK-LABEL: test_dup_zextload_i16_v2i64:
156+ ; CHECK: // %bb.0:
157+ ; CHECK-NEXT: ldr h0, [x0]
158+ ; CHECK-NEXT: dup v0.2d, v0.d[0]
159+ ; CHECK-NEXT: ret
160+ %load = load i16 , ptr %p , align 1  ; halfword read; declared alignment 1
161+ %ext = zext i16 %load to i64  ; i16 -> i64 with zero fill
162+ %vec = insertelement <2 x i64 > poison, i64 %ext , i32 0  ; lane 0 defined, lane 1 poison
163+ %dup = shufflevector <2 x i64 > %vec , <2 x i64 > poison, <2 x i32 > zeroinitializer  ; both result lanes select lane 0
164+ ret <2 x i64 > %dup
165+ }
166+
167+ define <2 x i64 > @test_dup_zextload_i32_v2i64 (ptr %p ) {  ; i32 load zero-extended to i64 and copied into both i64 lanes
168+ ; CHECK-LABEL: test_dup_zextload_i32_v2i64:
169+ ; CHECK: // %bb.0:
170+ ; CHECK-NEXT: ldr s0, [x0]
171+ ; CHECK-NEXT: dup v0.2d, v0.d[0]
172+ ; CHECK-NEXT: ret
173+ %load = load i32 , ptr %p , align 1  ; word read; declared alignment 1
174+ %ext = zext i32 %load to i64  ; i32 -> i64 with zero fill
175+ %vec = insertelement <2 x i64 > poison, i64 %ext , i32 0  ; lane 0 defined, lane 1 poison
176+ %dup = shufflevector <2 x i64 > %vec , <2 x i64 > poison, <2 x i32 > zeroinitializer  ; zero mask: splat of lane 0
177+ ret <2 x i64 > %dup
178+ }
0 commit comments