Skip to content

Commit 4edadb5

Browse files
committed
Addressed comments
1 parent 4b18219 commit 4edadb5

File tree

2 files changed

+102
-63
lines changed

2 files changed

+102
-63
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -26678,8 +26678,8 @@ static SDValue performDUPCombine(SDNode *N,
2667826678
EVT MemVT = LD->getMemoryVT();
2667926679
EVT ElemVT = VT.getVectorElementType();
2668026680
if ((ExtType == ISD::EXTLOAD || ExtType == ISD::ZEXTLOAD) &&
26681-
(MemVT == MVT::i8 || MemVT == MVT::i16) && ElemVT != MemVT &&
26682-
LD->hasOneUse()) {
26681+
(MemVT == MVT::i8 || MemVT == MVT::i16 || MemVT == MVT::i32) &&
26682+
ElemVT != MemVT && LD->hasOneUse()) {
2668326683
EVT Vec128VT = EVT::getVectorVT(*DCI.DAG.getContext(), ElemVT,
2668426684
128 / ElemVT.getSizeInBits());
2668526685
SDValue ScalarToVec =

llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll

Lines changed: 100 additions & 61 deletions
Original file line number | Diff line number | Diff line change
@@ -4,30 +4,30 @@
44
; Test optimization of DUP with extended narrow loads
55
; This should avoid GPR->SIMD transfers by loading directly into vector registers
66

7-
define <4 x i32> @test_dup_zextload_i8_v4i32(ptr %p) {
8-
; CHECK-LABEL: test_dup_zextload_i8_v4i32:
7+
define <4 x i16> @test_dup_zextload_i8_v4i16(ptr %p) {
8+
; CHECK-LABEL: test_dup_zextload_i8_v4i16:
99
; CHECK: // %bb.0:
1010
; CHECK-NEXT: ldr b0, [x0]
11-
; CHECK-NEXT: dup v0.4s, v0.s[0]
11+
; CHECK-NEXT: dup v0.4h, v0.h[0]
1212
; CHECK-NEXT: ret
1313
%load = load i8, ptr %p, align 1
14-
%ext = zext i8 %load to i32
15-
%vec = insertelement <4 x i32> poison, i32 %ext, i32 0
16-
%dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
17-
ret <4 x i32> %dup
14+
%ext = zext i8 %load to i16
15+
%vec = insertelement <4 x i16> poison, i16 %ext, i32 0
16+
%dup = shufflevector <4 x i16> %vec, <4 x i16> poison, <4 x i32> zeroinitializer
17+
ret <4 x i16> %dup
1818
}
1919

20-
define <4 x i32> @test_dup_zextload_i16_v4i32(ptr %p) {
21-
; CHECK-LABEL: test_dup_zextload_i16_v4i32:
20+
define <8 x i16> @test_dup_zextload_i8_v8i16(ptr %p) {
21+
; CHECK-LABEL: test_dup_zextload_i8_v8i16:
2222
; CHECK: // %bb.0:
23-
; CHECK-NEXT: ldr h0, [x0]
24-
; CHECK-NEXT: dup v0.4s, v0.s[0]
23+
; CHECK-NEXT: ldr b0, [x0]
24+
; CHECK-NEXT: dup v0.8h, v0.h[0]
2525
; CHECK-NEXT: ret
26-
%load = load i16, ptr %p, align 2
27-
%ext = zext i16 %load to i32
28-
%vec = insertelement <4 x i32> poison, i32 %ext, i32 0
29-
%dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
30-
ret <4 x i32> %dup
26+
%load = load i8, ptr %p, align 1
27+
%ext = zext i8 %load to i16
28+
%vec = insertelement <8 x i16> poison, i16 %ext, i32 0
29+
%dup = shufflevector <8 x i16> %vec, <8 x i16> poison, <8 x i32> zeroinitializer
30+
ret <8 x i16> %dup
3131
}
3232

3333
define <2 x i32> @test_dup_zextload_i8_v2i32(ptr %p) {
@@ -43,43 +43,17 @@ define <2 x i32> @test_dup_zextload_i8_v2i32(ptr %p) {
4343
ret <2 x i32> %dup
4444
}
4545

46-
define <2 x i32> @test_dup_zextload_i16_v2i32(ptr %p) {
47-
; CHECK-LABEL: test_dup_zextload_i16_v2i32:
48-
; CHECK: // %bb.0:
49-
; CHECK-NEXT: ldr h0, [x0]
50-
; CHECK-NEXT: dup v0.2s, v0.s[0]
51-
; CHECK-NEXT: ret
52-
%load = load i16, ptr %p, align 2
53-
%ext = zext i16 %load to i32
54-
%vec = insertelement <2 x i32> poison, i32 %ext, i32 0
55-
%dup = shufflevector <2 x i32> %vec, <2 x i32> poison, <2 x i32> zeroinitializer
56-
ret <2 x i32> %dup
57-
}
58-
59-
define <8 x i16> @test_dup_zextload_i8_v8i16(ptr %p) {
60-
; CHECK-LABEL: test_dup_zextload_i8_v8i16:
61-
; CHECK: // %bb.0:
62-
; CHECK-NEXT: ldr b0, [x0]
63-
; CHECK-NEXT: dup v0.8h, v0.h[0]
64-
; CHECK-NEXT: ret
65-
%load = load i8, ptr %p, align 1
66-
%ext = zext i8 %load to i16
67-
%vec = insertelement <8 x i16> poison, i16 %ext, i32 0
68-
%dup = shufflevector <8 x i16> %vec, <8 x i16> poison, <8 x i32> zeroinitializer
69-
ret <8 x i16> %dup
70-
}
71-
72-
define <4 x i16> @test_dup_zextload_i8_v4i16(ptr %p) {
73-
; CHECK-LABEL: test_dup_zextload_i8_v4i16:
46+
define <4 x i32> @test_dup_zextload_i8_v4i32(ptr %p) {
47+
; CHECK-LABEL: test_dup_zextload_i8_v4i32:
7448
; CHECK: // %bb.0:
7549
; CHECK-NEXT: ldr b0, [x0]
76-
; CHECK-NEXT: dup v0.4h, v0.h[0]
50+
; CHECK-NEXT: dup v0.4s, v0.s[0]
7751
; CHECK-NEXT: ret
7852
%load = load i8, ptr %p, align 1
79-
%ext = zext i8 %load to i16
80-
%vec = insertelement <4 x i16> poison, i16 %ext, i32 0
81-
%dup = shufflevector <4 x i16> %vec, <4 x i16> poison, <4 x i32> zeroinitializer
82-
ret <4 x i16> %dup
53+
%ext = zext i8 %load to i32
54+
%vec = insertelement <4 x i32> poison, i32 %ext, i32 0
55+
%dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
56+
ret <4 x i32> %dup
8357
}
8458

8559
define <4 x i32> @test_dup_zextload_i8_v4i32_offset(ptr %p) {
@@ -96,29 +70,68 @@ define <4 x i32> @test_dup_zextload_i8_v4i32_offset(ptr %p) {
9670
ret <4 x i32> %dup
9771
}
9872

99-
define <4 x i32> @test_dup_zextload_i16_v4i32_offset(ptr %p) {
100-
; CHECK-LABEL: test_dup_zextload_i16_v4i32_offset:
73+
define <4 x i32> @test_dup_zextload_i8_v4i32_reg_offset(ptr %p, i64 %offset) {
74+
; CHECK-LABEL: test_dup_zextload_i8_v4i32_reg_offset:
10175
; CHECK: // %bb.0:
102-
; CHECK-NEXT: ldr h0, [x0, #8]
76+
; CHECK-NEXT: ldr b0, [x0, x1]
10377
; CHECK-NEXT: dup v0.4s, v0.s[0]
10478
; CHECK-NEXT: ret
105-
%addr = getelementptr inbounds i16, ptr %p, i64 4
106-
%load = load i16, ptr %addr, align 2
79+
%addr = getelementptr inbounds i8, ptr %p, i64 %offset
80+
%load = load i8, ptr %addr, align 1
81+
%ext = zext i8 %load to i32
82+
%vec = insertelement <4 x i32> poison, i32 %ext, i32 0
83+
%dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
84+
ret <4 x i32> %dup
85+
}
86+
87+
define <2 x i64> @test_dup_zextload_i8_v2i64(ptr %p) {
88+
; CHECK-LABEL: test_dup_zextload_i8_v2i64:
89+
; CHECK: // %bb.0:
90+
; CHECK-NEXT: ldr b0, [x0]
91+
; CHECK-NEXT: dup v0.2d, v0.d[0]
92+
; CHECK-NEXT: ret
93+
%load = load i8, ptr %p, align 1
94+
%ext = zext i8 %load to i64
95+
%vec = insertelement <2 x i64> poison, i64 %ext, i32 0
96+
%dup = shufflevector <2 x i64> %vec, <2 x i64> poison, <2 x i32> zeroinitializer
97+
ret <2 x i64> %dup
98+
}
99+
100+
define <2 x i32> @test_dup_zextload_i16_v2i32(ptr %p) {
101+
; CHECK-LABEL: test_dup_zextload_i16_v2i32:
102+
; CHECK: // %bb.0:
103+
; CHECK-NEXT: ldr h0, [x0]
104+
; CHECK-NEXT: dup v0.2s, v0.s[0]
105+
; CHECK-NEXT: ret
106+
%load = load i16, ptr %p, align 1
107+
%ext = zext i16 %load to i32
108+
%vec = insertelement <2 x i32> poison, i32 %ext, i32 0
109+
%dup = shufflevector <2 x i32> %vec, <2 x i32> poison, <2 x i32> zeroinitializer
110+
ret <2 x i32> %dup
111+
}
112+
113+
define <4 x i32> @test_dup_zextload_i16_v4i32(ptr %p) {
114+
; CHECK-LABEL: test_dup_zextload_i16_v4i32:
115+
; CHECK: // %bb.0:
116+
; CHECK-NEXT: ldr h0, [x0]
117+
; CHECK-NEXT: dup v0.4s, v0.s[0]
118+
; CHECK-NEXT: ret
119+
%load = load i16, ptr %p, align 1
107120
%ext = zext i16 %load to i32
108121
%vec = insertelement <4 x i32> poison, i32 %ext, i32 0
109122
%dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
110123
ret <4 x i32> %dup
111124
}
112125

113-
define <4 x i32> @test_dup_zextload_i8_v4i32_reg_offset(ptr %p, i64 %offset) {
114-
; CHECK-LABEL: test_dup_zextload_i8_v4i32_reg_offset:
126+
define <4 x i32> @test_dup_zextload_i16_v4i32_offset(ptr %p) {
127+
; CHECK-LABEL: test_dup_zextload_i16_v4i32_offset:
115128
; CHECK: // %bb.0:
116-
; CHECK-NEXT: ldr b0, [x0, x1]
129+
; CHECK-NEXT: ldr h0, [x0, #8]
117130
; CHECK-NEXT: dup v0.4s, v0.s[0]
118131
; CHECK-NEXT: ret
119-
%addr = getelementptr inbounds i8, ptr %p, i64 %offset
120-
%load = load i8, ptr %addr, align 1
121-
%ext = zext i8 %load to i32
132+
%addr = getelementptr inbounds i16, ptr %p, i64 4
133+
%load = load i16, ptr %addr, align 1
134+
%ext = zext i16 %load to i32
122135
%vec = insertelement <4 x i32> poison, i32 %ext, i32 0
123136
%dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
124137
ret <4 x i32> %dup
@@ -131,9 +144,35 @@ define <4 x i32> @test_dup_zextload_i16_v4i32_reg_offset(ptr %p, i64 %offset) {
131144
; CHECK-NEXT: dup v0.4s, v0.s[0]
132145
; CHECK-NEXT: ret
133146
%addr = getelementptr inbounds i16, ptr %p, i64 %offset
134-
%load = load i16, ptr %addr, align 2
147+
%load = load i16, ptr %addr, align 1
135148
%ext = zext i16 %load to i32
136149
%vec = insertelement <4 x i32> poison, i32 %ext, i32 0
137150
%dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
138151
ret <4 x i32> %dup
139152
}
153+
154+
define <2 x i64> @test_dup_zextload_i16_v2i64(ptr %p) {
155+
; CHECK-LABEL: test_dup_zextload_i16_v2i64:
156+
; CHECK: // %bb.0:
157+
; CHECK-NEXT: ldr h0, [x0]
158+
; CHECK-NEXT: dup v0.2d, v0.d[0]
159+
; CHECK-NEXT: ret
160+
%load = load i16, ptr %p, align 1
161+
%ext = zext i16 %load to i64
162+
%vec = insertelement <2 x i64> poison, i64 %ext, i32 0
163+
%dup = shufflevector <2 x i64> %vec, <2 x i64> poison, <2 x i32> zeroinitializer
164+
ret <2 x i64> %dup
165+
}
166+
167+
define <2 x i64> @test_dup_zextload_i32_v2i64(ptr %p) {
168+
; CHECK-LABEL: test_dup_zextload_i32_v2i64:
169+
; CHECK: // %bb.0:
170+
; CHECK-NEXT: ldr s0, [x0]
171+
; CHECK-NEXT: dup v0.2d, v0.d[0]
172+
; CHECK-NEXT: ret
173+
%load = load i32, ptr %p, align 1
174+
%ext = zext i32 %load to i64
175+
%vec = insertelement <2 x i64> poison, i64 %ext, i32 0
176+
%dup = shufflevector <2 x i64> %vec, <2 x i64> poison, <2 x i32> zeroinitializer
177+
ret <2 x i64> %dup
178+
}

0 commit comments

Comments
 (0)