Skip to content

Commit 15600f9

Browse files
committed
[LoadStoreVectorizer] Propagate alignment through contiguous chain to improve vectorization
1 parent a76448c commit 15600f9

File tree

2 files changed

+331
-0
lines changed

2 files changed

+331
-0
lines changed

llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,9 @@ class Vectorizer {
343343
/// Postcondition: For all i, ret[i][0].second == 0, because the first instr
344344
/// in the chain is the leader, and an instr touches distance 0 from itself.
345345
std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
346+
347+
/// Uses the best-aligned access in a chain of contiguous accesses to raise the
/// alignment of other accesses in that chain.
348+
void propagateBestAlignmentsInChain(ArrayRef<ChainElem> C) const;
346349
};
347350

348351
class LoadStoreVectorizerLegacyPass : public FunctionPass {
@@ -716,6 +719,14 @@ std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
716719
unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
717720
unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8;
718721

722+
// We know that the accesses are contiguous. Propagate alignment
723+
// information so that slices of the chain can still be vectorized.
724+
propagateBestAlignmentsInChain(C);
725+
LLVM_DEBUG({
726+
dbgs() << "LSV: Chain after alignment propagation:\n";
727+
dumpChain(C);
728+
});
729+
719730
std::vector<Chain> Ret;
720731
for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
721732
// Find candidate chains of size not greater than the largest vector reg.
@@ -1634,3 +1645,27 @@ std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
16341645
.sextOrTrunc(OrigBitWidth);
16351646
return std::nullopt;
16361647
}
1648+
1649+
/// Raises the alignment recorded on the accesses of the contiguous chain \p C
/// to the strongest value implied by another access of the same chain.
///
/// If one access is known to be aligned to A, and a second access sits a
/// constant D bytes away from it, then the second access is aligned to at
/// least the largest power of two that divides both A and D. This holds
/// whether the second access lies after or before the first, so the chain is
/// swept once in each direction.
///
/// Alignments are only ever increased; an access whose existing alignment is
/// already at least as strong as the derived one is left untouched. An empty
/// chain is a no-op.
///
/// NOTE(review): raising the alignment of an access that precedes the
/// best-aligned one assumes that every access of the chain is executed
/// whenever any of them is -- confirm against how chains are formed.
void Vectorizer::propagateBestAlignmentsInChain(ArrayRef<ChainElem> C) const {
  if (C.empty())
    return;

  // One directional sweep over [Begin, End): track the best-aligned element
  // seen so far and derive an alignment for each element from it.
  auto PropagateAlignments = [](auto Begin, auto End) {
    // Track the element by address instead of copying it (it carries an APInt).
    const ChainElem *BestAlignedElem = &*Begin;
    Align BestAlignSoFar = getLoadStoreAlignment(BestAlignedElem->Inst);

    for (auto It = Begin; It != End; ++It) {
      const ChainElem &E = *It;
      Align OrigAlign = getLoadStoreAlignment(E.Inst);
      if (OrigAlign > BestAlignSoFar) {
        // E becomes the new reference point; nothing can improve it here.
        BestAlignedElem = &E;
        BestAlignSoFar = OrigAlign;
        continue;
      }

      // Only the magnitude of the distance matters for alignment, so take
      // the absolute value rather than relying on the sweep direction or on
      // the chain's sort order.
      APInt DistanceFromBestAlignedElem =
          (E.OffsetFromLeader - BestAlignedElem->OffsetFromLeader).abs();
      // commonAlignment is equivalent to a greatest common power-of-two
      // divisor; it returns the largest power of 2 that divides both A and B.
      Align NewAlign = commonAlignment(
          BestAlignSoFar, DistanceFromBestAlignedElem.getLimitedValue());
      if (NewAlign > OrigAlign)
        setLoadStoreAlignment(E.Inst, NewAlign);
    }
  };

  // Forward sweep: elements after a well-aligned access.
  PropagateAlignments(C.begin(), C.end());
  // Backward sweep: elements before a well-aligned access.
  PropagateAlignments(C.rbegin(), C.rend());
}
Lines changed: 296 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,296 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt -passes=load-store-vectorizer -S < %s | FileCheck %s
3+
4+
; The IR has the first float3 labeled with align 16, and that 16 should
5+
; be propagated such that the second set of 4 values
6+
; can also be vectorized together.
7+
%struct.float3 = type { float, float, float }
8+
%struct.S1 = type { %struct.float3, %struct.float3, i32, i32 }
9+
10+
; Eight scalar 4-byte stores through %struct.float3 / %struct.S1 GEPs off %1;
; only the first store (directly to %1) is marked align 16.
; The CHECK lines expect exactly two vector stores, a <4 x float> at %1 and a
; <4 x i32> at %getElem10, both with align 16.
define void @testStore(ptr nocapture writeonly %1) {
11+
; CHECK-LABEL: define void @testStore(
12+
; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
13+
; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[TMP0]], align 16
14+
; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], ptr [[TMP0]], i64 0, i32 1, i32 1
15+
; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GETELEM10]], align 16
16+
; CHECK-NEXT: ret void
17+
;
18+
store float 0.000000e+00, ptr %1, align 16
19+
%getElem = getelementptr inbounds %struct.float3, ptr %1, i64 0, i32 1
20+
store float 0.000000e+00, ptr %getElem, align 4
21+
%getElem8 = getelementptr inbounds %struct.float3, ptr %1, i64 0, i32 2
22+
store float 0.000000e+00, ptr %getElem8, align 8
23+
%getElem9 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1
24+
store float 0.000000e+00, ptr %getElem9, align 4
25+
%getElem10 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1, i32 1
26+
store float 0.000000e+00, ptr %getElem10, align 4
27+
%getElem11 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1, i32 2
28+
store float 0.000000e+00, ptr %getElem11, align 4
29+
%getElem12 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 2
30+
store i32 0, ptr %getElem12, align 8
31+
%getElem13 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 3
32+
store i32 0, ptr %getElem13, align 4
33+
ret void
34+
}
35+
36+
; Load counterpart of @testStore: eight scalar 4-byte loads through
; %struct.float3 / %struct.S1 GEPs off %1; only the first load is align 16.
; The CHECK lines expect two vector loads (<4 x float> at %1 and <4 x i32> at
; %getElem10), both align 16, followed by per-lane extractelement/bitcast.
; NOTE(review): %1 is declared writeonly although this function only loads
; from it -- confirm the attribute is intended.
define void @testLoad(ptr nocapture writeonly %1) {
36+
; CHECK-LABEL: define void @testLoad(
37+
; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
38+
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0]], align 16
39+
; CHECK-NEXT: [[L11:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
40+
; CHECK-NEXT: [[L22:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
41+
; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
42+
; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
43+
; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], ptr [[TMP0]], i64 0, i32 1, i32 1
44+
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[GETELEM10]], align 16
45+
; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
46+
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[L55]] to float
47+
; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
48+
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[L66]] to float
49+
; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
50+
; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
51+
; CHECK-NEXT: ret void
52+
;
53+
%l1 = load float, ptr %1, align 16
54+
%getElem = getelementptr inbounds %struct.float3, ptr %1, i64 0, i32 1
55+
%l2 = load float, ptr %getElem, align 4
56+
%getElem8 = getelementptr inbounds %struct.float3, ptr %1, i64 0, i32 2
57+
%l3 = load float, ptr %getElem8, align 8
58+
%getElem9 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1
59+
%l4 = load float, ptr %getElem9, align 4
60+
%getElem10 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1, i32 1
61+
%l5 = load float, ptr %getElem10, align 4
62+
%getElem11 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 1, i32 2
63+
%l6 = load float, ptr %getElem11, align 4
64+
%getElem12 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 2
65+
%l7 = load i32, ptr %getElem12, align 8
66+
%getElem13 = getelementptr inbounds %struct.S1, ptr %1, i64 0, i32 3
67+
%l8 = load i32, ptr %getElem13, align 4
68+
ret void
69+
}
71+
72+
; Also, test without the struct geps, to see if it still works with i8 geps/ptradd
73+
74+
; Same store pattern as @testStore, but addressed with i8 GEPs at byte offsets
; 4, 8, ..., 28 from %1; only the store at offset 0 is marked align 16.
; The CHECK lines expect a <4 x float> store at %1 and a <4 x i32> store at
; byte offset 16, both with align 16.
define void @testStorei8(ptr nocapture writeonly %1) {
75+
; CHECK-LABEL: define void @testStorei8(
76+
; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
77+
; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[TMP0]], align 16
78+
; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
79+
; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GETELEM10]], align 16
80+
; CHECK-NEXT: ret void
81+
;
82+
store float 0.000000e+00, ptr %1, align 16
83+
%getElem = getelementptr inbounds i8, ptr %1, i64 4
84+
store float 0.000000e+00, ptr %getElem, align 4
85+
%getElem8 = getelementptr inbounds i8, ptr %1, i64 8
86+
store float 0.000000e+00, ptr %getElem8, align 8
87+
%getElem9 = getelementptr inbounds i8, ptr %1, i64 12
88+
store float 0.000000e+00, ptr %getElem9, align 4
89+
%getElem10 = getelementptr inbounds i8, ptr %1, i64 16
90+
store float 0.000000e+00, ptr %getElem10, align 4
91+
%getElem11 = getelementptr inbounds i8, ptr %1, i64 20
92+
store float 0.000000e+00, ptr %getElem11, align 4
93+
%getElem12 = getelementptr inbounds i8, ptr %1, i64 24
94+
store i32 0, ptr %getElem12, align 8
95+
%getElem13 = getelementptr inbounds i8, ptr %1, i64 28
96+
store i32 0, ptr %getElem13, align 4
97+
ret void
98+
}
99+
100+
; Load counterpart of @testStorei8: eight scalar 4-byte loads at i8-GEP byte
; offsets 0, 4, ..., 28 from %1; only the load at offset 0 is marked align 16.
; The CHECK lines expect a <4 x float> load at %1 and a <4 x i32> load at byte
; offset 16, both align 16, followed by per-lane extractelement/bitcast.
; NOTE(review): %1 is declared writeonly although this function only loads
; from it -- confirm the attribute is intended.
define void @testLoadi8(ptr nocapture writeonly %1) {
100+
; CHECK-LABEL: define void @testLoadi8(
101+
; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
102+
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0]], align 16
103+
; CHECK-NEXT: [[L11:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
104+
; CHECK-NEXT: [[L22:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
105+
; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
106+
; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
107+
; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
108+
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[GETELEM10]], align 16
109+
; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
110+
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[L55]] to float
111+
; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
112+
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[L66]] to float
113+
; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
114+
; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
115+
; CHECK-NEXT: ret void
116+
;
117+
%l1 = load float, ptr %1, align 16
118+
%getElem = getelementptr inbounds i8, ptr %1, i64 4
119+
%l2 = load float, ptr %getElem, align 4
120+
%getElem8 = getelementptr inbounds i8, ptr %1, i64 8
121+
%l3 = load float, ptr %getElem8, align 8
122+
%getElem9 = getelementptr inbounds i8, ptr %1, i64 12
123+
%l4 = load float, ptr %getElem9, align 4
124+
%getElem10 = getelementptr inbounds i8, ptr %1, i64 16
125+
%l5 = load float, ptr %getElem10, align 4
126+
%getElem11 = getelementptr inbounds i8, ptr %1, i64 20
127+
%l6 = load float, ptr %getElem11, align 4
128+
%getElem12 = getelementptr inbounds i8, ptr %1, i64 24
129+
%l7 = load i32, ptr %getElem12, align 8
130+
%getElem13 = getelementptr inbounds i8, ptr %1, i64 28
131+
%l8 = load i32, ptr %getElem13, align 4
132+
ret void
133+
}
135+
136+
137+
; This version of the test adjusts the struct to hold two i32s at the beginning,
138+
; but still assumes that the first float3 is 16 aligned. If the alignment
139+
; propagation works correctly, it should be able to load this struct in three
140+
; loads: a 2x32, a 4x32, and a 4x32. Without the alignment propagation, the last
141+
; 4x32 will instead be a 2x32 and a 2x32
142+
%struct.S2 = type { i32, i32, %struct.float3, %struct.float3, i32, i32 }
143+
144+
; Ten scalar 4-byte stores through %struct.S2 GEPs off %1. The first store (to
; %1) is align 8; the align 16 marker sits on the third store (%getElem1),
; i.e. in the middle of the chain rather than at its start.
; The CHECK lines expect three vector stores: <2 x i32> align 8 at %1,
; <4 x float> align 16 at %getElem1, and <4 x i32> align 16 at %getElem10.
define void @testStore_2(ptr nocapture writeonly %1) {
145+
; CHECK-LABEL: define void @testStore_2(
146+
; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
147+
; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP0]], align 8
148+
; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds [[STRUCT_S2:%.*]], ptr [[TMP0]], i64 0, i32 2
149+
; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[GETELEM1]], align 16
150+
; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds [[STRUCT_S2]], ptr [[TMP0]], i64 0, i32 3, i32 1
151+
; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GETELEM10]], align 16
152+
; CHECK-NEXT: ret void
153+
;
154+
store i32 0, ptr %1, align 8
155+
%getElem = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 1
156+
store i32 0, ptr %getElem, align 4
157+
%getElem1 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2
158+
store float 0.000000e+00, ptr %getElem1, align 16
159+
%getElem2 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2, i32 1
160+
store float 0.000000e+00, ptr %getElem2, align 4
161+
%getElem8 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2, i32 2
162+
store float 0.000000e+00, ptr %getElem8, align 8
163+
%getElem9 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3
164+
store float 0.000000e+00, ptr %getElem9, align 4
165+
%getElem10 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3, i32 1
166+
store float 0.000000e+00, ptr %getElem10, align 4
167+
%getElem11 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3, i32 2
168+
store float 0.000000e+00, ptr %getElem11, align 4
169+
%getElem12 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 4
170+
store i32 0, ptr %getElem12, align 8
171+
%getElem13 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 5
172+
store i32 0, ptr %getElem13, align 4
173+
ret void
174+
}
175+
176+
; Load counterpart of @testStore_2: ten scalar 4-byte loads through %struct.S2
; GEPs off %1, with align 8 on the first load and align 16 on the third
; (%getElem1).
; The CHECK lines expect three vector loads: <2 x i32> align 8 at %1,
; <4 x float> align 16 at %getElem1, and <4 x i32> align 16 at %getElem10,
; each followed by per-lane extractelement/bitcast.
; NOTE(review): %1 is declared writeonly although this function only loads
; from it -- confirm the attribute is intended.
define void @testLoad_2(ptr nocapture writeonly %1) {
177+
; CHECK-LABEL: define void @testLoad_2(
178+
; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
179+
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP0]], align 8
180+
; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
181+
; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
182+
; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds [[STRUCT_S2:%.*]], ptr [[TMP0]], i64 0, i32 2
183+
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[GETELEM1]], align 16
184+
; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
185+
; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
186+
; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
187+
; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
188+
; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds [[STRUCT_S2]], ptr [[TMP0]], i64 0, i32 3, i32 1
189+
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[GETELEM10]], align 16
190+
; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
191+
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[L77]] to float
192+
; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
193+
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[L88]] to float
194+
; CHECK-NEXT: [[L99:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
195+
; CHECK-NEXT: [[L010:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
196+
; CHECK-NEXT: ret void
197+
;
198+
%l = load i32, ptr %1, align 8
199+
%getElem = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 1
200+
%l2 = load i32, ptr %getElem, align 4
201+
%getElem1 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2
202+
%l3 = load float, ptr %getElem1, align 16
203+
%getElem2 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2, i32 1
204+
%l4 = load float, ptr %getElem2, align 4
205+
%getElem8 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 2, i32 2
206+
%l5 = load float, ptr %getElem8, align 8
207+
%getElem9 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3
208+
%l6 = load float, ptr %getElem9, align 4
209+
%getElem10 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3, i32 1
210+
%l7 = load float, ptr %getElem10, align 4
211+
%getElem11 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 3, i32 2
212+
%l8 = load float, ptr %getElem11, align 4
213+
%getElem12 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 4
214+
%l9 = load i32, ptr %getElem12, align 8
215+
%getElem13 = getelementptr inbounds %struct.S2, ptr %1, i64 0, i32 5
216+
%l0 = load i32, ptr %getElem13, align 4
217+
ret void
218+
}
219+
220+
; Also, test without the struct geps, to see if it still works with i8 geps/ptradd
221+
222+
; Same store pattern as @testStore_2, but addressed with i8 GEPs at byte
; offsets 4, 8, ..., 36 from %1. The store at offset 0 is align 8 and the
; store at offset 8 is align 16.
; The CHECK lines expect <2 x i32> align 8 at %1, <4 x float> align 16 at byte
; offset 8, and <4 x i32> align 16 at byte offset 24.
define void @testStorei8_2(ptr nocapture writeonly %1) {
223+
; CHECK-LABEL: define void @testStorei8_2(
224+
; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
225+
; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP0]], align 8
226+
; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
227+
; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[GETELEM1]], align 16
228+
; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
229+
; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GETELEM10]], align 16
230+
; CHECK-NEXT: ret void
231+
;
232+
store i32 0, ptr %1, align 8
233+
%getElem = getelementptr inbounds i8, ptr %1, i64 4
234+
store i32 0, ptr %getElem, align 4
235+
%getElem1 = getelementptr inbounds i8, ptr %1, i64 8
236+
store float 0.000000e+00, ptr %getElem1, align 16
237+
%getElem2 = getelementptr inbounds i8, ptr %1, i64 12
238+
store float 0.000000e+00, ptr %getElem2, align 4
239+
%getElem8 = getelementptr inbounds i8, ptr %1, i64 16
240+
store float 0.000000e+00, ptr %getElem8, align 8
241+
%getElem9 = getelementptr inbounds i8, ptr %1, i64 20
242+
store float 0.000000e+00, ptr %getElem9, align 4
243+
%getElem10 = getelementptr inbounds i8, ptr %1, i64 24
244+
store float 0.000000e+00, ptr %getElem10, align 4
245+
%getElem11 = getelementptr inbounds i8, ptr %1, i64 28
246+
store float 0.000000e+00, ptr %getElem11, align 4
247+
%getElem12 = getelementptr inbounds i8, ptr %1, i64 32
248+
store i32 0, ptr %getElem12, align 8
249+
%getElem13 = getelementptr inbounds i8, ptr %1, i64 36
250+
store i32 0, ptr %getElem13, align 4
251+
ret void
252+
}
253+
254+
; Load counterpart of @testStorei8_2: ten scalar 4-byte loads at i8-GEP byte
; offsets 0, 4, ..., 36 from %1, with align 8 at offset 0 and align 16 at
; offset 8.
; The CHECK lines expect <2 x i32> align 8 at %1, <4 x float> align 16 at byte
; offset 8, and <4 x i32> align 16 at byte offset 24, each followed by
; per-lane extractelement/bitcast.
; NOTE(review): %1 is declared writeonly although this function only loads
; from it -- confirm the attribute is intended.
define void @testLoadi8_2(ptr nocapture writeonly %1) {
255+
; CHECK-LABEL: define void @testLoadi8_2(
256+
; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
257+
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP0]], align 8
258+
; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
259+
; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
260+
; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
261+
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[GETELEM1]], align 16
262+
; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
263+
; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
264+
; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
265+
; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
266+
; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
267+
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[GETELEM10]], align 16
268+
; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
269+
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[L77]] to float
270+
; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
271+
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[L88]] to float
272+
; CHECK-NEXT: [[L99:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
273+
; CHECK-NEXT: [[L010:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
274+
; CHECK-NEXT: ret void
275+
;
276+
%l = load i32, ptr %1, align 8
277+
%getElem = getelementptr inbounds i8, ptr %1, i64 4
278+
%l2 = load i32, ptr %getElem, align 4
279+
%getElem1 = getelementptr inbounds i8, ptr %1, i64 8
280+
%l3 = load float, ptr %getElem1, align 16
281+
%getElem2 = getelementptr inbounds i8, ptr %1, i64 12
282+
%l4 = load float, ptr %getElem2, align 4
283+
%getElem8 = getelementptr inbounds i8, ptr %1, i64 16
284+
%l5 = load float, ptr %getElem8, align 8
285+
%getElem9 = getelementptr inbounds i8, ptr %1, i64 20
286+
%l6 = load float, ptr %getElem9, align 4
287+
%getElem10 = getelementptr inbounds i8, ptr %1, i64 24
288+
%l7 = load float, ptr %getElem10, align 4
289+
%getElem11 = getelementptr inbounds i8, ptr %1, i64 28
290+
%l8 = load float, ptr %getElem11, align 4
291+
%getElem12 = getelementptr inbounds i8, ptr %1, i64 32
292+
%l9 = load i32, ptr %getElem12, align 8
293+
%getElem13 = getelementptr inbounds i8, ptr %1, i64 36
294+
%l0 = load i32, ptr %getElem13, align 4
295+
ret void
296+
}

0 commit comments

Comments
 (0)