Skip to content

Commit b905f1c

Browse files
committed
Address feedback, add reverse propagation, simplify and expand unit tests
1 parent 15600f9 commit b905f1c

File tree

2 files changed

+194
-35
lines changed

2 files changed

+194
-35
lines changed

llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1647,25 +1647,30 @@ std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
16471647
}
16481648

16491649
void Vectorizer::propagateBestAlignmentsInChain(ArrayRef<ChainElem> C) const {
1650-
ChainElem BestAlignedElem = C[0];
1651-
Align BestAlignSoFar = getLoadStoreAlignment(C[0].Inst);
1652-
1653-
for (const ChainElem &E : C) {
1654-
Align OrigAlign = getLoadStoreAlignment(E.Inst);
1655-
if (OrigAlign > BestAlignSoFar) {
1656-
BestAlignedElem = E;
1657-
BestAlignSoFar = OrigAlign;
1650+
auto PropagateAlignments = [](auto ChainIt) {
1651+
ChainElem BestAlignedElem = *ChainIt.begin();
1652+
Align BestAlignSoFar = getLoadStoreAlignment(BestAlignedElem.Inst);
1653+
1654+
for (const ChainElem &E : ChainIt) {
1655+
Align OrigAlign = getLoadStoreAlignment(E.Inst);
1656+
if (OrigAlign > BestAlignSoFar) {
1657+
BestAlignedElem = E;
1658+
BestAlignSoFar = OrigAlign;
1659+
continue;
1660+
}
1661+
1662+
APInt DeltaFromBestAlignedElem =
1663+
APIntOps::abdu(E.OffsetFromLeader, BestAlignedElem.OffsetFromLeader);
1664+
// commonAlignment is equivalent to a greatest common power-of-two
1665+
// divisor; it returns the largest power of 2 that divides both A and B.
1666+
Align NewAlign = commonAlignment(
1667+
BestAlignSoFar, DeltaFromBestAlignedElem.getLimitedValue());
1668+
if (NewAlign > OrigAlign)
1669+
setLoadStoreAlignment(E.Inst, NewAlign);
16581670
}
1671+
};
16591672

1660-
APInt OffsetFromBestAlignedElem =
1661-
E.OffsetFromLeader - BestAlignedElem.OffsetFromLeader;
1662-
assert(OffsetFromBestAlignedElem.isNonNegative());
1663-
// commonAlignment is equivalent to a greatest common power-of-two divisor;
1664-
// it returns the largest power of 2 that divides both A and B.
1665-
Align NewAlign = commonAlignment(
1666-
BestAlignSoFar, OffsetFromBestAlignedElem.getLimitedValue());
1667-
if (NewAlign > OrigAlign)
1668-
setLoadStoreAlignment(E.Inst, NewAlign);
1669-
}
1670-
return;
1673+
// Propagate forwards and backwards.
1674+
PropagateAlignments(C);
1675+
PropagateAlignments(reverse(C));
16711676
}

llvm/test/Transforms/LoadStoreVectorizer/prop-align.ll

Lines changed: 170 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@
77
%struct.float3 = type { float, float, float }
88
%struct.S1 = type { %struct.float3, %struct.float3, i32, i32 }
99

10-
define void @testStore(ptr nocapture writeonly %1) {
10+
define void @testStore(ptr %1) {
1111
; CHECK-LABEL: define void @testStore(
12-
; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
12+
; CHECK-SAME: ptr [[TMP0:%.*]]) {
1313
; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[TMP0]], align 16
1414
; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], ptr [[TMP0]], i64 0, i32 1, i32 1
1515
; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GETELEM10]], align 16
@@ -33,9 +33,9 @@ define void @testStore(ptr nocapture writeonly %1) {
3333
ret void
3434
}
3535

36-
define void @testLoad(ptr nocapture writeonly %1) {
36+
define void @testLoad(ptr %1) {
3737
; CHECK-LABEL: define void @testLoad(
38-
; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
38+
; CHECK-SAME: ptr [[TMP0:%.*]]) {
3939
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0]], align 16
4040
; CHECK-NEXT: [[L11:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
4141
; CHECK-NEXT: [[L22:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
@@ -71,9 +71,9 @@ define void @testLoad(ptr nocapture writeonly %1) {
7171

7272
; Also, test without the struct geps, to see if it still works with i8 geps/ptradd
7373

74-
define void @testStorei8(ptr nocapture writeonly %1) {
74+
define void @testStorei8(ptr %1) {
7575
; CHECK-LABEL: define void @testStorei8(
76-
; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
76+
; CHECK-SAME: ptr [[TMP0:%.*]]) {
7777
; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[TMP0]], align 16
7878
; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
7979
; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[GETELEM10]], align 16
@@ -97,9 +97,9 @@ define void @testStorei8(ptr nocapture writeonly %1) {
9797
ret void
9898
}
9999

100-
define void @testLoadi8(ptr nocapture writeonly %1) {
100+
define void @testLoadi8(ptr %1) {
101101
; CHECK-LABEL: define void @testLoadi8(
102-
; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
102+
; CHECK-SAME: ptr [[TMP0:%.*]]) {
103103
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0]], align 16
104104
; CHECK-NEXT: [[L11:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
105105
; CHECK-NEXT: [[L22:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
@@ -141,9 +141,9 @@ define void @testLoadi8(ptr nocapture writeonly %1) {
141141
; 4x32 will instead be a 2x32 and a 2x32
142142
%struct.S2 = type { i32, i32, %struct.float3, %struct.float3, i32, i32 }
143143

144-
define void @testStore_2(ptr nocapture writeonly %1) {
144+
define void @testStore_2(ptr %1) {
145145
; CHECK-LABEL: define void @testStore_2(
146-
; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
146+
; CHECK-SAME: ptr [[TMP0:%.*]]) {
147147
; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP0]], align 8
148148
; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds [[STRUCT_S2:%.*]], ptr [[TMP0]], i64 0, i32 2
149149
; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[GETELEM1]], align 16
@@ -173,9 +173,9 @@ define void @testStore_2(ptr nocapture writeonly %1) {
173173
ret void
174174
}
175175

176-
define void @testLoad_2(ptr nocapture writeonly %1) {
176+
define void @testLoad_2(ptr %1) {
177177
; CHECK-LABEL: define void @testLoad_2(
178-
; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
178+
; CHECK-SAME: ptr [[TMP0:%.*]]) {
179179
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP0]], align 8
180180
; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
181181
; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
@@ -219,9 +219,9 @@ define void @testLoad_2(ptr nocapture writeonly %1) {
219219

220220
; Also, test without the struct geps, to see if it still works with i8 geps/ptradd
221221

222-
define void @testStorei8_2(ptr nocapture writeonly %1) {
222+
define void @testStorei8_2(ptr %1) {
223223
; CHECK-LABEL: define void @testStorei8_2(
224-
; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
224+
; CHECK-SAME: ptr [[TMP0:%.*]]) {
225225
; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP0]], align 8
226226
; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
227227
; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[GETELEM1]], align 16
@@ -251,9 +251,9 @@ define void @testStorei8_2(ptr nocapture writeonly %1) {
251251
ret void
252252
}
253253

254-
define void @testLoadi8_2(ptr nocapture writeonly %1) {
254+
define void @testLoadi8_2(ptr %1) {
255255
; CHECK-LABEL: define void @testLoadi8_2(
256-
; CHECK-SAME: ptr writeonly captures(none) [[TMP0:%.*]]) {
256+
; CHECK-SAME: ptr [[TMP0:%.*]]) {
257257
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP0]], align 8
258258
; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
259259
; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
@@ -294,3 +294,157 @@ define void @testLoadi8_2(ptr nocapture writeonly %1) {
294294
%l0 = load i32, ptr %getElem13, align 4
295295
ret void
296296
}
297+
298+
; Test that the alignment propagation works both forwards and backwards.
299+
; With the "align 16" placed where it is,
300+
; we should end up with a v2 followed by two v4s followed by a v2.
301+
define void @test_forward_and_reverse(ptr %1) {
302+
; CHECK-LABEL: define void @test_forward_and_reverse(
303+
; CHECK-SAME: ptr [[TMP0:%.*]]) {
304+
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP0]], align 8
305+
; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
306+
; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
307+
; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
308+
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[GETELEM1]], align 16
309+
; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
310+
; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
311+
; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
312+
; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
313+
; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
314+
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[GETELEM10]], align 16
315+
; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
316+
; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
317+
; CHECK-NEXT: [[L99:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
318+
; CHECK-NEXT: [[L010:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
319+
; CHECK-NEXT: [[GETELEM14:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 40
320+
; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[GETELEM14]], align 16
321+
; CHECK-NEXT: [[L1111:%.*]] = extractelement <2 x i32> [[TMP7]], i32 0
322+
; CHECK-NEXT: [[L1212:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1
323+
; CHECK-NEXT: ret void
324+
;
325+
%l = load i32, ptr %1, align 4
326+
%getElem = getelementptr inbounds i8, ptr %1, i64 4
327+
%l2 = load i32, ptr %getElem, align 4
328+
%getElem1 = getelementptr inbounds i8, ptr %1, i64 8
329+
%l3 = load float, ptr %getElem1, align 4
330+
%getElem2 = getelementptr inbounds i8, ptr %1, i64 12
331+
%l4 = load float, ptr %getElem2, align 4
332+
%getElem8 = getelementptr inbounds i8, ptr %1, i64 16
333+
%l5 = load float, ptr %getElem8, align 4
334+
%getElem9 = getelementptr inbounds i8, ptr %1, i64 20
335+
%l6 = load float, ptr %getElem9, align 4
336+
%getElem10 = getelementptr inbounds i8, ptr %1, i64 24
337+
%l7 = load float, ptr %getElem10, align 16
338+
%getElem11 = getelementptr inbounds i8, ptr %1, i64 28
339+
%l8 = load float, ptr %getElem11, align 4
340+
%getElem12 = getelementptr inbounds i8, ptr %1, i64 32
341+
%l9 = load float, ptr %getElem12, align 4
342+
%getElem13 = getelementptr inbounds i8, ptr %1, i64 36
343+
%l0 = load float, ptr %getElem13, align 4
344+
%getElem14 = getelementptr inbounds i8, ptr %1, i64 40
345+
%l11 = load i32, ptr %getElem14, align 4
346+
%getElem15 = getelementptr inbounds i8, ptr %1, i64 44
347+
%l12 = load i32, ptr %getElem15, align 4
348+
ret void
349+
}
350+
351+
; Test an edge case where the specified alignment is the maximum alignment (4294967296).
352+
define void @test_forward_and_reverse_max_align(ptr %1) {
353+
; CHECK-LABEL: define void @test_forward_and_reverse_max_align(
354+
; CHECK-SAME: ptr [[TMP0:%.*]]) {
355+
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP0]], align 8
356+
; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
357+
; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
358+
; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
359+
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[GETELEM1]], align 16
360+
; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
361+
; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
362+
; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
363+
; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
364+
; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
365+
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[GETELEM10]], align 4294967296
366+
; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
367+
; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
368+
; CHECK-NEXT: [[L99:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
369+
; CHECK-NEXT: [[L010:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
370+
; CHECK-NEXT: [[GETELEM14:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 40
371+
; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[GETELEM14]], align 16
372+
; CHECK-NEXT: [[L1111:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
373+
; CHECK-NEXT: [[L1212:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
374+
; CHECK-NEXT: ret void
375+
;
376+
%l = load i32, ptr %1, align 4
377+
%getElem = getelementptr inbounds i8, ptr %1, i64 4
378+
%l2 = load i32, ptr %getElem, align 4
379+
%getElem1 = getelementptr inbounds i8, ptr %1, i64 8
380+
%l3 = load float, ptr %getElem1, align 4
381+
%getElem2 = getelementptr inbounds i8, ptr %1, i64 12
382+
%l4 = load float, ptr %getElem2, align 4
383+
%getElem8 = getelementptr inbounds i8, ptr %1, i64 16
384+
%l5 = load float, ptr %getElem8, align 4
385+
%getElem9 = getelementptr inbounds i8, ptr %1, i64 20
386+
%l6 = load float, ptr %getElem9, align 4
387+
%getElem10 = getelementptr inbounds i8, ptr %1, i64 24
388+
%l7 = load float, ptr %getElem10, align 4294967296
389+
%getElem11 = getelementptr inbounds i8, ptr %1, i64 28
390+
%l8 = load float, ptr %getElem11, align 4
391+
%getElem12 = getelementptr inbounds i8, ptr %1, i64 32
392+
%l9 = load float, ptr %getElem12, align 4
393+
%getElem13 = getelementptr inbounds i8, ptr %1, i64 36
394+
%l0 = load float, ptr %getElem13, align 4
395+
%getElem14 = getelementptr inbounds i8, ptr %1, i64 40
396+
%l11 = load i32, ptr %getElem14, align 4
397+
%getElem15 = getelementptr inbounds i8, ptr %1, i64 44
398+
%l12 = load i32, ptr %getElem15, align 4
399+
ret void
400+
}
401+
402+
define void @test_i8_elements(ptr %1) {
403+
; CHECK-LABEL: define void @test_i8_elements(
404+
; CHECK-SAME: ptr [[TMP0:%.*]]) {
405+
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i8>, ptr [[TMP0]], align 2
406+
; CHECK-NEXT: [[L1:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0
407+
; CHECK-NEXT: [[L22:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1
408+
; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 2
409+
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[GETELEM1]], align 4
410+
; CHECK-NEXT: [[L33:%.*]] = extractelement <4 x i8> [[TMP3]], i32 0
411+
; CHECK-NEXT: [[L44:%.*]] = extractelement <4 x i8> [[TMP3]], i32 1
412+
; CHECK-NEXT: [[L55:%.*]] = extractelement <4 x i8> [[TMP3]], i32 2
413+
; CHECK-NEXT: [[L66:%.*]] = extractelement <4 x i8> [[TMP3]], i32 3
414+
; CHECK-NEXT: [[GETELEM10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 6
415+
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[GETELEM10]], align 4
416+
; CHECK-NEXT: [[L77:%.*]] = extractelement <4 x i8> [[TMP4]], i32 0
417+
; CHECK-NEXT: [[L88:%.*]] = extractelement <4 x i8> [[TMP4]], i32 1
418+
; CHECK-NEXT: [[L99:%.*]] = extractelement <4 x i8> [[TMP4]], i32 2
419+
; CHECK-NEXT: [[L010:%.*]] = extractelement <4 x i8> [[TMP4]], i32 3
420+
; CHECK-NEXT: [[GETELEM14:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 10
421+
; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i8>, ptr [[GETELEM14]], align 4
422+
; CHECK-NEXT: [[L1111:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
423+
; CHECK-NEXT: [[L1212:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
424+
; CHECK-NEXT: ret void
425+
;
426+
%l = load i8, ptr %1, align 1
427+
%getElem = getelementptr inbounds i8, ptr %1, i64 1
428+
%l2 = load i8, ptr %getElem, align 1
429+
%getElem1 = getelementptr inbounds i8, ptr %1, i64 2
430+
%l3 = load i8, ptr %getElem1, align 1
431+
%getElem2 = getelementptr inbounds i8, ptr %1, i64 3
432+
%l4 = load i8, ptr %getElem2, align 1
433+
%getElem8 = getelementptr inbounds i8, ptr %1, i64 4
434+
%l5 = load i8, ptr %getElem8, align 1
435+
%getElem9 = getelementptr inbounds i8, ptr %1, i64 5
436+
%l6 = load i8, ptr %getElem9, align 1
437+
%getElem10 = getelementptr inbounds i8, ptr %1, i64 6
438+
%l7 = load i8, ptr %getElem10, align 4
439+
%getElem11 = getelementptr inbounds i8, ptr %1, i64 7
440+
%l8 = load i8, ptr %getElem11, align 1
441+
%getElem12 = getelementptr inbounds i8, ptr %1, i64 8
442+
%l9 = load i8, ptr %getElem12, align 1
443+
%getElem13 = getelementptr inbounds i8, ptr %1, i64 9
444+
%l0 = load i8, ptr %getElem13, align 1
445+
%getElem14 = getelementptr inbounds i8, ptr %1, i64 10
446+
%l11 = load i8, ptr %getElem14, align 1
447+
%getElem15 = getelementptr inbounds i8, ptr %1, i64 11
448+
%l12 = load i8, ptr %getElem15, align 1
449+
ret void
450+
}

0 commit comments

Comments
 (0)