Skip to content

Commit ea8b6e5

Browse files
committed
address comments
1 parent 728a898 commit ea8b6e5

File tree

4 files changed

+75 -62 lines changed

llvm/lib/Target/NVPTX/NVPTXIncreaseAlignment.cpp

Lines changed: 36 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,11 @@
66
//
77
//===----------------------------------------------------------------------===//
88
//
9-
// A simple pass that looks at local memory arrays that are statically
9+
// A simple pass that looks at local memory allocas that are statically
1010
// sized and potentially increases their alignment. This enables vectorization
11-
// of loads/stores to these arrays if not explicitly specified by the client.
11+
// of loads/stores to these allocas if not explicitly specified by the client.
1212
//
13-
// TODO: Ideally we should do a bin-packing of local arrays to maximize
13+
// TODO: Ideally we should do a bin-packing of local allocas to maximize
1414
// alignments while minimizing holes.
1515
//
1616
//===----------------------------------------------------------------------===//
@@ -28,10 +28,10 @@
2828

2929
using namespace llvm;
3030

31-
static cl::opt<bool>
32-
MaxLocalArrayAlignment("nvptx-use-max-local-array-alignment",
33-
cl::init(false), cl::Hidden,
34-
cl::desc("Use maximum alignment for local memory"));
31+
static cl::opt<unsigned> MinLocalArrayAlignment(
32+
"nvptx-ensure-minimum-local-alignment", cl::init(16), cl::Hidden,
33+
cl::desc(
34+
"Ensure local memory objects are at least this aligned (default 16)"));
3535

3636
static Align getMaxLocalArrayAlignment(const TargetTransformInfo &TTI) {
3737
const unsigned MaxBitWidth =
@@ -41,45 +41,46 @@ static Align getMaxLocalArrayAlignment(const TargetTransformInfo &TTI) {
4141

4242
namespace {
4343
struct NVPTXIncreaseLocalAlignment {
44-
const Align MaxAlign;
44+
const Align MaxUsableAlign;
4545

4646
NVPTXIncreaseLocalAlignment(const TargetTransformInfo &TTI)
47-
: MaxAlign(getMaxLocalArrayAlignment(TTI)) {}
47+
: MaxUsableAlign(getMaxLocalArrayAlignment(TTI)) {}
4848

4949
bool run(Function &F);
5050
bool updateAllocaAlignment(AllocaInst *Alloca, const DataLayout &DL);
51-
Align getAggressiveArrayAlignment(unsigned ArraySize);
52-
Align getConservativeArrayAlignment(unsigned ArraySize);
51+
Align getMaxUsefulArrayAlignment(unsigned ArraySize);
52+
Align getMaxSafeLocalAlignment(unsigned ArraySize);
5353
};
5454
} // namespace
5555

56-
/// Get the maximum useful alignment for an array. This is more likely to
56+
/// Get the maximum useful alignment for an allocation. This is more likely to
5757
/// produce holes in the local memory.
5858
///
59-
/// Choose an alignment large enough that the entire array could be loaded with
60-
/// a single vector load (if possible). Cap the alignment at
61-
/// MaxPTXArrayAlignment.
62-
Align NVPTXIncreaseLocalAlignment::getAggressiveArrayAlignment(
59+
/// Choose an alignment large enough that the entire alloca could be loaded
60+
/// with a single vector load (if possible). Cap the alignment at
61+
/// MinLocalArrayAlignment and MaxUsableAlign.
62+
Align NVPTXIncreaseLocalAlignment::getMaxUsefulArrayAlignment(
6363
const unsigned ArraySize) {
64-
return std::min(MaxAlign, Align(PowerOf2Ceil(ArraySize)));
64+
const Align UpperLimit =
65+
std::min(MaxUsableAlign, Align(MinLocalArrayAlignment));
66+
return std::min(UpperLimit, Align(PowerOf2Ceil(ArraySize)));
6567
}
6668

67-
/// Get the alignment of arrays that reduces the chances of leaving holes when
68-
/// arrays are allocated within a contiguous memory buffer (like shared memory
69-
/// and stack). Holes are still possible before and after the array allocation.
69+
/// Get the alignment of allocas that reduces the chances of leaving holes when
70+
/// they are allocated within a contiguous memory buffer (like the stack).
71+
/// Holes are still possible before and after the allocation.
7072
///
71-
/// Choose the largest alignment such that the array size is a multiple of the
72-
/// alignment. If all elements of the buffer are allocated in order of
73+
/// Choose the largest alignment such that the allocation size is a multiple of
74+
/// the alignment. If all elements of the buffer are allocated in order of
7375
/// alignment (higher to lower) no holes will be left.
74-
Align NVPTXIncreaseLocalAlignment::getConservativeArrayAlignment(
76+
Align NVPTXIncreaseLocalAlignment::getMaxSafeLocalAlignment(
7577
const unsigned ArraySize) {
76-
return commonAlignment(MaxAlign, ArraySize);
78+
return commonAlignment(MaxUsableAlign, ArraySize);
7779
}
7880

79-
/// Find a better alignment for local arrays
81+
/// Find a better alignment for local allocas.
8082
bool NVPTXIncreaseLocalAlignment::updateAllocaAlignment(AllocaInst *Alloca,
8183
const DataLayout &DL) {
82-
// Looking for statically sized local arrays
8384
if (!Alloca->isStaticAlloca())
8485
return false;
8586

@@ -88,12 +89,15 @@ bool NVPTXIncreaseLocalAlignment::updateAllocaAlignment(AllocaInst *Alloca,
8889
return false;
8990

9091
const auto ArraySizeValue = ArraySize->getFixedValue();
91-
const Align PreferredAlignment =
92-
MaxLocalArrayAlignment ? getAggressiveArrayAlignment(ArraySizeValue)
93-
: getConservativeArrayAlignment(ArraySizeValue);
92+
if (ArraySizeValue == 0)
93+
return false;
94+
95+
const Align NewAlignment =
96+
std::max(getMaxSafeLocalAlignment(ArraySizeValue),
97+
getMaxUsefulArrayAlignment(ArraySizeValue));
9498

95-
if (PreferredAlignment > Alloca->getAlign()) {
96-
Alloca->setAlignment(PreferredAlignment);
99+
if (NewAlignment > Alloca->getAlign()) {
100+
Alloca->setAlignment(NewAlignment);
97101
return true;
98102
}
99103

@@ -130,8 +134,7 @@ struct NVPTXIncreaseLocalAlignmentLegacyPass : public FunctionPass {
130134
char NVPTXIncreaseLocalAlignmentLegacyPass::ID = 0;
131135
INITIALIZE_PASS(NVPTXIncreaseLocalAlignmentLegacyPass,
132136
"nvptx-increase-local-alignment",
133-
"Increase alignment for statically sized alloca arrays", false,
134-
false)
137+
"Increase alignment for statically sized allocas", false, false)
135138

136139
FunctionPass *llvm::createNVPTXIncreaseLocalAlignmentPass() {
137140
return new NVPTXIncreaseLocalAlignmentLegacyPass();

llvm/test/CodeGen/NVPTX/increase-local-align.ll

Lines changed: 31 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2-
; RUN: opt -S -passes=nvptx-increase-local-alignment < %s | FileCheck %s --check-prefixes=COMMON,DEFAULT
3-
; RUN: opt -S -passes=nvptx-increase-local-alignment -nvptx-use-max-local-array-alignment < %s | FileCheck %s --check-prefixes=COMMON,MAX
2+
; RUN: opt -S -passes=nvptx-increase-local-alignment -nvptx-ensure-minimum-local-alignment=1 < %s | FileCheck %s --check-prefixes=COMMON,MIN-1
3+
; RUN: opt -S -passes=nvptx-increase-local-alignment -nvptx-ensure-minimum-local-alignment=8 < %s | FileCheck %s --check-prefixes=COMMON,MIN-8
4+
; RUN: opt -S -passes=nvptx-increase-local-alignment -nvptx-ensure-minimum-local-alignment=16 < %s | FileCheck %s --check-prefixes=COMMON,MIN-16
45
target triple = "nvptx64-nvidia-cuda"
56

67
define void @test1() {
@@ -13,13 +14,17 @@ define void @test1() {
1314
}
1415

1516
define void @test2() {
16-
; DEFAULT-LABEL: define void @test2() {
17-
; DEFAULT-NEXT: [[A:%.*]] = alloca [63 x i8], align 1
18-
; DEFAULT-NEXT: ret void
17+
; MIN-1-LABEL: define void @test2() {
18+
; MIN-1-NEXT: [[A:%.*]] = alloca [63 x i8], align 1
19+
; MIN-1-NEXT: ret void
1920
;
20-
; MAX-LABEL: define void @test2() {
21-
; MAX-NEXT: [[A:%.*]] = alloca [63 x i8], align 16
22-
; MAX-NEXT: ret void
21+
; MIN-8-LABEL: define void @test2() {
22+
; MIN-8-NEXT: [[A:%.*]] = alloca [63 x i8], align 8
23+
; MIN-8-NEXT: ret void
24+
;
25+
; MIN-16-LABEL: define void @test2() {
26+
; MIN-16-NEXT: [[A:%.*]] = alloca [63 x i8], align 16
27+
; MIN-16-NEXT: ret void
2328
;
2429
%a = alloca [63 x i8], align 1
2530
ret void
@@ -35,13 +40,17 @@ define void @test3() {
3540
}
3641

3742
define void @test4() {
38-
; DEFAULT-LABEL: define void @test4() {
39-
; DEFAULT-NEXT: [[A:%.*]] = alloca i8, i32 63, align 1
40-
; DEFAULT-NEXT: ret void
43+
; MIN-1-LABEL: define void @test4() {
44+
; MIN-1-NEXT: [[A:%.*]] = alloca i8, i32 63, align 1
45+
; MIN-1-NEXT: ret void
46+
;
47+
; MIN-8-LABEL: define void @test4() {
48+
; MIN-8-NEXT: [[A:%.*]] = alloca i8, i32 63, align 8
49+
; MIN-8-NEXT: ret void
4150
;
42-
; MAX-LABEL: define void @test4() {
43-
; MAX-NEXT: [[A:%.*]] = alloca i8, i32 63, align 16
44-
; MAX-NEXT: ret void
51+
; MIN-16-LABEL: define void @test4() {
52+
; MIN-16-NEXT: [[A:%.*]] = alloca i8, i32 63, align 16
53+
; MIN-16-NEXT: ret void
4554
;
4655
%a = alloca i8, i32 63, align 1
4756
ret void
@@ -83,3 +92,11 @@ define void @test8() {
8392
ret void
8493
}
8594

95+
define void @test9() {
96+
; COMMON-LABEL: define void @test9() {
97+
; COMMON-NEXT: [[A:%.*]] = alloca [0 x i32], align 1
98+
; COMMON-NEXT: ret void
99+
;
100+
%a = alloca [0 x i32], align 1
101+
ret void
102+
}

llvm/test/CodeGen/NVPTX/local-stack-frame.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ declare void @bar(ptr %a)
9494
define void @foo3(i32 %a) {
9595
; PTX32-LABEL: foo3(
9696
; PTX32: {
97-
; PTX32-NEXT: .local .align 4 .b8 __local_depot2[12];
97+
; PTX32-NEXT: .local .align 16 .b8 __local_depot2[16];
9898
; PTX32-NEXT: .reg .b32 %SP;
9999
; PTX32-NEXT: .reg .b32 %SPL;
100100
; PTX32-NEXT: .reg .b32 %r<6>;
@@ -110,7 +110,7 @@ define void @foo3(i32 %a) {
110110
;
111111
; PTX64-LABEL: foo3(
112112
; PTX64: {
113-
; PTX64-NEXT: .local .align 4 .b8 __local_depot2[12];
113+
; PTX64-NEXT: .local .align 16 .b8 __local_depot2[16];
114114
; PTX64-NEXT: .reg .b64 %SP;
115115
; PTX64-NEXT: .reg .b64 %SPL;
116116
; PTX64-NEXT: .reg .b32 %r<2>;

llvm/test/CodeGen/NVPTX/variadics-backend.ll

Lines changed: 6 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ declare void @llvm.va_end.p0(ptr)
101101
define dso_local i32 @foo() {
102102
; CHECK-PTX-LABEL: foo(
103103
; CHECK-PTX: {
104-
; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot1[40];
104+
; CHECK-PTX-NEXT: .local .align 16 .b8 __local_depot1[48];
105105
; CHECK-PTX-NEXT: .reg .b64 %SP;
106106
; CHECK-PTX-NEXT: .reg .b64 %SPL;
107107
; CHECK-PTX-NEXT: .reg .b32 %r<3>;
@@ -138,7 +138,7 @@ entry:
138138
define dso_local i32 @variadics2(i32 noundef %first, ...) {
139139
; CHECK-PTX-LABEL: variadics2(
140140
; CHECK-PTX: {
141-
; CHECK-PTX-NEXT: .local .align 1 .b8 __local_depot2[3];
141+
; CHECK-PTX-NEXT: .local .align 4 .b8 __local_depot2[4];
142142
; CHECK-PTX-NEXT: .reg .b64 %SP;
143143
; CHECK-PTX-NEXT: .reg .b64 %SPL;
144144
; CHECK-PTX-NEXT: .reg .b16 %rs<4>;
@@ -215,17 +215,10 @@ define dso_local i32 @bar() {
215215
; CHECK-PTX-NEXT: st.local.b8 [%rd2+1], %rs2;
216216
; CHECK-PTX-NEXT: ld.global.nc.b8 %rs3, [__const_$_bar_$_s1+5];
217217
; CHECK-PTX-NEXT: st.local.b8 [%rd2], %rs3;
218-
; CHECK-PTX-NEXT: st.b32 [%SP+8], 1;
219-
; CHECK-PTX-NEXT: st.b8 [%SP+12], 1;
220-
; CHECK-PTX-NEXT: st.b64 [%SP+16], 1;
221-
; CHECK-PTX-NEXT: add.u64 %rd3, %SP, 8;
222-
; CHECK-PTX-NEXT: mov.b32 %r1, 1;
223-
; CHECK-PTX-NEXT: st.b32 [%SP+16], %r1;
224-
; CHECK-PTX-NEXT: mov.b16 %rs4, 1;
225-
; CHECK-PTX-NEXT: st.b8 [%SP+20], %rs4;
226-
; CHECK-PTX-NEXT: mov.b64 %rd3, 1;
227-
; CHECK-PTX-NEXT: st.b64 [%SP+24], %rd3;
228-
; CHECK-PTX-NEXT: add.u64 %rd4, %SP, 16;
218+
; CHECK-PTX-NEXT: st.b32 [%SP+16], 1;
219+
; CHECK-PTX-NEXT: st.b8 [%SP+20], 1;
220+
; CHECK-PTX-NEXT: st.b64 [%SP+24], 1;
221+
; CHECK-PTX-NEXT: add.u64 %rd3, %SP, 16;
229222
; CHECK-PTX-NEXT: { // callseq 1, 0
230223
; CHECK-PTX-NEXT: .param .b32 param0;
231224
; CHECK-PTX-NEXT: st.param.b32 [param0], 1;

0 commit comments

Comments
 (0)