Skip to content

Commit 3b8539c

Browse files
authored
[NVPTX] use incomplete aggregate initializers (#79062)
The PTX ISA specifies that initializers may be incomplete ([5.4.4. Initializers](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#initializers)):

> As in C, array initializers may be incomplete, i.e., the number of initializer elements may be less than the extent of the corresponding array dimension, with remaining array locations initialized to the default value for the specified array type.

Emitting initializers in this form is preferable because it reduces the size of the PTX, in some cases significantly, and can improve compile time of ptxas as a result.
1 parent 1605bf5 commit 3b8539c

File tree

4 files changed

+37
-4
lines changed

4 files changed

+37
-4
lines changed

llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1292,10 +1292,21 @@ void NVPTXAsmPrinter::AggBuffer::printSymbol(unsigned nSym, raw_ostream &os) {
12921292

12931293
void NVPTXAsmPrinter::AggBuffer::printBytes(raw_ostream &os) {
12941294
unsigned int ptrSize = AP.MAI->getCodePointerSize();
1295-
symbolPosInBuffer.push_back(size);
1295+
// Do not emit trailing zero initializers. They will be zero-initialized by
1296+
// ptxas. This saves on both space requirements for the generated PTX and on
1297+
// memory use by ptxas. (See:
1298+
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#global-state-space)
1299+
unsigned int InitializerCount = size;
1300+
// TODO: symbols make this harder, but it would still be good to trim trailing
1301+
// 0s for aggs with symbols as well.
1302+
if (numSymbols() == 0)
1303+
while (InitializerCount >= 1 && !buffer[InitializerCount - 1])
1304+
InitializerCount--;
1305+
1306+
symbolPosInBuffer.push_back(InitializerCount);
12961307
unsigned int nSym = 0;
12971308
unsigned int nextSymbolPos = symbolPosInBuffer[nSym];
1298-
for (unsigned int pos = 0; pos < size;) {
1309+
for (unsigned int pos = 0; pos < InitializerCount;) {
12991310
if (pos)
13001311
os << ", ";
13011312
if (pos != nextSymbolPos) {
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
; RUN: llc < %s -march=nvptx64 -mcpu=sm_50 | FileCheck %s
2+
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_50 | %ptxas-verify %}
3+
4+
; Make sure that global constants have their trailing zeros properly trimmed.
5+
6+
; basic case
7+
; CHECK-DAG: .b8 A[8] = {3, 4, 0, 0, 5};
8+
@A = global [8 x i8] [i8 3, i8 4, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0]
9+
10+
; all-zeros
11+
; CHECK-DAG: .b8 B[2];
12+
@B = global [2 x i8] [i8 0, i8 0]
13+
14+
; all-non-zeros
15+
; CHECK-DAG: .b8 C[4] = {1, 2, 3, 4};
16+
@C = global [4 x i8] [i8 1, i8 2, i8 3, i8 4]
17+
18+
; initializer with a symbol, the last 0 could be default initialized
19+
; CHECK-DAG: .u8 e = 1;
20+
; CHECK-DAG: .u64 D[4] = {e, 0, e, 0};
21+
@e = addrspace(1) global i8 1
22+
@D = addrspace(1) global [4 x ptr addrspace(1)] [ptr addrspace(1) @e, ptr addrspace(1) null, ptr addrspace(1) @e, ptr addrspace(1) null]

llvm/test/CodeGen/NVPTX/globals_init.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,6 @@
2626
@Gblf64 = global [2 x double] [double 5.75e-25, double 12.25e+56]
2727

2828
; Make sure we fill in alignment gaps correctly.
29-
; CHECK-DAG: .b8 GblU[12] = {7, 6, 0, 0, 5, 4, 3, 2, 1, 0, 0, 0};
29+
; CHECK-DAG: .b8 GblU[12] = {7, 6, 0, 0, 5, 4, 3, 2, 1};
3030
@GblU = global {i16, i32, i8} {i16 1543, i32 33752069, i8 1}
3131

llvm/test/CodeGen/NVPTX/i128-global.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
; RUN: llc < %s -O0 -march=nvptx64 -mcpu=sm_20 | FileCheck %s
22
; RUN: %if ptxas %{ llc < %s -O0 -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
33

4-
; CHECK: .visible .global .align 16 .b8 G1[16] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
4+
; CHECK: .visible .global .align 16 .b8 G1[16] = {1};
55
@G1 = global i128 1
66

77
; CHECK: .visible .global .align 16 .b8 G2[16];

0 commit comments

Comments
 (0)