Skip to content

Commit ffa8ed8

Browse files
committed
[AMDGPU] Promote nestedGEP allocas to vectors
1 parent 02ed6d8 commit ffa8ed8

File tree

3 files changed

+118
-14
lines changed

3 files changed

+118
-14
lines changed

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Lines changed: 56 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -437,8 +437,62 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
437437
unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
438438
SmallMapVector<Value *, APInt, 4> VarOffsets;
439439
APInt ConstOffset(BW, 0);
440-
if (GEP->getPointerOperand()->stripPointerCasts() != Alloca ||
441-
!GEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
440+
441+
// Walk backwards through nested GEPs to collect both constant and variable
442+
// offsets, so that nested vector GEP chains can be lowered in one step.
443+
//
444+
// Given this IR fragment as input:
445+
//
446+
// %0 = alloca [10 x <2 x i32>], align 8, addrspace(5)
447+
// %1 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %0, i32 0, i32 %j
448+
// %2 = getelementptr i8, ptr addrspace(5) %1, i32 4
449+
// %3 = load i32, ptr addrspace(5) %2, align 4
450+
//
451+
// Combine both GEP operations in a single pass, producing:
452+
// BasePtr = %0
453+
// ConstOffset = 4
454+
// VarOffsets = { %j → element_size(<2 x i32>) }
455+
//
456+
// That lets us emit a single buffer_load directly into a VGPR, without ever
457+
// allocating scratch memory for the intermediate pointer.
458+
Value *CurPtr = GEP;
459+
while (auto *CurGEP = dyn_cast<GetElementPtrInst>(CurPtr)) {
460+
SmallMapVector<Value *, APInt, 4> LocalVarsOffsets;
461+
APInt LocalConstOffset(BW, 0);
462+
463+
if (!CurGEP->collectOffset(DL, BW, LocalVarsOffsets, LocalConstOffset))
464+
return nullptr;
465+
466+
// Merge any variable-index contributions into the accumulated VarOffsets
467+
// map.
468+
// Only a single variable index is allowed in the entire GEP chain.
469+
// If VarOffsets already holds a different variable index, abort.
470+
//
471+
// Example:
472+
// Suppose LocalVarsOffsets = { (%j → 4) } from this GEP, and
473+
// VarOffsets already has { (%j → 8) } from a previously visited GEP.
474+
// After this loop, VarOffsets should become { (%j → 12) }.
475+
for (auto &VarEntry : LocalVarsOffsets) {
476+
// If VarOffsets already records a different variable index, abort.
477+
if (!VarOffsets.empty() && !VarOffsets.count(VarEntry.first))
478+
return nullptr;
479+
480+
// Look up whether we've seen this variable index before.
481+
auto *Existing = VarOffsets.find(VarEntry.first);
482+
if (Existing == VarOffsets.end())
483+
VarOffsets.insert({VarEntry.first, VarEntry.second});
484+
else
485+
Existing->second += VarEntry.second;
486+
}
487+
488+
ConstOffset += LocalConstOffset;
489+
490+
// Step to this GEP's base pointer, one level closer to the alloca.
491+
CurPtr = CurGEP->getPointerOperand()->stripPointerCasts();
492+
}
493+
494+
// Only proceed if this GEP stems from the same alloca.
495+
if (CurPtr->stripPointerCasts() != Alloca)
442496
return nullptr;
443497

444498
unsigned VecElemSize = DL.getTypeAllocSize(VecElemTy);

llvm/test/CodeGen/AMDGPU/amdpal.ll

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=tahiti | FileCheck --check-prefixes=PAL,CI --enable-var-scope %s
2-
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=tonga | FileCheck --check-prefixes=PAL,VI --enable-var-scope %s
1+
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=tahiti | FileCheck --check-prefixes=PAL --enable-var-scope %s
2+
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=tonga | FileCheck --check-prefixes=PAL --enable-var-scope %s
33

44
; PAL-NOT: .AMDGPU.config
55
; PAL-LABEL: {{^}}simple:
@@ -51,17 +51,12 @@ entry:
5151
ret void
5252
}
5353

54-
; Check code sequence for amdpal use of scratch for alloca in a compute shader.
55-
; The scratch descriptor is loaded from offset 0x10 of the GIT, rather than offset
56-
; 0 in a graphics shader.
57-
; Prior to GCN3 s_load_dword offsets are dwords, so the offset will be 0x4.
54+
; After the change that promotes the alloca to a vector (GEP-of-GEP
55+
; promotion), no scratch buffer is needed, so the descriptor load should
56+
; disappear.
5857

5958
; PAL-LABEL: {{^}}scratch2_cs:
60-
; PAL: s_movk_i32 s{{[0-9]+}}, 0x1234
61-
; PAL: s_mov_b32 s[[GITPTR:[0-9]+]], s0
62-
; CI: s_load_dwordx4 s[[[SCRATCHDESC:[0-9]+]]:{{[0-9]+]}}, s[[[GITPTR]]:{{[0-9]+\]}}, 0x4
63-
; VI: s_load_dwordx4 s[[[SCRATCHDESC:[0-9]+]]:{{[0-9]+]}}, s[[[GITPTR]]:{{[0-9]+\]}}, 0x10
64-
; PAL: buffer_store{{.*}}, s[[[SCRATCHDESC]]:
59+
; PAL: buffer_store{{.*}}, s[[[SCRATCHDESC:[0-9]+]]:{{[0-9]+]}}
6560

6661
define amdgpu_cs void @scratch2_cs(i32 inreg, i32 inreg, i32 inreg, <3 x i32> inreg, i32 inreg, <3 x i32> %coord, <2 x i32> %in, i32 %extra, i32 %idx) #0 {
6762
entry:
@@ -88,6 +83,6 @@ declare void @llvm.amdgcn.raw.ptr.buffer.store.f32(float, ptr addrspace(8), i32,
8883
; PAL-NEXT: .cs:
8984
; PAL-NEXT: .entry_point: _amdgpu_cs_main
9085
; PAL-NEXT: .entry_point_symbol: scratch2_cs
91-
; PAL-NEXT: .scratch_memory_size: 0x10
86+
; PAL-NEXT: .scratch_memory_size: 0
9287
; PAL-NEXT: .sgpr_count: 0x
9388
; PAL-NEXT: .vgpr_count: 0x
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
3+
target triple = "amdgcn-amd-amdhsa"
4+
define amdgpu_ps void @scalar_alloca_ptr_with_vector_gep_of_gep(i32 %j) #0 {
5+
; CHECK-LABEL: define amdgpu_ps void @scalar_alloca_ptr_with_vector_gep_of_gep(
6+
; CHECK-SAME: i32 [[J:%.*]]) #[[ATTR0:[0-9]+]] {
7+
; CHECK-NEXT: [[ENTRY:.*:]]
8+
; CHECK-NEXT: [[SORTEDFRAGMENTS:%.*]] = freeze <20 x i32> poison
9+
; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[J]], 2
10+
; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[J]], 2
11+
; CHECK-NEXT: [[TMP2:%.*]] = add i32 1, [[TMP1]]
12+
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <20 x i32> [[SORTEDFRAGMENTS]], i32 [[TMP2]]
13+
; CHECK-NEXT: ret void
14+
;
15+
entry:
16+
%SortedFragments = alloca [10 x <2 x i32>], align 8, addrspace(5)
17+
%0 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %SortedFragments, i32 0, i32 %j
18+
%1 = getelementptr i8, ptr addrspace(5) %0, i32 4
19+
%2 = load i32, ptr addrspace(5) %1, align 4
20+
ret void
21+
}
22+
23+
attributes #0 = { "amdgpu-promote-alloca-to-vector-max-regs"="32" }
24+
25+
define amdgpu_cs void @scalar_alloca_ptr_with_vector_gep_of_scratch(i32 inreg, i32 inreg, i32 inreg, <3 x i32> inreg, i32 inreg, <3 x i32> %coord, <2 x i32> %in, i32 %extra, i32 %idx) #1 {
26+
; CHECK-LABEL: define amdgpu_cs void @scalar_alloca_ptr_with_vector_gep_of_scratch(
27+
; CHECK-SAME: i32 inreg [[TMP0:%.*]], i32 inreg [[TMP1:%.*]], i32 inreg [[TMP2:%.*]], <3 x i32> inreg [[TMP3:%.*]], i32 inreg [[TMP4:%.*]], <3 x i32> [[COORD:%.*]], <2 x i32> [[IN:%.*]], i32 [[EXTRA:%.*]], i32 [[IDX:%.*]]) #[[ATTR1:[0-9]+]] {
28+
; CHECK-NEXT: [[ENTRY:.*:]]
29+
; CHECK-NEXT: [[V:%.*]] = freeze <3 x i32> poison
30+
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <3 x i32> [[V]], i32 [[EXTRA]], i32 0
31+
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[IN]], i64 0
32+
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <3 x i32> [[TMP5]], i32 [[TMP6]], i32 1
33+
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[IN]], i64 1
34+
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <3 x i32> [[TMP7]], i32 [[TMP8]], i32 2
35+
; CHECK-NEXT: [[TMP10:%.*]] = add i32 1, [[IDX]]
36+
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <3 x i32> [[TMP9]], i32 [[TMP10]]
37+
; CHECK-NEXT: [[XF:%.*]] = bitcast i32 [[TMP11]] to float
38+
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[XF]], ptr addrspace(8) poison, i32 0, i32 0, i32 0)
39+
; CHECK-NEXT: ret void
40+
;
41+
entry:
42+
%v = alloca [3 x i32], addrspace(5)
43+
%v1 = getelementptr [3 x i32], ptr addrspace(5) %v, i32 0, i32 1
44+
store i32 %extra, ptr addrspace(5) %v
45+
store <2 x i32> %in, ptr addrspace(5) %v1
46+
%e = getelementptr [2 x i32], ptr addrspace(5) %v1, i32 0, i32 %idx
47+
%x = load i32, ptr addrspace(5) %e
48+
%xf = bitcast i32 %x to float
49+
call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %xf, ptr addrspace(8) poison, i32 0, i32 0, i32 0)
50+
ret void
51+
}
52+
53+
attributes #1 = { nounwind "amdgpu-git-ptr-high"="0x1234" }
54+
55+
declare void @llvm.amdgcn.raw.ptr.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32 immarg)

0 commit comments

Comments
 (0)