Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 28 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -437,9 +437,34 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
SmallMapVector<Value *, APInt, 4> VarOffsets;
APInt ConstOffset(BW, 0);
if (GEP->getPointerOperand()->stripPointerCasts() != Alloca ||
!GEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
return nullptr;

// Walk backwards through nested GEPs to collect both constant and variable
// offsets, so that nested vector GEP chains can be lowered in one step.
//
// Given this IR fragment as input:
//
// %0 = alloca [10 x <2 x i32>], align 8, addrspace(5)
// %1 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %0, i32 0, i32 %j
// %2 = getelementptr i8, ptr addrspace(5) %1, i32 4
// %3 = load i32, ptr addrspace(5) %2, align 4
//
// Combine both GEP operations in a single pass, producing:
// BasePtr = %0
// ConstOffset = 4
// VarOffsets = { %j -> element_size(<2 x i32>) }
//
// That lets us emit a single buffer_load directly into a VGPR, without ever
// allocating scratch memory for the intermediate pointer.
Value *CurPtr = GEP;
while (auto *CurGEP = dyn_cast<GetElementPtrInst>(CurPtr)) {
if (!CurGEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
return nullptr;

// Move to the next outer pointer.
CurPtr = CurGEP->getPointerOperand();
}

assert(CurPtr == Alloca && "GEP not based on alloca");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you sure this is safe to assert?

Copy link
Contributor Author

@harrisonGPU harrisonGPU May 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am sure it is safe ! Because it is from the following code:

for (AllocaInst *AI : Allocas) {
const unsigned AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType());
// First, check if we have enough budget to vectorize this alloca.
if (AllocaCost <= VectorizationBudget) {
// If we do, attempt vectorization, otherwise, fall through and try
// promoting to LDS instead.
if (tryPromoteAllocaToVector(*AI)) {

So it is must alloca type.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we need this assertion here? should it be an assertion or error (which returns a nullptr)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It won't actually case assertion , but I'd like to add an assert to verify that it directly references the alloca.


unsigned VecElemSize = DL.getTypeAllocSize(VecElemTy);
if (VarOffsets.size() > 1)
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/amdpal.ll
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ entry:
store i32 %extra, ptr addrspace(5) %v
store <2 x i32> %in, ptr addrspace(5) %v1
%e = getelementptr [2 x i32], ptr addrspace(5) %v1, i32 0, i32 %idx
%x = load i32, ptr addrspace(5) %e
%x = load volatile i32, ptr addrspace(5) %e
%xf = bitcast i32 %x to float
call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %xf, ptr addrspace(8) poison, i32 0, i32 0, i32 0)
ret void
Expand Down
79 changes: 79 additions & 0 deletions llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep-of-gep.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
define amdgpu_ps void @scalar_alloca_ptr_with_vector_gep_of_gep(i32 %idx, ptr addrspace(1) %output) #0 {
; CHECK-LABEL: define amdgpu_ps void @scalar_alloca_ptr_with_vector_gep_of_gep(
; CHECK-SAME: i32 [[IDX:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[BUF:%.*]] = freeze <20 x i32> poison
; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 2
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <20 x i32> [[BUF]], i32 1, i32 [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <20 x i32> [[TMP1]], i32 2, i32 [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = mul i32 [[IDX]], 2
; CHECK-NEXT: [[TMP5:%.*]] = add i32 1, [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <20 x i32> [[TMP3]], i32 [[TMP5]]
; CHECK-NEXT: store i32 [[TMP6]], ptr addrspace(1) [[OUTPUT]], align 4
; CHECK-NEXT: ret void
;
entry:
%alloca = alloca [10 x <2 x i32>], align 8, addrspace(5)
%row = getelementptr [10 x <2 x i32>], ptr addrspace(5) %alloca, i32 0, i32 %idx
store <2 x i32> <i32 1, i32 2>, ptr addrspace(5) %row, align 8
%elt1 = getelementptr i8, ptr addrspace(5) %row, i32 4
%val = load i32, ptr addrspace(5) %elt1, align 4
store i32 %val, ptr addrspace(1) %output
ret void
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better to make this not dead code. Also use named values in tests

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have updated it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is still dead code

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean not use alloca after load? right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understood your mind! Thanks !

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have updated it, what do you think about it? :-)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"dead code" in this case refers to the fact that this alloca has no stores, no valid data.
This means the result of the load will always be poison, so load and all associated instructions can be removed if we made the compiler infer that.
Have a look at other existing promote alloca tests for some ideas of putting data into alloca.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, I have updated it, and I inserted valid data to alloca, what do you think about it?

}

define amdgpu_ps void @scalar_alloca_ptr_with_vector_gep_of_gep3(i32 %idx, ptr addrspace(1) %output) #0 {
; CHECK-LABEL: define amdgpu_ps void @scalar_alloca_ptr_with_vector_gep_of_gep3(
; CHECK-SAME: i32 [[IDX:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <16 x i32> poison
; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 2
; CHECK-NEXT: [[TMP1:%.*]] = add i32 8, [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[ALLOCA]], i32 10, i32 [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP1]], 1
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP2]], i32 20, i32 [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[IDX]], 2
; CHECK-NEXT: [[TMP6:%.*]] = add i32 9, [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x i32> [[TMP4]], i32 [[TMP6]]
; CHECK-NEXT: store i32 [[TMP7]], ptr addrspace(1) [[OUTPUT]], align 4
; CHECK-NEXT: ret void
;
entry:
%alloca = alloca [2 x [4 x <2 x i32>]], align 8, addrspace(5)
%lvl1 = getelementptr inbounds [2 x [4 x <2 x i32>]], ptr addrspace(5) %alloca, i32 0, i32 1
%lvl2 = getelementptr inbounds [4 x <2 x i32>], ptr addrspace(5) %lvl1, i32 0, i32 %idx
store <2 x i32> <i32 10, i32 20>, ptr addrspace(5) %lvl2, align 8
%byte = getelementptr inbounds i8, ptr addrspace(5) %lvl2, i32 4
%val = load i32, ptr addrspace(5) %byte, align 4
store i32 %val, ptr addrspace(1) %output
ret void
}

define amdgpu_ps void @scalar_alloca_ptr_with_vector_gep_twice_idx(i32 %idx, ptr addrspace(1) %out) #0 {
; CHECK-LABEL: define amdgpu_ps void @scalar_alloca_ptr_with_vector_gep_twice_idx(
; CHECK-SAME: i32 [[IDX:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[BUF:%.*]] = freeze <20 x i32> poison
; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 2
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <20 x i32> [[BUF]], i32 1, i32 [[TMP0]]
; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <20 x i32> [[TMP4]], i32 2, i32 [[TMP5]]
; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[IDX]], 3
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <20 x i32> [[TMP3]], i32 [[TMP1]]
; CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[OUT]], align 4
; CHECK-NEXT: ret void
;
entry:
%alloca = alloca [10 x [2 x i32]], align 8, addrspace(5)
%row = getelementptr inbounds [10 x [2 x i32]], ptr addrspace(5) %alloca, i32 0, i32 %idx
store <2 x i32> <i32 1, i32 2>, ptr addrspace(5) %row, align 8
%elt = getelementptr inbounds [2 x i32], ptr addrspace(5) %row, i32 0, i32 %idx
%val = load i32, ptr addrspace(5) %elt, align 4
store i32 %val, ptr addrspace(1) %out
ret void
}

attributes #0 = { "amdgpu-promote-alloca-to-vector-max-regs"="32" }
Loading