[AMDGPU] PromoteAlloca: handle out-of-bounds GEP for shufflevector #139700

ro-i · 2025-05-13T10:04:40Z

EDIT: Updated title and text of this PR

This LLVM defect was identified via the AMD Fuzzing project.

Note: The original (reduced) test case was:

define amdgpu_vs void @promote_memcpy_two_aggrs() {
  %f1 = alloca [5 x float], align 4, addrspace(5)
  %G2 = getelementptr <1 x double>, ptr addrspace(5) %f1, i1 true
  call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) %f1, ptr addrspace(5) %G2, i32 8, i1 false)
  ret void
}

; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
declare void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) noalias writeonly captures(none), ptr addrspace(5) noalias readonly captures(none), i32, i1 immarg) #0

attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }

This uncovers other potential issues, especially using i1 1 (= i1 true) as an index. getelementptr sign extends the index value to the size of the pointer index type (i32), which in this case leads to a very large i32 value and not to i32 1. This might need a more general fix. But promote-alloca should nonetheless be able to handle wrong indices, I think. That's why I started with this PR.

llvmbot · 2025-05-13T10:05:13Z

@llvm/pr-subscribers-backend-amdgpu

Author: Robert Imschweiler (ro-i)

Changes

Don't try to handle GEPs where the index is already known to be out-of-bounds -> avoid crash while generating shufflevector.

This LLVM defect was identified via the AMD Fuzzing project.

Note: The original (reduced) test case was:

define amdgpu_vs void @<!-- -->promote_memcpy_two_aggrs() {
  %f1 = alloca [5 x float], align 4, addrspace(5)
  %G2 = getelementptr &lt;1 x double&gt;, ptr addrspace(5) %f1, i1 true
  call void @<!-- -->llvm.memcpy.p5.p5.i32(ptr addrspace(5) %f1, ptr addrspace(5) %G2, i32 8, i1 false)
  ret void
}

; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
declare void @<!-- -->llvm.memcpy.p5.p5.i32(ptr addrspace(5) noalias writeonly captures(none), ptr addrspace(5) noalias readonly captures(none), i32, i1 immarg) #<!-- -->0

attributes #<!-- -->0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }

This uncovers other potential issues, especially using i1 1 (= i1 true) as an index. getelementptr sign extends the index value to the size of the pointer index type (i32), which in this case leads to a very large i32 value and not to i32 1. This might need a more general fix. But promote-alloca should nonetheless be able to handle wrong indices, I think. That's why I started with this PR.

Full diff: https://github.com/llvm/llvm-project/pull/139700.diff

4 Files Affected:

(modified) llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp (+2-1)
(modified) llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll (+3-3)
(modified) llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll (+4-5)
(added) llvm/test/CodeGen/AMDGPU/promote-alloca-shufflevector.ll (+34)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 933ee6ceeaf4a..69e1d952c7622 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -438,7 +438,8 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
   SmallMapVector<Value *, APInt, 4> VarOffsets;
   APInt ConstOffset(BW, 0);
   if (GEP->getPointerOperand()->stripPointerCasts() != Alloca ||
-      !GEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
+      !GEP->collectOffset(DL, BW, VarOffsets, ConstOffset) ||
+      ConstOffset.getZExtValue() >= Alloca->getAllocationSize(DL))
     return nullptr;
 
   unsigned VecElemSize = DL.getTypeAllocSize(VecElemTy);
diff --git a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
index e1bbc243344b0..ba9691b265b50 100644
--- a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
@@ -18,9 +18,9 @@ declare void @llvm.amdgcn.s.barrier() #2
 ; SI-ALLOCA: v_lshlrev_b32_e32 [[SIZE_SCALE:v[0-9]+]], 2, [[LOAD_A]]
 
 ; SI-ALLOCA: v_mov_b32_e32 [[PTRREG:v[0-9]+]], [[SIZE_SCALE]]
-; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen offset:64
+; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen
 ; SI-ALLOCA: s_barrier
-; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen offset:64
+; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen
 ;
 ; SI-PROMOTE: LDSByteSize: 0
 define amdgpu_kernel void @test_private_array_ptr_calc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) #0 {
@@ -32,7 +32,7 @@ define amdgpu_kernel void @test_private_array_ptr_calc(ptr addrspace(1) noalias
   %a = load i32, ptr addrspace(1) %a_ptr, !range !0, !noundef !{}
   %b = load i32, ptr addrspace(1) %b_ptr, !range !0, !noundef !{}
   %result = add i32 %a, %b
-  %alloca_ptr = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 1, i32 %b
+  %alloca_ptr = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %b
   store i32 %result, ptr addrspace(5) %alloca_ptr, align 4
   ; Dummy call
   call void @llvm.amdgcn.s.barrier()
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
index d72f158763c61..810fcced208d1 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
@@ -312,13 +312,12 @@ define amdgpu_kernel void @i64_2d_load_store_subvec_3_i64_offset_index(ptr %out)
 ; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <6 x i64> [[TMP14]], i64 4, i32 4
 ; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <6 x i64> [[TMP15]], i64 5, i32 5
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[SEL3]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = add i64 6, [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <3 x i64> poison, i64 [[TMP3]], i64 0
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <3 x i64> [[TMP4]], i64 [[TMP6]], i64 1
-; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[TMP1]], 2
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <3 x i64> [[TMP7]], i64 [[TMP9]], i64 2
 ; CHECK-NEXT:    [[ELEM:%.*]] = extractelement <3 x i64> [[TMP10]], i32 2
@@ -337,7 +336,7 @@ define amdgpu_kernel void @i64_2d_load_store_subvec_3_i64_offset_index(ptr %out)
   %gep.01 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
   store <3 x i64> <i64 0, i64 1, i64 2>, ptr addrspace(5) %gep.00
   store <3 x i64> <i64 3, i64 4, i64 5>, ptr addrspace(5) %gep.01
-  %gep = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i64 1, i64 %sel3
+  %gep = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i64 0, i64 %sel3
   %load = load <3 x i64>, ptr addrspace(5) %gep
   %elem = extractelement <3 x i64> %load, i32 2
   store i64 %elem, ptr %out
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-shufflevector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-shufflevector.ll
new file mode 100644
index 0000000000000..f4d65ec0f7a03
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-shufflevector.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple amdgcn -passes=amdgpu-promote-alloca-to-vector -S < %s | FileCheck %s
+
+; Skip promote-alloca in case of an index which is known to be out of bounds.
+
+define amdgpu_kernel void @out_of_bounds() {
+; CHECK-LABEL: define amdgpu_kernel void @out_of_bounds() {
+; CHECK-NEXT:    [[PTR:%.*]] = alloca [4 x float], align 4, addrspace(5)
+; CHECK-NEXT:    [[ELEM_PTR:%.*]] = getelementptr [4 x float], ptr addrspace(5) [[PTR]], i32 0, i32 42
+; CHECK-NEXT:    call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) [[PTR]], ptr addrspace(5) [[ELEM_PTR]], i32 8, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %ptr = alloca [4 x float], align 4, addrspace(5)
+  %elem_ptr = getelementptr [4 x float], ptr addrspace(5) %ptr, i32 0, i32 42
+  call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) %ptr, ptr addrspace(5) %elem_ptr, i32 8, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @in_bounds() {
+; CHECK-LABEL: define amdgpu_kernel void @in_bounds() {
+; CHECK-NEXT:    [[PTR:%.*]] = freeze <4 x float> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[PTR]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+; CHECK-NEXT:    ret void
+;
+  %ptr = alloca [4 x float], align 4, addrspace(5)
+  %elem_ptr = getelementptr [4 x float], ptr addrspace(5) %ptr, i32 0, i32 2
+  call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) %ptr, ptr addrspace(5) %elem_ptr, i32 8, i1 false)
+  ret void
+}
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) writeonly captures(none), ptr addrspace(5) readonly captures(none), i32, i1 immarg) #0
+
+attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }

arsenm · 2025-05-13T10:10:04Z

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

  if (GEP->getPointerOperand()->stripPointerCasts() != Alloca ||
-      !GEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
+      !GEP->collectOffset(DL, BW, VarOffsets, ConstOffset) ||
+      ConstOffset.getZExtValue() >= Alloca->getAllocationSize(DL))


You shouldn't need to consider the out of bounds here, this seems far removed from the failure point and condition

I feel like this is the point where things start to go wrong. The situation in which GEPToVectorIndex is called looks like this:

llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Lines 916 to 927 in 9b63bdd

if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {

// If we can't compute a vector index from this GEP, then we can't

// promote this alloca to vector.

Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);

if (!Index)

return RejectUser(Inst, "cannot compute vector index for GEP");

GEPVectorIdx[GEP] = Index;

UsersToRemove.push_back(Inst);

continue;

}

// If we can't compute a vector index from this GEP, then we can't
// promote this alloca to vector.

Doesn't this include what I'd like to achieve?

An constant out of bounds index is still a computable index

True, I separated it.
It would also be possible to implement the check specifically here (SrcBegin):

llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Lines 663 to 675 in 84c1564

unsigned SrcBegin = TI->SrcIndex->getZExtValue();

unsigned DestBegin = TI->DestIndex->getZExtValue();

SmallVector<int> Mask;

for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) {

if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {

Mask.push_back(SrcBegin++);

} else {

Mask.push_back(Idx);

}

}

return Builder.CreateShuffleVector(GetOrLoadCurrentVectorValue(), Mask);

but that wouldn't cover as many cases and I think that doesn't hurt (?)

Mostly you just need to avoid crashing in the compiler, and you should be able to ignore the index. It was UB and you are at most downgrading it to a poison value

Hm, so you think I should replace it with poison for shufflevector?

arsenm · 2025-05-13T13:27:07Z

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

There's no reason to reject it, and you should proceed. The out of bounds index will just produce a poison value in the vector indexing

Well, at least for shufflevector it would crash:

opt: /work1/omp-nightly/build/git/trunk21.0/llvm-project/llvm/lib/IR/Instructions.cpp:1756: llvm::ShuffleVectorInst::ShuffleVectorInst(llvm::Value*, llvm::Value*, llvm::ArrayRef<int>, const llvm::Twine&, llvm::InsertPosition): Assertion `isValidOperands(V1, V2, Mask) && "Invalid shuffle vector instruction operands!"' failed. PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace. Stack dump: 0. Program arguments: /COD/LATEST/trunk/bin/opt -mtriple amdgcn -passes=amdgpu-promote-alloca-to-vector r.ll -S 1. Running pass "function(amdgpu-promote-alloca-to-vector)" on module "r.ll" 2. Running pass "amdgpu-promote-alloca-to-vector" on function "promote_memcpy_two_aggrs"

So wouldn't it make more sense to disallow it more broadly?

I'm saying you should be guarding at the point of failure in the minimal way. Which in this case probably means fixing the shuffle mask entries?

I see, I think I now get what you mean

arsenm

LGTM except unrelated test changes

llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll

llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll

llvm/test/CodeGen/AMDGPU/promote-alloca-shufflevector.ll

arsenm · 2025-05-21T09:27:44Z

Patch title is also out of date

This LLVM defect was identified via the AMD Fuzzing project.

ro-i · 2025-05-21T09:46:21Z

cleaned up

llvm/test/CodeGen/AMDGPU/promote-alloca-shufflevector.ll

Co-authored-by: Matt Arsenault <[email protected]>

ro-i · 2025-05-21T13:28:47Z

merged, thanks for the reviews :)

ro-i requested review from arsenm and perlfu May 13, 2025 10:04

llvmbot added the backend:AMDGPU label May 13, 2025

arsenm reviewed May 13, 2025

View reviewed changes

ro-i requested a review from arsenm May 20, 2025 11:21

arsenm reviewed May 21, 2025

View reviewed changes

llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll Outdated Show resolved Hide resolved

llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll Outdated Show resolved Hide resolved

llvm/test/CodeGen/AMDGPU/promote-alloca-shufflevector.ll Outdated Show resolved Hide resolved

[AMDGPU] PromoteAlloca: handle out-of-bounds GEP for shufflevector

dce5aea

This LLVM defect was identified via the AMD Fuzzing project.

ro-i force-pushed the promote-alloca-shufflevector branch from 40ac917 to dce5aea Compare May 21, 2025 09:44

ro-i changed the title ~~[AMDGPU] PromoteAlloca: reject known out-of-bounds index~~ [AMDGPU] PromoteAlloca: handle out-of-bounds GEP for shufflevector May 21, 2025

arsenm approved these changes May 21, 2025

View reviewed changes

llvm/test/CodeGen/AMDGPU/promote-alloca-shufflevector.ll Outdated Show resolved Hide resolved

ro-i and others added 2 commits May 21, 2025 06:51

fix typo

04afda3

Update llvm/test/CodeGen/AMDGPU/promote-alloca-shufflevector.ll

74378e1

Co-authored-by: Matt Arsenault <[email protected]>

arsenm approved these changes May 21, 2025

View reviewed changes

ro-i merged commit dc29901 into llvm:main May 21, 2025
11 checks passed

ro-i deleted the promote-alloca-shufflevector branch May 21, 2025 14:37

	if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
	// If we can't compute a vector index from this GEP, then we can't
	// promote this alloca to vector.
	Value Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, DL, NewGEPInsts);
	if (!Index)
	return RejectUser(Inst, "cannot compute vector index for GEP");

	GEPVectorIdx[GEP] = Index;
	UsersToRemove.push_back(Inst);
	continue;
	}

	unsigned SrcBegin = TI->SrcIndex->getZExtValue();
	unsigned DestBegin = TI->DestIndex->getZExtValue();

	SmallVector<int> Mask;
	for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) {
	if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
	Mask.push_back(SrcBegin++);
	} else {
	Mask.push_back(Idx);
	}
	}

	return Builder.CreateShuffleVector(GetOrLoadCurrentVectorValue(), Mask);

[AMDGPU] PromoteAlloca: handle out-of-bounds GEP for shufflevector #139700

[AMDGPU] PromoteAlloca: handle out-of-bounds GEP for shufflevector #139700

Uh oh!

Conversation

ro-i commented May 13, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented May 13, 2025

Uh oh!

Choose a reason for hiding this comment

Uh oh!

ro-i May 13, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

arsenm left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

arsenm commented May 21, 2025

Uh oh!

ro-i commented May 21, 2025

Uh oh!

Uh oh!

Uh oh!

ro-i commented May 21, 2025

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

ro-i commented May 13, 2025 •

edited

Loading

ro-i May 13, 2025 •

edited

Loading