-
Notifications
You must be signed in to change notification settings - Fork 15.4k
[AMDGPU] PromoteAlloca: handle out-of-bounds GEP for shufflevector #139700
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-backend-amdgpu Author: Robert Imschweiler (ro-i) ChangesDon't try to handle GEPs where the index is already known to be out-of-bounds -> avoid crash while generating shufflevector. This LLVM defect was identified via the AMD Fuzzing project. Note: The original (reduced) test case was: This uncovers other potential issues, especially using Full diff: https://github.com/llvm/llvm-project/pull/139700.diff 4 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 933ee6ceeaf4a..69e1d952c7622 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -438,7 +438,8 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
SmallMapVector<Value *, APInt, 4> VarOffsets;
APInt ConstOffset(BW, 0);
if (GEP->getPointerOperand()->stripPointerCasts() != Alloca ||
- !GEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
+ !GEP->collectOffset(DL, BW, VarOffsets, ConstOffset) ||
+ ConstOffset.getZExtValue() >= Alloca->getAllocationSize(DL))
return nullptr;
unsigned VecElemSize = DL.getTypeAllocSize(VecElemTy);
diff --git a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
index e1bbc243344b0..ba9691b265b50 100644
--- a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
@@ -18,9 +18,9 @@ declare void @llvm.amdgcn.s.barrier() #2
; SI-ALLOCA: v_lshlrev_b32_e32 [[SIZE_SCALE:v[0-9]+]], 2, [[LOAD_A]]
; SI-ALLOCA: v_mov_b32_e32 [[PTRREG:v[0-9]+]], [[SIZE_SCALE]]
-; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen offset:64
+; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen
; SI-ALLOCA: s_barrier
-; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen offset:64
+; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen
;
; SI-PROMOTE: LDSByteSize: 0
define amdgpu_kernel void @test_private_array_ptr_calc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) #0 {
@@ -32,7 +32,7 @@ define amdgpu_kernel void @test_private_array_ptr_calc(ptr addrspace(1) noalias
%a = load i32, ptr addrspace(1) %a_ptr, !range !0, !noundef !{}
%b = load i32, ptr addrspace(1) %b_ptr, !range !0, !noundef !{}
%result = add i32 %a, %b
- %alloca_ptr = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 1, i32 %b
+ %alloca_ptr = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %b
store i32 %result, ptr addrspace(5) %alloca_ptr, align 4
; Dummy call
call void @llvm.amdgcn.s.barrier()
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
index d72f158763c61..810fcced208d1 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
@@ -312,13 +312,12 @@ define amdgpu_kernel void @i64_2d_load_store_subvec_3_i64_offset_index(ptr %out)
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <6 x i64> [[TMP14]], i64 4, i32 4
; CHECK-NEXT: [[TMP16:%.*]] = insertelement <6 x i64> [[TMP15]], i64 5, i32 5
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[SEL3]], 3
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 6, [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <3 x i64> poison, i64 [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP2]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <3 x i64> [[TMP4]], i64 [[TMP6]], i64 1
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP2]], 2
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP1]], 2
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <3 x i64> [[TMP7]], i64 [[TMP9]], i64 2
; CHECK-NEXT: [[ELEM:%.*]] = extractelement <3 x i64> [[TMP10]], i32 2
@@ -337,7 +336,7 @@ define amdgpu_kernel void @i64_2d_load_store_subvec_3_i64_offset_index(ptr %out)
%gep.01 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
store <3 x i64> <i64 0, i64 1, i64 2>, ptr addrspace(5) %gep.00
store <3 x i64> <i64 3, i64 4, i64 5>, ptr addrspace(5) %gep.01
- %gep = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i64 1, i64 %sel3
+ %gep = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i64 0, i64 %sel3
%load = load <3 x i64>, ptr addrspace(5) %gep
%elem = extractelement <3 x i64> %load, i32 2
store i64 %elem, ptr %out
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-shufflevector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-shufflevector.ll
new file mode 100644
index 0000000000000..f4d65ec0f7a03
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-shufflevector.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple amdgcn -passes=amdgpu-promote-alloca-to-vector -S < %s | FileCheck %s
+
+; Skip promote-alloca in case of an index which is known to be out of bounds.
+
+define amdgpu_kernel void @out_of_bounds() {
+; CHECK-LABEL: define amdgpu_kernel void @out_of_bounds() {
+; CHECK-NEXT: [[PTR:%.*]] = alloca [4 x float], align 4, addrspace(5)
+; CHECK-NEXT: [[ELEM_PTR:%.*]] = getelementptr [4 x float], ptr addrspace(5) [[PTR]], i32 0, i32 42
+; CHECK-NEXT: call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) [[PTR]], ptr addrspace(5) [[ELEM_PTR]], i32 8, i1 false)
+; CHECK-NEXT: ret void
+;
+ %ptr = alloca [4 x float], align 4, addrspace(5)
+ %elem_ptr = getelementptr [4 x float], ptr addrspace(5) %ptr, i32 0, i32 42
+ call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) %ptr, ptr addrspace(5) %elem_ptr, i32 8, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @in_bounds() {
+; CHECK-LABEL: define amdgpu_kernel void @in_bounds() {
+; CHECK-NEXT: [[PTR:%.*]] = freeze <4 x float> poison
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[PTR]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+; CHECK-NEXT: ret void
+;
+ %ptr = alloca [4 x float], align 4, addrspace(5)
+ %elem_ptr = getelementptr [4 x float], ptr addrspace(5) %ptr, i32 0, i32 2
+ call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) %ptr, ptr addrspace(5) %elem_ptr, i32 8, i1 false)
+ ret void
+}
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) writeonly captures(none), ptr addrspace(5) readonly captures(none), i32, i1 immarg) #0
+
+attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
|
| if (GEP->getPointerOperand()->stripPointerCasts() != Alloca || | ||
| !GEP->collectOffset(DL, BW, VarOffsets, ConstOffset)) | ||
| !GEP->collectOffset(DL, BW, VarOffsets, ConstOffset) || | ||
| ConstOffset.getZExtValue() >= Alloca->getAllocationSize(DL)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You shouldn't need to consider the out of bounds here, this seems far removed from the failure point and condition
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I feel like this is the point where things start to go wrong. The situation in which GEPToVectorIndex is called looks like this:
llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
Lines 916 to 927 in 9b63bdd
| if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) { | |
| // If we can't compute a vector index from this GEP, then we can't | |
| // promote this alloca to vector. | |
| Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts); | |
| if (!Index) | |
| return RejectUser(Inst, "cannot compute vector index for GEP"); | |
| GEPVectorIdx[GEP] = Index; | |
| UsersToRemove.push_back(Inst); | |
| continue; | |
| } | |
// If we can't compute a vector index from this GEP, then we can't
// promote this alloca to vector.
Doesn't this include what I'd like to achieve?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
An constant out of bounds index is still a computable index
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
True, I separated it.
It would also be possible to implement the check specifically here (SrcBegin):
llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
Lines 663 to 675 in 84c1564
| unsigned SrcBegin = TI->SrcIndex->getZExtValue(); | |
| unsigned DestBegin = TI->DestIndex->getZExtValue(); | |
| SmallVector<int> Mask; | |
| for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) { | |
| if (Idx >= DestBegin && Idx < DestBegin + NumCopied) { | |
| Mask.push_back(SrcBegin++); | |
| } else { | |
| Mask.push_back(Idx); | |
| } | |
| } | |
| return Builder.CreateShuffleVector(GetOrLoadCurrentVectorValue(), Mask); |
but that wouldn't cover as many cases and I think that doesn't hurt (?)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Mostly you just need to avoid crashing in the compiler, and you should be able to ignore the index. It was UB and you are at most downgrading it to a poison value
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hm, so you think I should replace it with poison for shufflevector?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There's no reason to reject it, and you should proceed. The out of bounds index will just produce a poison value in the vector indexing
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Well, at least for shufflevector it would crash:
opt: /work1/omp-nightly/build/git/trunk21.0/llvm-project/llvm/lib/IR/Instructions.cpp:1756: llvm::ShuffleVectorInst::ShuffleVectorInst(llvm::Value*, llvm::Value*, llvm::ArrayRef<int>, const llvm::Twine&, llvm::InsertPosition): Assertion `isValidOperands(V1, V2, Mask) && "Invalid shuffle vector instruction operands!"' failed.
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace.
Stack dump:
0. Program arguments: /COD/LATEST/trunk/bin/opt -mtriple amdgcn -passes=amdgpu-promote-alloca-to-vector r.ll -S
1. Running pass "function(amdgpu-promote-alloca-to-vector)" on module "r.ll"
2. Running pass "amdgpu-promote-alloca-to-vector" on function "promote_memcpy_two_aggrs"
So wouldn't it make more sense to disallow it more broadly?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm saying you should be guarding at the point of failure in the minimal way. Which in this case probably means fixing the shuffle mask entries?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see, I think I now get what you mean
arsenm
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM except unrelated test changes
|
Patch title is also out of date |
This LLVM defect was identified via the AMD Fuzzing project.
40ac917 to
dce5aea
Compare
|
cleaned up |
Co-authored-by: Matt Arsenault <[email protected]>
|
merged, thanks for the reviews :) |
EDIT: Updated title and text of this PR
This LLVM defect was identified via the AMD Fuzzing project.
Note: The original (reduced) test case was:
This uncovers other potential issues, especially using
i1 1(=i1 true) as an index.getelementptrsign extends the index value to the size of the pointer index type (i32), which in this case leads to a very large i32 value and not toi32 1. This might need a more general fix. But promote-alloca should nonetheless be able to handle wrong indices, I think. That's why I started with this PR.