Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 10 additions & 7 deletions llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -443,9 +443,10 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
return nullptr;

APInt IndexQuot;
uint64_t Rem;
APInt::udivrem(ConstOffset, VecElemSize, IndexQuot, Rem);
if (Rem != 0)
APInt Rem;
APInt::sdivrem(ConstOffset, APInt(ConstOffset.getBitWidth(), VecElemSize),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you could still use the form of sdivrem that takes an int64_t dividor, right? Here and below.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed, I've missed that variant. I opened PR #157864 to address this and your other comment; I'd rather not revert this PR for NFC improvements since it fixed a buildbot failure.

IndexQuot, Rem);
if (!Rem.isZero())
return nullptr;
if (VarOffsets.size() == 0)
return ConstantInt::get(GEP->getContext(), IndexQuot);
Expand All @@ -454,8 +455,10 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,

const auto &VarOffset = VarOffsets.front();
APInt OffsetQuot;
APInt::udivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem);
if (Rem != 0 || OffsetQuot.isZero())
APInt::sdivrem(VarOffset.second,
APInt(VarOffset.second.getBitWidth(), VecElemSize), OffsetQuot,
Rem);
if (!Rem.isZero() || OffsetQuot.isZero())
return nullptr;

Value *Offset = VarOffset.first;
Expand All @@ -465,7 +468,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,

if (!OffsetQuot.isOne()) {
ConstantInt *ConstMul =
ConstantInt::get(OffsetType, OffsetQuot.getZExtValue());
ConstantInt::get(OffsetType, OffsetQuot.getSExtValue());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems odd to convert APInt -> int64_t -> APInt here. Could you construct the ConstantInt directly from OffsetQuot.sext(OffsetType.getBitWidth()) ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See #157864

Offset = Builder.CreateMul(Offset, ConstMul);
if (Instruction *NewInst = dyn_cast<Instruction>(Offset))
NewInsts.push_back(NewInst);
Expand All @@ -474,7 +477,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
return Offset;

ConstantInt *ConstIndex =
ConstantInt::get(OffsetType, IndexQuot.getZExtValue());
ConstantInt::get(OffsetType, IndexQuot.getSExtValue());
Value *IndexAdd = Builder.CreateAdd(ConstIndex, Offset);
if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd))
NewInsts.push_back(NewInst);
Expand Down
63 changes: 63 additions & 0 deletions llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 < %s | FileCheck %s

; Check that the extracted index is correctly sign-extended when 32-bit scratch
; address arithmetic is promoted to 64-bit vector index arithmetic.

define amdgpu_kernel void @negative_index_byte(ptr %out, i64 %offset) {
; CHECK-LABEL: @negative_index_byte(
; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i8> [[STACK]], i8 0, i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> [[TMP1]], i8 1, i32 1
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 2, i32 2
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> [[TMP3]], i8 3, i32 3
; CHECK-NEXT: [[TMP5:%.*]] = add i64 -1, [[OFFSET:%.*]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i64 [[TMP5]]
; CHECK-NEXT: store i8 [[TMP6]], ptr [[OUT:%.*]], align 1
; CHECK-NEXT: ret void
;
; The private-memory array is expected to be promoted to a <4 x i8> vector
; (see the freeze/insertelement CHECK lines above); the four stores become
; insertelement operations.
%stack = alloca [4 x i8], align 4, addrspace(5)
%gep.0 = getelementptr inbounds [4 x i8], ptr addrspace(5) %stack, i64 0, i64 0
%gep.1 = getelementptr inbounds [4 x i8], ptr addrspace(5) %stack, i64 0, i64 1
%gep.2 = getelementptr inbounds [4 x i8], ptr addrspace(5) %stack, i64 0, i64 2
%gep.3 = getelementptr inbounds [4 x i8], ptr addrspace(5) %stack, i64 0, i64 3
store i8 0, ptr addrspace(5) %gep.0
store i8 1, ptr addrspace(5) %gep.1
store i8 2, ptr addrspace(5) %gep.2
store i8 3, ptr addrspace(5) %gep.3
; %cgep layers a constant -1 element (-1 byte for i8) on top of the variable
; index %offset. The pass must treat that constant offset as SIGNED when it
; converts the byte offset to a vector lane index: the expected output is
; "add i64 -1, %offset" (CHECK TMP5), not a zero-extended positive constant.
%vgep = getelementptr inbounds [4 x i8], ptr addrspace(5) %stack, i64 0, i64 %offset
%cgep = getelementptr inbounds [4 x i8], ptr addrspace(5) %vgep, i64 0, i64 -1
%load = load i8, ptr addrspace(5) %cgep
store i8 %load, ptr %out
ret void
}

define amdgpu_kernel void @negative_index_word(ptr %out, i64 %offset) {
; CHECK-LABEL: @negative_index_word(
; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i32> poison
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[STACK]], i32 0, i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 1, i32 1
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 2, i32 2
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 3, i32 3
; CHECK-NEXT: [[TMP5:%.*]] = add i64 -1, [[OFFSET:%.*]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i64 [[TMP5]]
; CHECK-NEXT: store i32 [[TMP6]], ptr [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
; Same shape as @negative_index_byte but with 4-byte elements, so the
; byte-offset-to-lane division is non-trivial: the constant GEP index -1
; corresponds to a -4 byte offset, which must divide (signed) by the
; element size 4 to yield lane offset -1 (CHECK TMP5).
%stack = alloca [4 x i32], align 4, addrspace(5)
%gep.0 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i64 0, i64 0
%gep.1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i64 0, i64 1
%gep.2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i64 0, i64 2
%gep.3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i64 0, i64 3
store i32 0, ptr addrspace(5) %gep.0
store i32 1, ptr addrspace(5) %gep.1
store i32 2, ptr addrspace(5) %gep.2
store i32 3, ptr addrspace(5) %gep.3
; Variable index plus constant -1 element; after promotion the index math
; is "add i64 -1, %offset" — an unsigned (zext) treatment of the constant
; would produce a large positive value here instead.
%vgep = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i64 0, i64 %offset
%cgep = getelementptr inbounds [4 x i32], ptr addrspace(5) %vgep, i64 0, i64 -1
%load = load i32, ptr addrspace(5) %cgep
store i32 %load, ptr %out
ret void
}