
Commit a23141f

ritter-x2a authored and github-actions[bot] committed
Automerge: [AMDGPU] Treat GEP offsets as signed in AMDGPUPromoteAlloca (#157682)
[AMDGPU] Treat GEP offsets as signed in AMDGPUPromoteAlloca

AMDGPUPromoteAlloca can transform i32 GEP offsets that operate on allocas into i64 extractelement indices. Before this patch, negative GEP offsets would be zero-extended, leading to wrong extractelement indices with values around 2**32-1.

This fixes the failing LlvmLibcCharacterConverterUTF32To8Test tests for AMDGPU.
2 parents a49a722 + b965f26 commit a23141f

2 files changed: +73 −7 lines changed


llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Lines changed: 10 additions & 7 deletions

@@ -443,9 +443,10 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
     return nullptr;
 
   APInt IndexQuot;
-  uint64_t Rem;
-  APInt::udivrem(ConstOffset, VecElemSize, IndexQuot, Rem);
-  if (Rem != 0)
+  APInt Rem;
+  APInt::sdivrem(ConstOffset, APInt(ConstOffset.getBitWidth(), VecElemSize),
+                 IndexQuot, Rem);
+  if (!Rem.isZero())
     return nullptr;
   if (VarOffsets.size() == 0)
     return ConstantInt::get(GEP->getContext(), IndexQuot);
@@ -454,8 +455,10 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
 
   const auto &VarOffset = VarOffsets.front();
   APInt OffsetQuot;
-  APInt::udivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem);
-  if (Rem != 0 || OffsetQuot.isZero())
+  APInt::sdivrem(VarOffset.second,
+                 APInt(VarOffset.second.getBitWidth(), VecElemSize), OffsetQuot,
+                 Rem);
+  if (!Rem.isZero() || OffsetQuot.isZero())
     return nullptr;
 
   Value *Offset = VarOffset.first;
@@ -465,7 +468,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
 
   if (!OffsetQuot.isOne()) {
     ConstantInt *ConstMul =
-        ConstantInt::get(OffsetType, OffsetQuot.getZExtValue());
+        ConstantInt::get(OffsetType, OffsetQuot.getSExtValue());
     Offset = Builder.CreateMul(Offset, ConstMul);
     if (Instruction *NewInst = dyn_cast<Instruction>(Offset))
       NewInsts.push_back(NewInst);
@@ -474,7 +477,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
     return Offset;
 
   ConstantInt *ConstIndex =
-      ConstantInt::get(OffsetType, IndexQuot.getZExtValue());
+      ConstantInt::get(OffsetType, IndexQuot.getSExtValue());
   Value *IndexAdd = Builder.CreateAdd(ConstIndex, Offset);
   if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd))
     NewInsts.push_back(NewInst);
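
For illustration, here is a minimal standalone sketch (not part of the commit) of the behavior difference the hunks above address, assuming LLVM's ADT headers are available. With the old APInt::udivrem call, a negative 32-bit constant offset such as -4 is divided as the unsigned value 0xFFFFFFFC and produces a huge quotient; APInt::sdivrem divides it as a signed value and yields the intended negative index. The offset and element size below are made-up example values.

// Standalone sketch: compare unsigned vs. signed APInt division for a
// negative GEP byte offset. Values (-4 offset, 4-byte elements) are
// illustrative only.
#include "llvm/ADT/APInt.h"
#include <cstdio>

int main() {
  using llvm::APInt;

  APInt ConstOffset(/*numBits=*/32, /*val=*/-4, /*isSigned=*/true);
  APInt VecElemSize(/*numBits=*/32, /*val=*/4);

  APInt UQuot, URem, SQuot, SRem;
  // Unsigned division reads -4 as 0xFFFFFFFC and yields quotient 0x3FFFFFFF.
  APInt::udivrem(ConstOffset, VecElemSize, UQuot, URem);
  // Signed division reads -4 as -4 and yields quotient -1, the intended index.
  APInt::sdivrem(ConstOffset, VecElemSize, SQuot, SRem);

  std::printf("udivrem quotient = %llu\n",
              (unsigned long long)UQuot.getZExtValue());
  std::printf("sdivrem quotient = %lld\n", (long long)SQuot.getSExtValue());
  return 0;
}

In the patch, the switch to sdivrem is paired with getSExtValue() so that the computed index keeps its sign when materialized as a constant of the wider i64 index type.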
Lines changed: 63 additions & 0 deletions

@@ -0,0 +1,63 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 < %s | FileCheck %s

; Check that the extracted index is correctly sign-extended when 32-bit scratch
; address arithmetic is promoted to 64-bit vector index arithmetic.

define amdgpu_kernel void @negative_index_byte(ptr %out, i64 %offset) {
; CHECK-LABEL: @negative_index_byte(
; CHECK-NEXT:    [[STACK:%.*]] = freeze <4 x i8> poison
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i8> [[STACK]], i8 0, i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> [[TMP1]], i8 1, i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 2, i32 2
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i8> [[TMP3]], i8 3, i32 3
; CHECK-NEXT:    [[TMP5:%.*]] = add i64 -1, [[OFFSET:%.*]]
; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i64 [[TMP5]]
; CHECK-NEXT:    store i8 [[TMP6]], ptr [[OUT:%.*]], align 1
; CHECK-NEXT:    ret void
;
  %stack = alloca [4 x i8], align 4, addrspace(5)
  %gep.0 = getelementptr inbounds [4 x i8], ptr addrspace(5) %stack, i64 0, i64 0
  %gep.1 = getelementptr inbounds [4 x i8], ptr addrspace(5) %stack, i64 0, i64 1
  %gep.2 = getelementptr inbounds [4 x i8], ptr addrspace(5) %stack, i64 0, i64 2
  %gep.3 = getelementptr inbounds [4 x i8], ptr addrspace(5) %stack, i64 0, i64 3
  store i8 0, ptr addrspace(5) %gep.0
  store i8 1, ptr addrspace(5) %gep.1
  store i8 2, ptr addrspace(5) %gep.2
  store i8 3, ptr addrspace(5) %gep.3
  %vgep = getelementptr inbounds [4 x i8], ptr addrspace(5) %stack, i64 0, i64 %offset
  %cgep = getelementptr inbounds [4 x i8], ptr addrspace(5) %vgep, i64 0, i64 -1
  %load = load i8, ptr addrspace(5) %cgep
  store i8 %load, ptr %out
  ret void
}

define amdgpu_kernel void @negative_index_word(ptr %out, i64 %offset) {
; CHECK-LABEL: @negative_index_word(
; CHECK-NEXT:    [[STACK:%.*]] = freeze <4 x i32> poison
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> [[STACK]], i32 0, i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 1, i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 2, i32 2
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 3, i32 3
; CHECK-NEXT:    [[TMP5:%.*]] = add i64 -1, [[OFFSET:%.*]]
; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i64 [[TMP5]]
; CHECK-NEXT:    store i32 [[TMP6]], ptr [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
  %stack = alloca [4 x i32], align 4, addrspace(5)
  %gep.0 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i64 0, i64 0
  %gep.1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i64 0, i64 1
  %gep.2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i64 0, i64 2
  %gep.3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i64 0, i64 3
  store i32 0, ptr addrspace(5) %gep.0
  store i32 1, ptr addrspace(5) %gep.1
  store i32 2, ptr addrspace(5) %gep.2
  store i32 3, ptr addrspace(5) %gep.3
  %vgep = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i64 0, i64 %offset
  %cgep = getelementptr inbounds [4 x i32], ptr addrspace(5) %vgep, i64 0, i64 -1
  %load = load i32, ptr addrspace(5) %cgep
  store i32 %load, ptr %out
  ret void
}
