
Commit a23141f

ritter-x2a authored and github-actions[bot] committed
Automerge: [AMDGPU] Treat GEP offsets as signed in AMDGPUPromoteAlloca (#157682)
[AMDGPU] Treat GEP offsets as signed in AMDGPUPromoteAlloca

AMDGPUPromoteAlloca can transform i32 GEP offsets that operate on allocas into i64 extractelement indices. Before this patch, negative GEP offsets would be zero-extended, leading to wrong extractelement indices with values around 2**32-1.

This fixes the failing LlvmLibcCharacterConverterUTF32To8Test tests for AMDGPU.
2 parents a49a722 + b965f26 commit a23141f

2 files changed: +73 −7 lines changed


llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Lines changed: 10 additions & 7 deletions

@@ -443,9 +443,10 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
     return nullptr;
 
   APInt IndexQuot;
-  uint64_t Rem;
-  APInt::udivrem(ConstOffset, VecElemSize, IndexQuot, Rem);
-  if (Rem != 0)
+  APInt Rem;
+  APInt::sdivrem(ConstOffset, APInt(ConstOffset.getBitWidth(), VecElemSize),
+                 IndexQuot, Rem);
+  if (!Rem.isZero())
     return nullptr;
   if (VarOffsets.size() == 0)
     return ConstantInt::get(GEP->getContext(), IndexQuot);
@@ -454,8 +455,10 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
 
   const auto &VarOffset = VarOffsets.front();
   APInt OffsetQuot;
-  APInt::udivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem);
-  if (Rem != 0 || OffsetQuot.isZero())
+  APInt::sdivrem(VarOffset.second,
+                 APInt(VarOffset.second.getBitWidth(), VecElemSize), OffsetQuot,
+                 Rem);
+  if (!Rem.isZero() || OffsetQuot.isZero())
     return nullptr;
 
   Value *Offset = VarOffset.first;
@@ -465,7 +468,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
 
   if (!OffsetQuot.isOne()) {
     ConstantInt *ConstMul =
-        ConstantInt::get(OffsetType, OffsetQuot.getZExtValue());
+        ConstantInt::get(OffsetType, OffsetQuot.getSExtValue());
     Offset = Builder.CreateMul(Offset, ConstMul);
     if (Instruction *NewInst = dyn_cast<Instruction>(Offset))
       NewInsts.push_back(NewInst);
@@ -474,7 +477,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
     return Offset;
 
   ConstantInt *ConstIndex =
-      ConstantInt::get(OffsetType, IndexQuot.getZExtValue());
+      ConstantInt::get(OffsetType, IndexQuot.getSExtValue());
   Value *IndexAdd = Builder.CreateAdd(ConstIndex, Offset);
   if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd))
     NewInsts.push_back(NewInst);
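
For illustration, here is a minimal standalone sketch (not part of the commit) of the behavior difference the hunks above address, assuming LLVM's ADT headers are available. With the old APInt::udivrem call, a negative 32-bit constant offset such as -4 is divided as the unsigned value 0xFFFFFFFC and produces a huge quotient; APInt::sdivrem divides it as a signed value and yields the intended negative index. The offset and element size below are made-up example values.

// Standalone sketch: compare unsigned vs. signed APInt division for a
// negative GEP byte offset. Values (-4 offset, 4-byte elements) are
// illustrative only.
#include "llvm/ADT/APInt.h"
#include <cstdio>

int main() {
  using llvm::APInt;

  APInt ConstOffset(/*numBits=*/32, /*val=*/-4, /*isSigned=*/true);
  APInt VecElemSize(/*numBits=*/32, /*val=*/4);

  APInt UQuot, URem, SQuot, SRem;
  // Unsigned division reads -4 as 0xFFFFFFFC and yields quotient 0x3FFFFFFF.
  APInt::udivrem(ConstOffset, VecElemSize, UQuot, URem);
  // Signed division reads -4 as -4 and yields quotient -1, the intended index.
  APInt::sdivrem(ConstOffset, VecElemSize, SQuot, SRem);

  std::printf("udivrem quotient = %llu\n",
              (unsigned long long)UQuot.getZExtValue());
  std::printf("sdivrem quotient = %lld\n", (long long)SQuot.getSExtValue());
  return 0;
}

In the patch, the switch to sdivrem is paired with getSExtValue() so that the computed index keeps its sign when materialized as a constant of the wider i64 index type.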
Lines changed: 63 additions & 0 deletions

@@ -0,0 +1,63 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 < %s | FileCheck %s

; Check that the extracted index is correctly sign-extended when 32-bit scratch
; address arithmetic is promoted to 64-bit vector index arithmetic.

define amdgpu_kernel void @negative_index_byte(ptr %out, i64 %offset) {
; CHECK-LABEL: @negative_index_byte(
; CHECK-NEXT:    [[STACK:%.*]] = freeze <4 x i8> poison
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i8> [[STACK]], i8 0, i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> [[TMP1]], i8 1, i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 2, i32 2
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i8> [[TMP3]], i8 3, i32 3
; CHECK-NEXT:    [[TMP5:%.*]] = add i64 -1, [[OFFSET:%.*]]
; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i64 [[TMP5]]
; CHECK-NEXT:    store i8 [[TMP6]], ptr [[OUT:%.*]], align 1
; CHECK-NEXT:    ret void
;
  %stack = alloca [4 x i8], align 4, addrspace(5)
  %gep.0 = getelementptr inbounds [4 x i8], ptr addrspace(5) %stack, i64 0, i64 0
  %gep.1 = getelementptr inbounds [4 x i8], ptr addrspace(5) %stack, i64 0, i64 1
  %gep.2 = getelementptr inbounds [4 x i8], ptr addrspace(5) %stack, i64 0, i64 2
  %gep.3 = getelementptr inbounds [4 x i8], ptr addrspace(5) %stack, i64 0, i64 3
  store i8 0, ptr addrspace(5) %gep.0
  store i8 1, ptr addrspace(5) %gep.1
  store i8 2, ptr addrspace(5) %gep.2
  store i8 3, ptr addrspace(5) %gep.3
  %vgep = getelementptr inbounds [4 x i8], ptr addrspace(5) %stack, i64 0, i64 %offset
  %cgep = getelementptr inbounds [4 x i8], ptr addrspace(5) %vgep, i64 0, i64 -1
  %load = load i8, ptr addrspace(5) %cgep
  store i8 %load, ptr %out
  ret void
}

define amdgpu_kernel void @negative_index_word(ptr %out, i64 %offset) {
; CHECK-LABEL: @negative_index_word(
; CHECK-NEXT:    [[STACK:%.*]] = freeze <4 x i32> poison
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> [[STACK]], i32 0, i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 1, i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 2, i32 2
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 3, i32 3
; CHECK-NEXT:    [[TMP5:%.*]] = add i64 -1, [[OFFSET:%.*]]
; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i64 [[TMP5]]
; CHECK-NEXT:    store i32 [[TMP6]], ptr [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
  %stack = alloca [4 x i32], align 4, addrspace(5)
  %gep.0 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i64 0, i64 0
  %gep.1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i64 0, i64 1
  %gep.2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i64 0, i64 2
  %gep.3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i64 0, i64 3
  store i32 0, ptr addrspace(5) %gep.0
  store i32 1, ptr addrspace(5) %gep.1
  store i32 2, ptr addrspace(5) %gep.2
  store i32 3, ptr addrspace(5) %gep.3
  %vgep = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i64 0, i64 %offset
  %cgep = getelementptr inbounds [4 x i32], ptr addrspace(5) %vgep, i64 0, i64 -1
  %load = load i32, ptr addrspace(5) %cgep
  store i32 %load, ptr %out
  ret void
}
