llvm · harrisonGPU · Jun 2, 2025 · May 22, 2025 · May 23, 2025 · May 23, 2025
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -437,9 +437,34 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
   unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
   SmallMapVector<Value *, APInt, 4> VarOffsets;
   APInt ConstOffset(BW, 0);
-  if (GEP->getPointerOperand()->stripPointerCasts() != Alloca ||
-      !GEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
-    return nullptr;
+
+  // Walk backwards through nested GEPs to collect both constant and variable
+  // offsets, so that nested vector GEP chains can be lowered in one step.
+  //
+  // Given this IR fragment as input:
+  //
+  //   %0 = alloca [10 x <2 x i32>], align 8, addrspace(5)
+  //   %1 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %0, i32 0, i32 %j
+  //   %2 = getelementptr i8, ptr addrspace(5) %1, i32 4
+  //   %3 = load i32, ptr addrspace(5) %2, align 4
+  //
+  // Combine both GEP operations in a single pass, producing:
+  //   BasePtr      = %0
+  //   ConstOffset  = 4
+  //   VarOffsets   = { %j -> element_size(<2 x i32>) }
+  //
+  // That lets us emit a single buffer_load directly into a VGPR, without ever
+  // allocating scratch memory for the intermediate pointer.
+  Value *CurPtr = GEP;
+  while (auto *CurGEP = dyn_cast<GetElementPtrInst>(CurPtr)) {
+    if (!CurGEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
+      return nullptr;
+
+    // Move to the next outer pointer.
+    CurPtr = CurGEP->getPointerOperand();
+  }
+
+  assert(CurPtr == Alloca && "GEP not based on alloca");
 for (AllocaInst *AI : Allocas) { 
   const unsigned AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType()); 
   // First, check if we have enough budget to vectorize this alloca. 
   if (AllocaCost <= VectorizationBudget) { 
     // If we do, attempt vectorization, otherwise, fall through and try 
     // promoting to LDS instead. 
     if (tryPromoteAllocaToVector(*AI)) { 
 for (AllocaInst *AI : Allocas) { 
   const unsigned AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType()); 
   // First, check if we have enough budget to vectorize this alloca. 
   if (AllocaCost <= VectorizationBudget) { 
     // If we do, attempt vectorization, otherwise, fall through and try 
     // promoting to LDS instead. 
     if (tryPromoteAllocaToVector(*AI)) { 
 
   unsigned VecElemSize = DL.getTypeAllocSize(VecElemTy);
   if (VarOffsets.size() > 1)

diff --git a/llvm/test/CodeGen/AMDGPU/amdpal.ll b/llvm/test/CodeGen/AMDGPU/amdpal.ll
@@ -70,7 +70,7 @@ entry:
   store i32 %extra, ptr addrspace(5) %v
   store <2 x i32> %in, ptr addrspace(5) %v1
   %e = getelementptr [2 x i32], ptr addrspace(5) %v1, i32 0, i32 %idx
-  %x = load i32, ptr addrspace(5) %e
+  %x = load volatile i32, ptr addrspace(5) %e
   %xf = bitcast i32 %x to float
   call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %xf, ptr addrspace(8) poison, i32 0, i32 0, i32 0)
   ret void

diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep-of-gep.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep-of-gep.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
+define amdgpu_ps void @scalar_alloca_ptr_with_vector_gep_of_gep(i32 %idx, ptr addrspace(1) %output) #0 {
+; CHECK-LABEL: define amdgpu_ps void @scalar_alloca_ptr_with_vector_gep_of_gep(
+; CHECK-SAME: i32 [[IDX:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[BUF:%.*]] = freeze <20 x i32> poison
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i32 [[IDX]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <20 x i32> [[BUF]], i32 1, i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <20 x i32> [[TMP1]], i32 2, i32 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i32 [[IDX]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 1, [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <20 x i32> [[TMP3]], i32 [[TMP5]]
+; CHECK-NEXT:    store i32 [[TMP6]], ptr addrspace(1) [[OUTPUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %alloca = alloca [10 x <2 x i32>], align 8, addrspace(5)
+  %row = getelementptr [10 x <2 x i32>], ptr addrspace(5) %alloca, i32 0, i32 %idx
+  store <2 x i32> <i32 1, i32 2>, ptr addrspace(5) %row, align 8
+  %elt1 = getelementptr i8, ptr addrspace(5) %row, i32 4
+  %val = load i32, ptr addrspace(5) %elt1, align 4
+  store i32 %val, ptr addrspace(1) %output
+  ret void
+}
+
+define amdgpu_ps void @scalar_alloca_ptr_with_vector_gep_of_gep3(i32 %idx, ptr addrspace(1) %output) #0 {
+; CHECK-LABEL: define amdgpu_ps void @scalar_alloca_ptr_with_vector_gep_of_gep3(
+; CHECK-SAME: i32 [[IDX:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[ALLOCA:%.*]] = freeze <16 x i32> poison
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i32 [[IDX]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 8, [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <16 x i32> [[ALLOCA]], i32 10, i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <16 x i32> [[TMP2]], i32 20, i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[IDX]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = add i32 9, [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <16 x i32> [[TMP4]], i32 [[TMP6]]
+; CHECK-NEXT:    store i32 [[TMP7]], ptr addrspace(1) [[OUTPUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %alloca = alloca [2 x [4 x <2 x i32>]], align 8, addrspace(5)
+  %lvl1 = getelementptr inbounds [2 x [4 x <2 x i32>]], ptr addrspace(5) %alloca, i32 0, i32 1
+  %lvl2 = getelementptr inbounds [4 x <2 x i32>], ptr addrspace(5) %lvl1, i32 0, i32 %idx
+  store <2 x i32> <i32 10, i32 20>, ptr addrspace(5) %lvl2, align 8
+  %byte = getelementptr inbounds i8, ptr addrspace(5) %lvl2, i32 4
+  %val = load i32, ptr addrspace(5) %byte, align 4
+  store i32 %val, ptr addrspace(1) %output
+  ret void
+}
+
+define amdgpu_ps void @scalar_alloca_ptr_with_vector_gep_twice_idx(i32 %idx, ptr addrspace(1) %out) #0 {
+; CHECK-LABEL: define amdgpu_ps void @scalar_alloca_ptr_with_vector_gep_twice_idx(
+; CHECK-SAME: i32 [[IDX:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[BUF:%.*]] = freeze <20 x i32> poison
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i32 [[IDX]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <20 x i32> [[BUF]], i32 1, i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <20 x i32> [[TMP4]], i32 2, i32 [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[IDX]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <20 x i32> [[TMP3]], i32 [[TMP1]]
+; CHECK-NEXT:    store i32 [[TMP2]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %alloca = alloca [10 x [2 x i32]], align 8, addrspace(5)
+  %row = getelementptr inbounds [10 x [2 x i32]], ptr addrspace(5) %alloca, i32 0, i32 %idx
+  store <2 x i32> <i32 1, i32 2>, ptr addrspace(5) %row, align 8
+  %elt = getelementptr inbounds [2 x i32], ptr addrspace(5) %row, i32 0, i32 %idx
+  %val = load i32, ptr addrspace(5) %elt, align 4
+  store i32 %val, ptr addrspace(1) %out
+  ret void
+}
+
+attributes #0 = { "amdgpu-promote-alloca-to-vector-max-regs"="32" }