
Commit 244905c

[AMDGPU] Sink uniform buffer address offsets into soffset
This patch implements an optimization that partitions MUBUF load/store offsets into vector and scalar components for better address coalescing and reduced VGPR pressure. It transforms buffer operations where voffset = add(uniform, divergent) by moving the uniform part to soffset and keeping the divergent part in voffset.

Before:
  v_add_u32 v1, v0, sN
  buffer_{load,store}_T v*, v1, s[bufDesc:bufDesc+3] offen

After:
  buffer_{load,store}_T v*, v0, s[bufDesc:bufDesc+3], sN offen

The optimization currently applies to raw buffer loads/stores when soffset is initially zero. The tests include comprehensive validation of both buffer loads and stores across the supported variants (i8, i16, i32, vectors, floats), with positive and negative test cases.
1 parent: cc5185b
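As a rough IR-level sketch of the rewrite described above (not taken from the patch's tests; the function and value names @sink_example, %base, and %tid are made up here), a raw buffer load whose voffset adds a uniform kernel argument to the divergent workitem id would be expected to have the uniform addend sunk into soffset:

declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg)

define amdgpu_kernel void @sink_example(<4 x i32> %rsrc, i32 %base,
                                        ptr addrspace(1) %out) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()   ; divergent per-lane id
  %voffset = add i32 %base, %tid                 ; uniform + divergent
  %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %rsrc,
                                                   i32 %voffset, i32 0, i32 0)
  store i32 %val, ptr addrspace(1) %out
  ret void
}

; After AMDGPUCodeGenPrepare, the expectation is:
;   %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %rsrc,
;                                                    i32 %tid, i32 %base, i32 0)
; i.e. only the divergent %tid stays in voffset and the uniform %base moves
; into soffset, matching the buffer_load soffset form in the commit message.

Running just this pass over such a snippet would presumably look something like opt -S -mtriple=amdgcn-- -passes=amdgpu-codegenprepare; the commit's own tests cover the full set of load/store variants.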

5 files changed: +1015, -10 lines


llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

Lines changed: 79 additions & 0 deletions
@@ -260,6 +260,7 @@ class AMDGPUCodeGenPrepareImpl
   bool visitIntrinsicInst(IntrinsicInst &I);
   bool visitFMinLike(IntrinsicInst &I);
   bool visitSqrt(IntrinsicInst &I);
+  bool visitBufferIntrinsic(IntrinsicInst &I);
   bool run();
 };

@@ -1910,6 +1911,15 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
     return visitFMinLike(I);
   case Intrinsic::sqrt:
     return visitSqrt(I);
+  case Intrinsic::amdgcn_raw_buffer_load:
+  case Intrinsic::amdgcn_raw_buffer_load_format:
+  case Intrinsic::amdgcn_raw_buffer_store:
+  case Intrinsic::amdgcn_raw_buffer_store_format:
+  case Intrinsic::amdgcn_raw_ptr_buffer_load:
+  case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
+  case Intrinsic::amdgcn_raw_ptr_buffer_store:
+  case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
+    return visitBufferIntrinsic(I);
   default:
     return false;
   }
@@ -2046,6 +2056,75 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
   return true;
 }

+/// Sink uniform addends in buffer address calculations into soffset.
+///
+/// Transforms buffer loads/stores with voffset = add(uniform, divergent)
+/// into voffset = divergent, soffset = uniform for better address coalescing
+/// Only applies to raw buffer operations with soffset initially zero.
+bool AMDGPUCodeGenPrepareImpl::visitBufferIntrinsic(IntrinsicInst &I) {
+  Intrinsic::ID IID = I.getIntrinsicID();
+  bool IsLoad = (IID == Intrinsic::amdgcn_raw_buffer_load ||
+                 IID == Intrinsic::amdgcn_raw_buffer_load_format ||
+                 IID == Intrinsic::amdgcn_raw_ptr_buffer_load ||
+                 IID == Intrinsic::amdgcn_raw_ptr_buffer_load_format);
+  bool IsStore = (IID == Intrinsic::amdgcn_raw_buffer_store ||
+                  IID == Intrinsic::amdgcn_raw_buffer_store_format ||
+                  IID == Intrinsic::amdgcn_raw_ptr_buffer_store ||
+                  IID == Intrinsic::amdgcn_raw_ptr_buffer_store_format);
+
+  if (!IsLoad && !IsStore)
+    return false;
+
+  // Buffer intrinsic operand layout (same for vector and pointer descriptor):
+  // Load: (rsrc, voffset, soffset, cachepolicy)
+  // Store: (vdata, rsrc, voffset, soffset, cachepolicy)
+  const unsigned VOffsetIdx = IsStore ? 2 : 1;
+  const unsigned SOffsetIdx = IsStore ? 3 : 2;
+
+  Value *VOffset = I.getArgOperand(VOffsetIdx);
+  Value *SOffset = I.getArgOperand(SOffsetIdx);
+
+  // Only optimize when soffset is currently zero
+  if (!match(SOffset, m_Zero()))
+    return false;
+
+  // Pattern match: voffset = add(uniform, divergent)
+  Value *LHS, *RHS;
+  if (!match(VOffset, m_Add(m_Value(LHS), m_Value(RHS))))
+    return false;
+
+  bool LHSUniform = UA.isUniform(LHS);
+  bool RHSUniform = UA.isUniform(RHS);
+
+  // Need exactly one uniform and one divergent operand.
+  // TODO: Handle the case where both are uniform.
+  if (LHSUniform == RHSUniform)
+    return false;
+
+  Value *UniformAddend = LHSUniform ? LHS : RHS;
+  Value *DivergentAddend = LHSUniform ? RHS : LHS;
+
+  // Skip if the uniform addend is a non-negative constant that fits in the
+  // 12-bit immediate offset field. The backend will fold it into the immediate
+  // field, which avoids consuming an soffset operand.
+  // Negative or large constants must use soffset.
+  if (auto *CI = dyn_cast<ConstantInt>(UniformAddend)) {
+    int64_t Offset = CI->getSExtValue();
+    if (Offset >= 0 && Offset <= 4095)
+      return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "AMDGPUCodeGenPrepare: Sinking uniform addend into "
+                       "soffset for buffer "
+                    << (IsStore ? "store" : "load") << ": " << I << '\n');
+
+  // Update voffset and soffset operands
+  I.setArgOperand(VOffsetIdx, DivergentAddend);
+  I.setArgOperand(SOffsetIdx, UniformAddend);
+
+  return true;
+}
+
 bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
   if (skipFunction(F))
     return false;
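For contrast, here is a hedged sketch (again hypothetical, not one of the commit's actual test cases) of the negative case that the constant-offset early-out above is aimed at: the uniform addend is a small non-negative constant, so the intrinsic is left alone and the backend can fold the constant into the 12-bit immediate offset field instead of occupying soffset.

declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg)

define amdgpu_kernel void @no_sink_small_imm(<4 x i32> %rsrc,
                                             ptr addrspace(1) %out) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; 16 is uniform but fits the immediate offset range [0, 4095], so
  ; visitBufferIntrinsic returns false and the add stays in voffset.
  %voffset = add i32 %tid, 16
  %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %rsrc,
                                                   i32 %voffset, i32 0, i32 0)
  store i32 %val, ptr addrspace(1) %out
  ret void
}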
