Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 79 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,7 @@ class AMDGPUCodeGenPrepareImpl
bool visitIntrinsicInst(IntrinsicInst &I);
bool visitFMinLike(IntrinsicInst &I);
bool visitSqrt(IntrinsicInst &I);
bool visitBufferIntrinsic(IntrinsicInst &I);
bool run();
};

Expand Down Expand Up @@ -1910,6 +1911,15 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
return visitFMinLike(I);
case Intrinsic::sqrt:
return visitSqrt(I);
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_buffer_load_format:
case Intrinsic::amdgcn_raw_buffer_store:
case Intrinsic::amdgcn_raw_buffer_store_format:
case Intrinsic::amdgcn_raw_ptr_buffer_load:
case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
case Intrinsic::amdgcn_raw_ptr_buffer_store:
case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
return visitBufferIntrinsic(I);
default:
return false;
}
Expand Down Expand Up @@ -2046,6 +2056,75 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
return true;
}

/// Sink uniform addends in buffer address calculations into soffset.
///
/// Transforms buffer loads/stores with voffset = add(uniform, divergent)
/// into voffset = divergent, soffset = uniform for better address coalescing.
/// Only applies to raw buffer operations whose soffset is initially zero.
///
/// \returns true if the intrinsic's operands were rewritten.
bool AMDGPUCodeGenPrepareImpl::visitBufferIntrinsic(IntrinsicInst &I) {
  // Only reached from visitIntrinsicInst's switch, so the ID is guaranteed to
  // be one of the eight raw buffer load/store intrinsics; classifying stores
  // is sufficient to locate the offset operands.
  Intrinsic::ID IID = I.getIntrinsicID();
  bool IsStore = IID == Intrinsic::amdgcn_raw_buffer_store ||
                 IID == Intrinsic::amdgcn_raw_buffer_store_format ||
                 IID == Intrinsic::amdgcn_raw_ptr_buffer_store ||
                 IID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;

  // Buffer intrinsic operand layout (same for vector and pointer descriptor):
  //   Load:  (rsrc, voffset, soffset, cachepolicy)
  //   Store: (vdata, rsrc, voffset, soffset, cachepolicy)
  const unsigned VOffsetIdx = IsStore ? 2 : 1;
  const unsigned SOffsetIdx = IsStore ? 3 : 2;

  Value *VOffset = I.getArgOperand(VOffsetIdx);
  Value *SOffset = I.getArgOperand(SOffsetIdx);

  // Only optimize when soffset is currently zero.
  // TODO: Handle a pre-existing uniform soffset, and reassociate deeper
  // voffset chains such as ((divergent + uniform_a) + uniform_b) so that
  // every uniform addend can be sunk.
  if (!match(SOffset, m_Zero()))
    return false;

  // Pattern match: voffset = add(uniform, divergent).
  Value *LHS, *RHS;
  if (!match(VOffset, m_Add(m_Value(LHS), m_Value(RHS))))
    return false;

  bool LHSUniform = UA.isUniform(LHS);
  bool RHSUniform = UA.isUniform(RHS);

  // Need exactly one uniform and one divergent operand.
  // TODO: Handle the case where both are uniform.
  if (LHSUniform == RHSUniform)
    return false;

  Value *UniformAddend = LHSUniform ? LHS : RHS;
  Value *DivergentAddend = LHSUniform ? RHS : LHS;

  // Skip if the uniform addend is a non-negative constant that fits in the
  // 12-bit immediate offset field. The backend will fold it into the immediate
  // field, which avoids consuming an soffset operand. Negative or large
  // constants must use soffset.
  // TODO: The immediate-offset range varies per subtarget; query it (e.g. via
  // TargetLowering::isLegalAddressingMode) instead of hardcoding 12 bits.
  if (auto *CI = dyn_cast<ConstantInt>(UniformAddend)) {
    int64_t Offset = CI->getSExtValue();
    if (Offset >= 0 && Offset <= 4095)
      return false;
  }

  // The full instruction is printed, so load/store need not be spelled out.
  LLVM_DEBUG(dbgs() << "AMDGPUCodeGenPrepare: Sinking uniform addend into "
                       "soffset: "
                    << I << '\n');

  // Update voffset and soffset operands in place; the divergent part stays in
  // voffset while the uniform part moves to soffset.
  I.setArgOperand(VOffsetIdx, DivergentAddend);
  I.setArgOperand(SOffsetIdx, UniformAddend);

  return true;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
Expand Down
Loading