Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 56 additions & 11 deletions llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -840,8 +840,12 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
Offset = I->getOperand(OffsetIdx).getImm();
}

if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
const AMDGPU::GcnBufferFormatInfo *Info =
AMDGPU::getGcnBufferFormatInfo(Format, *LSO.STM);
EltSize = Info->BitsPerComp / 8;
}

Width = getOpcodeWidth(*I, *LSO.TII);

Expand Down Expand Up @@ -1049,24 +1053,44 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,

const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
if (!Info0)
return false;
const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
if (!Info1)
return false;

if (Info0->BitsPerComp != Info1->BitsPerComp ||
Info0->NumFormat != Info1->NumFormat)
return false;

// TODO: Should be possible to support more formats, but if format loads
// are not dword-aligned, the merged load might not be valid.
Comment on lines -1063 to -1064
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will your patch merge two 16-bit loads at offsets 2 and 4, into a single 32-bit load at offset 2?

If it does that then the merged load is not dword aligned. Is that allowed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, I understand your concern. I’ve verified section 9.5 “Alignment” of the RDNA 3 Shader Instruction Set Architecture manual, which states:

Formatted ops such as BUFFER_LOAD_FORMAT_* must be aligned as follows:
• 1-byte formats → 1-byte alignment
• 2-byte formats → 2-byte alignment
• 4-byte and larger formats → 4-byte alignment

I’ve therefore added an explicit alignment check and a new Lit test, gfx11_tbuffer_load_x_off2_off4_16bit_no_merge.

Reference:
https://www.amd.com/content/dam/amd/en/documents/radeon-tech-docs/instruction-set-architectures/rdna3-shader-instruction-set-architecture-feb-2023_0.pdf

if (Info0->BitsPerComp != 32)
// For 8-bit or 16-bit formats there is no 3-component variant.
// If NumCombinedComponents is 3, try the 4-component format and use XYZ.
// Example:
// tbuffer_load_format_x + tbuffer_load_format_x + tbuffer_load_format_x
// ==> tbuffer_load_format_xyz with format:[BUF_FMT_16_16_16_16_SNORM]
unsigned NumCombinedComponents = CI.Width + Paired.Width;
if (NumCombinedComponents == 3 && CI.EltSize <= 2)
NumCombinedComponents = 4;

if (getBufferFormatWithCompCount(CI.Format, NumCombinedComponents, STI) ==
0)
return false;

// Merge only when the two access ranges are strictly back-to-back,
// any gap or overlap can over-write data or leave holes.
unsigned ElemIndex0 = CI.Offset / CI.EltSize;
unsigned ElemIndex1 = Paired.Offset / Paired.EltSize;
if (ElemIndex0 + CI.Width != ElemIndex1 &&
ElemIndex1 + Paired.Width != ElemIndex0)
return false;

if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
// 1-byte formats require 1-byte alignment.
// 2-byte formats require 2-byte alignment.
// 4-byte and larger formats require 4-byte alignment.
unsigned MergedBytes = CI.EltSize * NumCombinedComponents;
unsigned RequiredAlign = std::min(MergedBytes, 4u);
unsigned MinOff = std::min(CI.Offset, Paired.Offset);
if (MinOff % RequiredAlign != 0)
return false;

return true;
}

uint32_t EltOffset0 = CI.Offset / CI.EltSize;
Expand Down Expand Up @@ -1596,8 +1620,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
if (Regs.VAddr)
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

// For 8-bit or 16-bit tbuffer formats there is no 3-component encoding.
// If the combined count is 3 (e.g. X+X+X or XY+X), promote to 4 components
// and use XYZ of XYZW to enable the merge.
unsigned NumCombinedComponents = CI.Width + Paired.Width;
if (NumCombinedComponents == 3 && CI.EltSize <= 2)
NumCombinedComponents = 4;
unsigned JoinedFormat =
getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
getBufferFormatWithCompCount(CI.Format, NumCombinedComponents, *STM);

// It shouldn't be possible to get this far if the two instructions
// don't have a single memoperand, because MachineInstr::mayAlias()
Expand Down Expand Up @@ -1639,8 +1669,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
if (Regs.VAddr)
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

// For 8-bit or 16-bit tbuffer formats there is no 3-component encoding.
// If the combined count is 3 (e.g. X+X+X or XY+X), promote to 4 components
// and use XYZ of XYZW to enable the merge.
unsigned NumCombinedComponents = CI.Width + Paired.Width;
if (NumCombinedComponents == 3 && CI.EltSize <= 2)
NumCombinedComponents = 4;
unsigned JoinedFormat =
getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
getBufferFormatWithCompCount(CI.Format, NumCombinedComponents, *STM);

// It shouldn't be possible to get this far if the two instructions
// don't have a single memoperand, because MachineInstr::mayAlias()
Expand Down Expand Up @@ -2353,6 +2389,15 @@ SILoadStoreOptimizer::collectMergeableInsts(
if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
continue;

if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
const MachineOperand *Fmt =
TII->getNamedOperand(MI, AMDGPU::OpName::format);
if (!AMDGPU::getGcnBufferFormatInfo(Fmt->getImm(), *STM)) {
LLVM_DEBUG(dbgs() << "Skip tbuffer with unknown format: " << MI);
continue;
}
}

CombineInfo CI;
CI.setMI(MI, *this);
CI.Order = Order++;
Expand Down
Loading