Skip to content

Commit 23a5a7b

Browse files
authored
[AMDGPU] Support merging 16-bit and 8-bit TBUFFER load/store instructions (#145078)
SILoadStoreOptimizer can now recognise consecutive 16-bit and 8-bit `TBUFFER_LOAD`/`TBUFFER_STORE` instructions that each write

* a single component (`X`), or
* two components (`XY`),

and fold them into the wider native variants:

```
X + X         --> XY
X + X + X + X --> XYZW
XY + XY       --> XYZW
X + X + X     --> XYZ
XY + X        --> XYZ
```

The optimisation cuts the number of TBUFFER instructions, shrinking code size and improving memory throughput.
1 parent 2738828 commit 23a5a7b

File tree

2 files changed

+674
-11
lines changed

2 files changed

+674
-11
lines changed

llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

Lines changed: 56 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -878,8 +878,12 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
878878
Offset = I->getOperand(OffsetIdx).getImm();
879879
}
880880

881-
if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
881+
if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
882882
Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
883+
const AMDGPU::GcnBufferFormatInfo *Info =
884+
AMDGPU::getGcnBufferFormatInfo(Format, *LSO.STM);
885+
EltSize = Info->BitsPerComp / 8;
886+
}
883887

884888
Width = getOpcodeWidth(*I, *LSO.TII);
885889

@@ -1087,24 +1091,44 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
10871091

10881092
const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
10891093
llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
1090-
if (!Info0)
1091-
return false;
10921094
const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
10931095
llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
1094-
if (!Info1)
1095-
return false;
10961096

10971097
if (Info0->BitsPerComp != Info1->BitsPerComp ||
10981098
Info0->NumFormat != Info1->NumFormat)
10991099
return false;
11001100

1101-
// TODO: Should be possible to support more formats, but if format loads
1102-
// are not dword-aligned, the merged load might not be valid.
1103-
if (Info0->BitsPerComp != 32)
1101+
// For 8-bit or 16-bit formats there is no 3-component variant.
1102+
// If NumCombinedComponents is 3, try the 4-component format and use XYZ.
1103+
// Example:
1104+
// tbuffer_load_format_x + tbuffer_load_format_x + tbuffer_load_format_x
1105+
// ==> tbuffer_load_format_xyz with format:[BUF_FMT_16_16_16_16_SNORM]
1106+
unsigned NumCombinedComponents = CI.Width + Paired.Width;
1107+
if (NumCombinedComponents == 3 && CI.EltSize <= 2)
1108+
NumCombinedComponents = 4;
1109+
1110+
if (getBufferFormatWithCompCount(CI.Format, NumCombinedComponents, STI) ==
1111+
0)
1112+
return false;
1113+
1114+
// Merge only when the two access ranges are strictly back-to-back,
1115+
// any gap or overlap can over-write data or leave holes.
1116+
unsigned ElemIndex0 = CI.Offset / CI.EltSize;
1117+
unsigned ElemIndex1 = Paired.Offset / Paired.EltSize;
1118+
if (ElemIndex0 + CI.Width != ElemIndex1 &&
1119+
ElemIndex1 + Paired.Width != ElemIndex0)
11041120
return false;
11051121

1106-
if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
1122+
// 1-byte formats require 1-byte alignment.
1123+
// 2-byte formats require 2-byte alignment.
1124+
// 4-byte and larger formats require 4-byte alignment.
1125+
unsigned MergedBytes = CI.EltSize * NumCombinedComponents;
1126+
unsigned RequiredAlign = std::min(MergedBytes, 4u);
1127+
unsigned MinOff = std::min(CI.Offset, Paired.Offset);
1128+
if (MinOff % RequiredAlign != 0)
11071129
return false;
1130+
1131+
return true;
11081132
}
11091133

11101134
uint32_t EltOffset0 = CI.Offset / CI.EltSize;
@@ -1634,8 +1658,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
16341658
if (Regs.VAddr)
16351659
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
16361660

1661+
// For 8-bit or 16-bit tbuffer formats there is no 3-component encoding.
1662+
// If the combined count is 3 (e.g. X+X+X or XY+X), promote to 4 components
1663+
// and use XYZ of XYZW to enable the merge.
1664+
unsigned NumCombinedComponents = CI.Width + Paired.Width;
1665+
if (NumCombinedComponents == 3 && CI.EltSize <= 2)
1666+
NumCombinedComponents = 4;
16371667
unsigned JoinedFormat =
1638-
getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1668+
getBufferFormatWithCompCount(CI.Format, NumCombinedComponents, *STM);
16391669

16401670
// It shouldn't be possible to get this far if the two instructions
16411671
// don't have a single memoperand, because MachineInstr::mayAlias()
@@ -1677,8 +1707,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
16771707
if (Regs.VAddr)
16781708
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
16791709

1710+
// For 8-bit or 16-bit tbuffer formats there is no 3-component encoding.
1711+
// If the combined count is 3 (e.g. X+X+X or XY+X), promote to 4 components
1712+
// and use XYZ of XYZW to enable the merge.
1713+
unsigned NumCombinedComponents = CI.Width + Paired.Width;
1714+
if (NumCombinedComponents == 3 && CI.EltSize <= 2)
1715+
NumCombinedComponents = 4;
16801716
unsigned JoinedFormat =
1681-
getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1717+
getBufferFormatWithCompCount(CI.Format, NumCombinedComponents, *STM);
16821718

16831719
// It shouldn't be possible to get this far if the two instructions
16841720
// don't have a single memoperand, because MachineInstr::mayAlias()
@@ -2413,6 +2449,15 @@ SILoadStoreOptimizer::collectMergeableInsts(
24132449
if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
24142450
continue;
24152451

2452+
if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
2453+
const MachineOperand *Fmt =
2454+
TII->getNamedOperand(MI, AMDGPU::OpName::format);
2455+
if (!AMDGPU::getGcnBufferFormatInfo(Fmt->getImm(), *STM)) {
2456+
LLVM_DEBUG(dbgs() << "Skip tbuffer with unknown format: " << MI);
2457+
continue;
2458+
}
2459+
}
2460+
24162461
CombineInfo CI;
24172462
CI.setMI(MI, *this);
24182463
CI.Order = Order++;

0 commit comments

Comments
 (0)