-
Notifications
You must be signed in to change notification settings - Fork 15.4k
[AMDGPU] Support merging 16-bit and 8-bit TBUFFER load/store instructions #145078
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 12 commits
8ec052a
eb78bbc
62cf51d
b85baa7
95484ef
760b507
151036a
d779672
8625df4
15a9d34
8cf32f0
4c52217
72bbd4c
0ccf934
dbe1b90
be722aa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -840,8 +840,16 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, | |
| Offset = I->getOperand(OffsetIdx).getImm(); | ||
| } | ||
|
|
||
| if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) | ||
| if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) { | ||
| Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm(); | ||
| const AMDGPU::GcnBufferFormatInfo *Info = | ||
| AMDGPU::getGcnBufferFormatInfo(Format, *LSO.STM); | ||
|
|
||
| // Use 2-byte element size if the tbuffer format is 16-bit. | ||
| // Use 1-byte element size if the tbuffer format is 8-bit. | ||
| if (Info) | ||
|
||
| EltSize = Info->BitsPerComp / 8; | ||
| } | ||
|
|
||
| Width = getOpcodeWidth(*I, *LSO.TII); | ||
|
|
||
|
|
@@ -1060,13 +1068,46 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, | |
| Info0->NumFormat != Info1->NumFormat) | ||
| return false; | ||
|
|
||
| // TODO: Should be possible to support more formats, but if format loads | ||
| // are not dword-aligned, the merged load might not be valid. | ||
|
||
| if (Info0->BitsPerComp != 32) | ||
| // For 8-bit or 16-bit formats there is no 3-component variant. | ||
| // If NumCombinedComponents is 3, try the 4-component format and use XYZ. | ||
| // Example: | ||
| // tbuffer_load_format_x + tbuffer_load_format_x + tbuffer_load_format_x | ||
| // ==> tbuffer_load_format_xyz with format:[BUF_FMT_16_16_16_16_SNORM] | ||
| unsigned NumCombinedComponents = CI.Width + Paired.Width; | ||
| unsigned CombinedBufferFormat = | ||
harrisonGPU marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| getBufferFormatWithCompCount(CI.Format, NumCombinedComponents, STI); | ||
| if (CombinedBufferFormat == 0 && NumCombinedComponents == 3) { | ||
| if (Info0->BitsPerComp == 8 || Info0->BitsPerComp == 16) { | ||
| unsigned TryFormat = getBufferFormatWithCompCount(CI.Format, 4, STI); | ||
| if (!TryFormat) | ||
| return false; | ||
| CombinedBufferFormat = TryFormat; | ||
| NumCombinedComponents = 4; | ||
| } | ||
| } | ||
|
|
||
| if (CombinedBufferFormat == 0) | ||
| return false; | ||
|
|
||
| // Merge only when the two access ranges are strictly back-to-back; | ||
| // any gap or overlap can overwrite data or leave holes. | ||
| unsigned BytePerComp = Info0->BitsPerComp / 8; | ||
harrisonGPU marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| unsigned ElemIndex0 = CI.Offset / BytePerComp; | ||
| unsigned ElemIndex1 = Paired.Offset / BytePerComp; | ||
| if (ElemIndex0 + CI.Width != ElemIndex1 && | ||
| ElemIndex1 + Paired.Width != ElemIndex0) | ||
| return false; | ||
|
|
||
| if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0) | ||
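| // The lower of the two offsets must be aligned to the merged access size, | ||
| // capped at 4 bytes: a 1-byte merged format requires 1-byte alignment, | ||
| // a 2-byte merged format requires 2-byte alignment, and 4-byte and larger | ||
| // merged formats require 4-byte alignment. | ||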
| unsigned MergedBytes = BytePerComp * NumCombinedComponents; | ||
| unsigned RequiredAlign = std::min(MergedBytes, 4u); | ||
| unsigned MinOff = std::min(CI.Offset, Paired.Offset); | ||
| if (MinOff % RequiredAlign != 0) | ||
| return false; | ||
|
|
||
| return true; | ||
| } | ||
|
|
||
| uint32_t EltOffset0 = CI.Offset / CI.EltSize; | ||
|
|
@@ -1596,8 +1637,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( | |
| if (Regs.VAddr) | ||
| MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); | ||
|
|
||
| // For 8-bit or 16-bit tbuffer formats there is no 3-component encoding. | ||
| // If the combined count is 3 (e.g. X+X+X or XY+X), promote to 4 components | ||
| // and use XYZ of XYZW to enable the merge. | ||
| unsigned NumCombinedComponents = CI.Width + Paired.Width; | ||
| if (NumCombinedComponents == 3 && (CI.EltSize == 1 || CI.EltSize == 2)) | ||
harrisonGPU marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| NumCombinedComponents = 4; | ||
| unsigned JoinedFormat = | ||
| getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); | ||
| getBufferFormatWithCompCount(CI.Format, NumCombinedComponents, *STM); | ||
|
|
||
| // It shouldn't be possible to get this far if the two instructions | ||
| // don't have a single memoperand, because MachineInstr::mayAlias() | ||
|
|
@@ -1639,8 +1686,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( | |
| if (Regs.VAddr) | ||
| MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); | ||
|
|
||
| // For 8-bit or 16-bit tbuffer formats there is no 3-component encoding. | ||
| // If the combined count is 3 (e.g. X+X+X or XY+X), promote to 4 components | ||
| // and use XYZ of XYZW to enable the merge. | ||
| unsigned NumCombinedComponents = CI.Width + Paired.Width; | ||
| if (NumCombinedComponents == 3 && (CI.EltSize == 1 || CI.EltSize == 2)) | ||
harrisonGPU marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| NumCombinedComponents = 4; | ||
| unsigned JoinedFormat = | ||
| getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); | ||
| getBufferFormatWithCompCount(CI.Format, NumCombinedComponents, *STM); | ||
|
|
||
| // It shouldn't be possible to get this far if the two instructions | ||
| // don't have a single memoperand, because MachineInstr::mayAlias() | ||
|
|
@@ -2353,6 +2406,19 @@ SILoadStoreOptimizer::collectMergeableInsts( | |
| if (Swizzled != -1 && MI.getOperand(Swizzled).getImm()) | ||
| continue; | ||
|
|
||
| if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) { | ||
| const MachineOperand *Fmt = | ||
| TII->getNamedOperand(MI, AMDGPU::OpName::format); | ||
| if (!Fmt) { | ||
| LLVM_DEBUG(dbgs() << "Skip tbuffer without format operand: " << MI); | ||
harrisonGPU marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| continue; | ||
| } | ||
| if (!AMDGPU::getGcnBufferFormatInfo(Fmt->getImm(), *STM)) { | ||
| LLVM_DEBUG(dbgs() << "Skip tbuffer with unknown format: " << MI); | ||
| continue; | ||
| } | ||
| } | ||
|
|
||
| CombineInfo CI; | ||
| CI.setMI(MI, *this); | ||
| CI.Order = Order++; | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.