@@ -878,8 +878,12 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
     Offset = I->getOperand(OffsetIdx).getImm();
   }

-  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
+  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
     Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
+    const AMDGPU::GcnBufferFormatInfo *Info =
+        AMDGPU::getGcnBufferFormatInfo(Format, *LSO.STM);
+    EltSize = Info->BitsPerComp / 8;
+  }

   Width = getOpcodeWidth(*I, *LSO.TII);

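The hunk above makes CombineInfo::setMI derive EltSize for tbuffer loads and stores from the buffer format's component width instead of assuming dword elements. A minimal standalone sketch of that mapping (plain C++, not LLVM code; FormatInfo is a hypothetical stand-in for AMDGPU::GcnBufferFormatInfo):

```cpp
// Illustration only: element size in bytes follows the format's BitsPerComp,
// which is what the later offset arithmetic in offsetsCanBeCombined relies on.
#include <cassert>

struct FormatInfo {       // hypothetical stand-in for GcnBufferFormatInfo
  unsigned BitsPerComp;   // 8, 16, or 32 bits per component
};

unsigned eltSizeInBytes(const FormatInfo &Info) {
  return Info.BitsPerComp / 8;  // e.g. a 16_16 format -> 2-byte elements
}

int main() {
  assert(eltSizeInBytes({8}) == 1);   // 8-bit formats -> 1-byte elements
  assert(eltSizeInBytes({16}) == 2);  // 16-bit formats -> 2-byte elements
  assert(eltSizeInBytes({32}) == 4);  // 32-bit formats -> 4-byte elements
  return 0;
}
```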
@@ -1087,24 +1091,44 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,

     const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
         llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
-    if (!Info0)
-      return false;
     const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
         llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
-    if (!Info1)
-      return false;

     if (Info0->BitsPerComp != Info1->BitsPerComp ||
         Info0->NumFormat != Info1->NumFormat)
       return false;

-    // TODO: Should be possible to support more formats, but if format loads
-    // are not dword-aligned, the merged load might not be valid.
-    if (Info0->BitsPerComp != 32)
+    // For 8-bit or 16-bit formats there is no 3-component variant.
+    // If NumCombinedComponents is 3, try the 4-component format and use XYZ.
+    // Example:
+    //   tbuffer_load_format_x + tbuffer_load_format_x + tbuffer_load_format_x
+    //   ==> tbuffer_load_format_xyz with format:[BUF_FMT_16_16_16_16_SNORM]
+    unsigned NumCombinedComponents = CI.Width + Paired.Width;
+    if (NumCombinedComponents == 3 && CI.EltSize <= 2)
+      NumCombinedComponents = 4;
+
+    if (getBufferFormatWithCompCount(CI.Format, NumCombinedComponents, STI) ==
+        0)
+      return false;
+
+    // Merge only when the two access ranges are strictly back-to-back;
+    // any gap or overlap could overwrite data or leave holes.
+    unsigned ElemIndex0 = CI.Offset / CI.EltSize;
+    unsigned ElemIndex1 = Paired.Offset / Paired.EltSize;
+    if (ElemIndex0 + CI.Width != ElemIndex1 &&
+        ElemIndex1 + Paired.Width != ElemIndex0)
       return false;

-    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
+    // 1-byte formats require 1-byte alignment.
+    // 2-byte formats require 2-byte alignment.
+    // 4-byte and larger formats require 4-byte alignment.
+    unsigned MergedBytes = CI.EltSize * NumCombinedComponents;
+    unsigned RequiredAlign = std::min(MergedBytes, 4u);
+    unsigned MinOff = std::min(CI.Offset, Paired.Offset);
+    if (MinOff % RequiredAlign != 0)
       return false;
+
+    return true;
   }

   uint32_t EltOffset0 = CI.Offset / CI.EltSize;
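To make the new adjacency and alignment checks concrete, here is a standalone restatement of the arithmetic added above, using plain integers instead of CombineInfo and omitting the getBufferFormatWithCompCount validity lookup (illustration only, not LLVM code):

```cpp
// Re-statement of the adjacency + alignment checks from the hunk above.
#include <algorithm>
#include <cassert>

bool canCombine(unsigned Off0, unsigned Width0, unsigned Off1, unsigned Width1,
                unsigned EltSize) {
  unsigned NumCombined = Width0 + Width1;
  if (NumCombined == 3 && EltSize <= 2)  // no 3-component 8/16-bit format
    NumCombined = 4;                     // promote to XYZW, use XYZ

  // Ranges must be strictly back-to-back (no gap, no overlap).
  unsigned Elem0 = Off0 / EltSize, Elem1 = Off1 / EltSize;
  if (Elem0 + Width0 != Elem1 && Elem1 + Width1 != Elem0)
    return false;

  // 1-/2-byte formats need 1-/2-byte alignment, wider results need 4 bytes.
  unsigned MergedBytes = EltSize * NumCombined;
  unsigned RequiredAlign = std::min(MergedBytes, 4u);
  return std::min(Off0, Off1) % RequiredAlign == 0;
}

int main() {
  // Two XY accesses of a 16-bit format at offsets 0 and 4: adjacent, aligned.
  assert(canCombine(0, 2, 4, 2, 2));
  // Same accesses at offsets 0 and 6: a 2-byte hole, so no merge.
  assert(!canCombine(0, 2, 6, 2, 2));
  // Offsets 2 and 6: adjacent, but only 2-byte aligned for an 8-byte result.
  assert(!canCombine(2, 2, 6, 2, 2));
  return 0;
}
```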
@@ -1634,8 +1658,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
   if (Regs.VAddr)
     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

+  // For 8-bit or 16-bit tbuffer formats there is no 3-component encoding.
+  // If the combined count is 3 (e.g. X+X+X or XY+X), promote to 4 components
+  // and use the XYZ components of XYZW to enable the merge.
+  unsigned NumCombinedComponents = CI.Width + Paired.Width;
+  if (NumCombinedComponents == 3 && CI.EltSize <= 2)
+    NumCombinedComponents = 4;
   unsigned JoinedFormat =
-      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
+      getBufferFormatWithCompCount(CI.Format, NumCombinedComponents, *STM);

   // It shouldn't be possible to get this far if the two instructions
   // don't have a single memoperand, because MachineInstr::mayAlias()
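The promotion above exists because 8-bit and 16-bit GCN buffer formats only come in 1-, 2-, and 4-component encodings. A toy sketch of the lookup this feeds into (the table below is a hypothetical stand-in for getBufferFormatWithCompCount in AMDGPUBaseInfo, not the real implementation):

```cpp
// Illustration only: why a 3-component request on a 16-bit format must be
// promoted to 4 components before the joined-format lookup can succeed.
#include <cassert>

// Returns a non-zero "format id" if a variant with NumComps exists, else 0.
unsigned formatWithCompCount(unsigned BitsPerComp, unsigned NumComps) {
  if (BitsPerComp == 32)                  // 32-bit: X, XY, XYZ, XYZW all exist
    return NumComps >= 1 && NumComps <= 4 ? NumComps : 0;
  // 8-bit and 16-bit: only X, XY, and XYZW encodings exist.
  return (NumComps == 1 || NumComps == 2 || NumComps == 4) ? NumComps : 0;
}

int main() {
  unsigned Width0 = 2, Width1 = 1;        // e.g. merging XY + X
  unsigned EltSize = 2;                   // 16 bits per component
  unsigned NumCombined = Width0 + Width1; // 3
  if (NumCombined == 3 && EltSize <= 2)
    NumCombined = 4;                      // promote, use XYZ of XYZW
  assert(formatWithCompCount(16, 3) == 0);           // no 16_16_16 variant
  assert(formatWithCompCount(16, NumCombined) != 0); // 16_16_16_16 exists
  return 0;
}
```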
@@ -1677,8 +1707,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
   if (Regs.VAddr)
     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

+  // For 8-bit or 16-bit tbuffer formats there is no 3-component encoding.
+  // If the combined count is 3 (e.g. X+X+X or XY+X), promote to 4 components
+  // and use the XYZ components of XYZW to enable the merge.
+  unsigned NumCombinedComponents = CI.Width + Paired.Width;
+  if (NumCombinedComponents == 3 && CI.EltSize <= 2)
+    NumCombinedComponents = 4;
   unsigned JoinedFormat =
-      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
+      getBufferFormatWithCompCount(CI.Format, NumCombinedComponents, *STM);

   // It shouldn't be possible to get this far if the two instructions
   // don't have a single memoperand, because MachineInstr::mayAlias()
@@ -2413,6 +2449,15 @@ SILoadStoreOptimizer::collectMergeableInsts(
     if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
       continue;

+    if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
+      const MachineOperand *Fmt =
+          TII->getNamedOperand(MI, AMDGPU::OpName::format);
+      if (!AMDGPU::getGcnBufferFormatInfo(Fmt->getImm(), *STM)) {
+        LLVM_DEBUG(dbgs() << "Skip tbuffer with unknown format: " << MI);
+        continue;
+      }
+    }
+
     CombineInfo CI;
     CI.setMI(MI, *this);
     CI.Order = Order++;
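Worth noting: this early filter appears to be what allows the earlier hunks to drop the !Info0/!Info1 null checks in offsetsCanBeCombined and to dereference the getGcnBufferFormatInfo result directly in setMI, since a tbuffer instruction with an unrecognized format never becomes a CombineInfo candidate in the first place.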