@@ -260,6 +260,7 @@ class AMDGPUCodeGenPrepareImpl
   bool visitIntrinsicInst(IntrinsicInst &I);
   bool visitFMinLike(IntrinsicInst &I);
   bool visitSqrt(IntrinsicInst &I);
+  bool visitBufferIntrinsic(IntrinsicInst &I);
   bool run();
 };
 
@@ -1910,6 +1911,15 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
     return visitFMinLike(I);
   case Intrinsic::sqrt:
     return visitSqrt(I);
+  case Intrinsic::amdgcn_raw_buffer_load:
+  case Intrinsic::amdgcn_raw_buffer_load_format:
+  case Intrinsic::amdgcn_raw_buffer_store:
+  case Intrinsic::amdgcn_raw_buffer_store_format:
+  case Intrinsic::amdgcn_raw_ptr_buffer_load:
+  case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
+  case Intrinsic::amdgcn_raw_ptr_buffer_store:
+  case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
+    return visitBufferIntrinsic(I);
   default:
     return false;
   }
@@ -2046,6 +2056,86 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
   return true;
 }
 
+/// Sink uniform addends in buffer address calculations into soffset.
+///
+/// Transforms buffer loads/stores with voffset = add(uniform, divergent)
+/// into voffset = divergent, soffset = uniform for better address coalescing.
+/// Only applies to raw buffer operations with soffset initially zero.
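+///
+/// For example (illustrative IR; value names are placeholders):
+///   %voff = add i32 %uniform, %divergent
+///   %v = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(
+///            ptr addrspace(8) %rsrc, i32 %voff, i32 0, i32 0)
+/// becomes
+///   %v = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(
+///            ptr addrspace(8) %rsrc, i32 %divergent, i32 %uniform, i32 0)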
+bool AMDGPUCodeGenPrepareImpl::visitBufferIntrinsic(IntrinsicInst &I) {
+  Intrinsic::ID IID = I.getIntrinsicID();
+  bool IsLoad = (IID == Intrinsic::amdgcn_raw_buffer_load ||
+                 IID == Intrinsic::amdgcn_raw_buffer_load_format ||
+                 IID == Intrinsic::amdgcn_raw_ptr_buffer_load ||
+                 IID == Intrinsic::amdgcn_raw_ptr_buffer_load_format);
+  bool IsStore = (IID == Intrinsic::amdgcn_raw_buffer_store ||
+                  IID == Intrinsic::amdgcn_raw_buffer_store_format ||
+                  IID == Intrinsic::amdgcn_raw_ptr_buffer_store ||
+                  IID == Intrinsic::amdgcn_raw_ptr_buffer_store_format);
+
+  if (!IsLoad && !IsStore)
+    return false;
+
+  // Buffer intrinsic operand layout (same for vector and pointer descriptor):
+  //   Load:  (rsrc, voffset, soffset, cachepolicy)
+  //   Store: (vdata, rsrc, voffset, soffset, cachepolicy)
+  const unsigned VOffsetIdx = IsStore ? 2 : 1;
+  const unsigned SOffsetIdx = IsStore ? 3 : 2;
+
+  Value *VOffset = I.getArgOperand(VOffsetIdx);
+  Value *SOffset = I.getArgOperand(SOffsetIdx);
+
+  // Only optimize when soffset is currently zero.
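+  // A non-zero soffset is already occupied; merging the uniform addend into
+  // it would require emitting an extra add.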
+  if (!match(SOffset, m_Zero()))
+    return false;
+
+  // Pattern match: voffset = add(uniform, divergent).
+  Value *LHS, *RHS;
+  if (!match(VOffset, m_Add(m_Value(LHS), m_Value(RHS))))
+    return false;
+
+  bool LHSUniform = UA.isUniform(LHS);
+  bool RHSUniform = UA.isUniform(RHS);
+
+  // Need exactly one uniform and one divergent operand.
+  // TODO: Handle the case where both are uniform.
+  if (LHSUniform == RHSUniform)
+    return false;
+
+  Value *UniformAddend = LHSUniform ? LHS : RHS;
+  Value *DivergentAddend = LHSUniform ? RHS : LHS;
+
+  // Skip if the uniform addend is a non-negative constant that fits in the
+  // 12-bit immediate offset field. The backend will fold it into the
+  // immediate field, which avoids consuming an soffset operand. Negative or
+  // large constants must use soffset.
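+  // E.g. add(i32 16, %div) is left alone (16 fits in the immediate field),
+  // while add(i32 8192, %div) and add(i32 -4, %div) are sunk into soffset.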
+  if (auto *CI = dyn_cast<ConstantInt>(UniformAddend)) {
+    int64_t Offset = CI->getSExtValue();
+    if (Offset >= 0 && Offset <= 4095)
+      return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "AMDGPUCodeGenPrepare: Sinking uniform addend into "
+                       "soffset for buffer "
+                    << (IsStore ? "store" : "load") << ": " << I << '\n');
+
+  // Clone the instruction and insert the copy before the original.
+  CallInst *NewCall = cast<CallInst>(I.clone());
+  NewCall->insertBefore(I.getIterator());
+
+  // Update the voffset and soffset operands.
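+  // The hardware sums voffset, soffset, and the immediate offset, so the
+  // effective address is unchanged.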
+  NewCall->setArgOperand(VOffsetIdx, DivergentAddend);
+  NewCall->setArgOperand(SOffsetIdx, UniformAddend);
+
+  // Replace all uses of the old instruction (loads only; stores produce no
+  // value) and erase it.
+  if (IsLoad) {
+    NewCall->takeName(&I);
+    I.replaceAllUsesWith(NewCall);
+  }
+  I.eraseFromParent();
+
+  return true;
+}
+
 bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
   if (skipFunction(F))
     return false;