7 changes: 7 additions & 0 deletions llvm/docs/AMDGPUUsage.rst
@@ -6405,6 +6405,13 @@ operations.
``buffer/global/flat_load/store/atomic`` instructions to global memory are
termed vector memory operations.

``global_load_lds`` or ``buffer/global_load`` instructions with the ``lds`` flag
are LDS DMA loads. They interact with caches as if the loaded data were
being loaded to registers rather than to LDS, and therefore support the same
cache modifiers. They cannot be performed atomically. They implement volatile
(via aux/cpol bit 31) and nontemporal (via metadata) semantics as if they were
loads from the global address space.

Private address space uses ``buffer_load/store`` using the scratch V#
(GFX6-GFX8), or ``scratch_load/store`` (GFX9-GFX11). Since only a single thread
is accessing the memory, atomic memory orderings are not meaningful, and all
3 changes: 2 additions & 1 deletion llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2829,7 +2829,8 @@ class AMDGPUGlobalLoadLDS :
llvm_i32_ty, // imm offset (applied to both global and LDS address)
llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0,
// bit 1 = sc1,
// bit 4 = scc))
// bit 4 = scc,
// bit 31 = volatile (compiler implemented)))
Contributor:

We ought to get this in the AMDGPUUsage documentation for intrinsics (and we really should get tablegenerated docs for this)

Contributor Author:

Yeah ... should I make a table of aux bits in this PR?

[IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree],
"", [SDNPMemOperand]>;
16 changes: 12 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3446,10 +3446,14 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
: 0); // swz

MachineMemOperand *LoadMMO = *MI.memoperands_begin();
// Don't set the offset value here because the pointer points to the base of
// the buffer.
MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();

MachinePointerInfo StorePtrI = LoadPtrI;
StorePtrI.V = nullptr;
LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
AMDGPUAS::BUFFER_RESOURCE));
LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;

auto F = LoadMMO->getFlags() &
@@ -3627,13 +3631,17 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
if (isSGPR(Addr))
MIB.addReg(VOffset);

MIB.add(MI.getOperand(4)) // offset
.add(MI.getOperand(5)); // cpol
MIB.add(MI.getOperand(4)); // offset

unsigned Aux = MI.getOperand(5).getImm();
MIB.addImm(Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol

MachineMemOperand *LoadMMO = *MI.memoperands_begin();
MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
LoadPtrI.Offset = MI.getOperand(4).getImm();
MachinePointerInfo StorePtrI = LoadPtrI;
LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
AMDGPUAS::GLOBAL_ADDRESS));
LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
auto F = LoadMMO->getFlags() &
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/SIDefines.h
@@ -423,6 +423,9 @@ enum CPol {
// Volatile (used to preserve/signal operation volatility for buffer
// operations, not a real instruction bit)
VOLATILE = 1 << 31,
// The set of "cache policy" bits used for compiler features that
// do not correspond to hardware features.
VIRTUAL_BITS = VOLATILE,
};

} // namespace CPol
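
A minimal sketch of how lowering code is expected to consume this mask,
mirroring the `Aux & ~AMDGPU::CPol::VIRTUAL_BITS` expressions in the
selection changes below (the helper name is hypothetical):

  // Hypothetical helper: split an auxiliary/cachepolicy immediate into the
  // bits encoded in the instruction (sc0/sc1/scc, ...) and the
  // compiler-only bits such as VOLATILE.
  static std::pair<unsigned, unsigned> splitCPol(unsigned Aux) {
    unsigned HW = Aux & ~AMDGPU::CPol::VIRTUAL_BITS;
    unsigned Virtual = Aux & AMDGPU::CPol::VIRTUAL_BITS;
    return {HW, Virtual};
  }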
13 changes: 10 additions & 3 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1652,6 +1652,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
Info.ptrVal = CI.getArgOperand(1);
Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
Info.flags |= MachineMemOperand::MOVolatile;
return true;
}
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
@@ -11237,8 +11240,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,

MachinePointerInfo StorePtrI = LoadPtrI;
LoadPtrI.V = PoisonValue::get(
PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
PointerType::get(*DAG.getContext(), AMDGPUAS::BUFFER_RESOURCE));
LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;

auto F = LoadMMO->getFlags() &
@@ -11325,7 +11328,11 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
}

Ops.push_back(Op.getOperand(5)); // Offset
Ops.push_back(Op.getOperand(6)); // CPol

unsigned Aux = Op.getConstantOperandVal(6);
Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
MVT::i32)); // CPol

Ops.push_back(M0Val.getValue(0)); // Chain
Ops.push_back(M0Val.getValue(1)); // Glue

63 changes: 53 additions & 10 deletions llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -25,6 +25,7 @@
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/TargetParser/TargetParser.h"

@@ -276,6 +277,12 @@ class SIMemOpAccess final {
/// rmw operation, "std::nullopt" otherwise.
std::optional<SIMemOpInfo>
getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;

/// \returns DMA to LDS info if \p MI is a direct-to/from-LDS load/store,
/// along with an indication of whether this is a load or store. If it is not
/// a direct-to-LDS operation, returns std::nullopt.
std::optional<SIMemOpInfo>
getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
@@ -691,6 +698,9 @@ class SIMemoryLegalizer final {
/// instructions are added/deleted or \p MI is modified, false otherwise.
bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI);
/// Expands LDS DMA operation \p MI. Returns true if instructions are
/// added/deleted or \p MI is modified, false otherwise.
bool expandLDSDMA(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI);

public:
SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {};
@@ -820,6 +830,9 @@ SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
return SIAtomicAddrSpace::SCRATCH;
if (AS == AMDGPUAS::REGION_ADDRESS)
return SIAtomicAddrSpace::GDS;
if (AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
AS == AMDGPUAS::BUFFER_STRIDED_POINTER)
return SIAtomicAddrSpace::GLOBAL;

return SIAtomicAddrSpace::OTHER;
}
@@ -975,6 +988,16 @@ std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
return constructFromMIWithMMO(MI);
}

std::optional<SIMemOpInfo>
SIMemOpAccess::getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const {
assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

if (!SIInstrInfo::isLDSDMA(*MI))
return std::nullopt;

return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
TII = ST.getInstrInfo();
IV = getIsaVersion(ST.getCPU());
@@ -1078,7 +1101,7 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
// Only handle load and store, not atomic read-modify-write instructions. The
// latter use glc to indicate if the atomic returns a result and so must not
// be used for cache control.
assert(MI->mayLoad() ^ MI->mayStore());
assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));

// Only update load and store, not LLVM IR atomic read-modify-write
// instructions. The latter are always marked as volatile so cannot sensibly
@@ -1405,7 +1428,7 @@ bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
// Only handle load and store, not atomic read-modify-write instructions. The
// latter use glc to indicate if the atomic returns a result and so must not
// be used for cache control.
assert(MI->mayLoad() ^ MI->mayStore());
assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));

// Only update load and store, not LLVM IR atomic read-modify-write
// instructions. The latter are always marked as volatile so cannot sensibly
@@ -1707,7 +1730,7 @@ bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
// Only handle load and store, not atomic read-modify-write instructions. The
// latter use glc to indicate if the atomic returns a result and so must not
// be used for cache control.
assert(MI->mayLoad() ^ MI->mayStore());
assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));

// Only update load and store, not LLVM IR atomic read-modify-write
// instructions. The latter are always marked as volatile so cannot sensibly
@@ -1940,7 +1963,7 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
// Only handle load and store, not atomic read-modify-write instructions. The
// latter use glc to indicate if the atomic returns a result and so must not
// be used for cache control.
assert(MI->mayLoad() ^ MI->mayStore());
assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));

// Only update load and store, not LLVM IR atomic read-modify-write
// instructions. The latter are always marked as volatile so cannot sensibly
@@ -2236,7 +2259,7 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
// Only handle load and store, not atomic read-modify-write instructions. The
// latter use glc to indicate if the atomic returns a result and so must not
// be used for cache control.
assert(MI->mayLoad() ^ MI->mayStore());
assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));

// Only update load and store, not LLVM IR atomic read-modify-write
// instructions. The latter are always marked as volatile so cannot sensibly
@@ -2575,7 +2598,7 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {

// Only handle load and store, not atomic read-modify-write instructions.
assert(MI->mayLoad() ^ MI->mayStore());
assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));

// Only update load and store, not LLVM IR atomic read-modify-write
// instructions. The latter are always marked as volatile so cannot sensibly
@@ -2891,6 +2914,23 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
return Changed;
}

bool SIMemoryLegalizer::expandLDSDMA(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI) {
assert(MI->mayLoad() && MI->mayStore());

// The volatility or nontemporal-ness of the operation is a
// function of the global memory, not the LDS.
SIMemOp OpKind =
SIInstrInfo::mayWriteLDSThroughDMA(*MI) ? SIMemOp::LOAD : SIMemOp::STORE;

// Handle volatile and/or nontemporal markers on direct-to-LDS loads and
// stores. The operation is treated as a volatile/nontemporal store
// to its second argument.
return CC->enableVolatileAndOrNonTemporal(
MI, MOI.getInstrAddrSpace(), OpKind, MOI.isVolatile(),
MOI.isNonTemporal(), MOI.isLastUse());
}

bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) {
const MachineModuleInfo &MMI =
getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
@@ -2942,14 +2982,17 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) {
if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
continue;

if (const auto &MOI = MOA.getLoadInfo(MI))
if (const auto &MOI = MOA.getLoadInfo(MI)) {
Changed |= expandLoad(*MOI, MI);
else if (const auto &MOI = MOA.getStoreInfo(MI)) {
} else if (const auto &MOI = MOA.getStoreInfo(MI)) {
Changed |= expandStore(*MOI, MI);
} else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
} else if (const auto &MOI = MOA.getLDSDMAInfo(MI)) {
Changed |= expandLDSDMA(*MOI, MI);
} else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) {
Changed |= expandAtomicFence(*MOI, MI);
else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
} else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) {
Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
}
}
}

@@ -3611,10 +3611,10 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) {
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0 glc
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: buffer_load_ushort v6, off, s[16:19], 0 offset:4 glc
; SDAG-NEXT: s_waitcnt vmcnt(1)
; SDAG-NEXT: v_lshrrev_b32_e32 v7, 8, v0
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_lshrrev_b32_e32 v7, 8, v0
; SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v6
; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1]
; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0
@@ -3627,12 +3627,12 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) {
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0 glc
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:4 glc
; GISEL-NEXT: s_waitcnt vmcnt(1)
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v4
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -3652,6 +3652,7 @@ define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; SDAG-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: buffer_store_short v4, off, s[16:19], 0 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
Expand All @@ -3671,6 +3672,7 @@ define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GISEL-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: buffer_store_short v2, off, s[16:19], 0 offset:4
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]