36 changes: 26 additions & 10 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -6803,13 +6803,38 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
unsigned Size = Ty.getSizeInBits();
MachineFunction &MF = B.getMF();
unsigned Opc = 0;

const unsigned MemSize = (Size + 7) / 8;
const Align MemAlign = B.getDataLayout().getABITypeAlign(
getTypeForLLT(Ty, MF.getFunction().getContext()));

// FIXME: When intrinsic definition is fixed, this should have an MMO already.
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo(),
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
MemSize, MemAlign);

if (Size < 32 && ST.hasScalarSubwordLoads()) {
assert(Size == 8 || Size == 16);
if (!ST.hasScalarSubwordBufferLoads()) {
// Fall back to llvm.amdgcn.raw.buffer.load (selected as BUFFER_LOAD_UBYTE/USHORT).
MI.getOperand(1).setIntrinsicID(Intrinsic::amdgcn_raw_buffer_load);

Register Zero = B.buildConstant(S32, 0).getReg(0);
MI.insert(MI.operands_begin() + 4,
MachineOperand::CreateReg(Zero, false));

MI.addMemOperand(MF, MMO);
Observer.changedInstr(MI);
return true;
}

Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
: AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
// The 8-bit and 16-bit scalar buffer load instructions have a 32-bit
// destination register.
Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
Dst = B.getMRI()->createGenericVirtualRegister(S32);
} else {
Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
Dst = OrigDst;
@@ -6834,15 +6859,6 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
MI.setDesc(B.getTII().get(Opc));
MI.removeOperand(1); // Remove intrinsic ID

// FIXME: When intrinsic definition is fixed, this should have an MMO already.
const unsigned MemSize = (Size + 7) / 8;
const Align MemAlign = B.getDataLayout().getABITypeAlign(
getTypeForLLT(Ty, MF.getFunction().getContext()));
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo(),
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
MemSize, MemAlign);
MI.addMemOperand(MF, MMO);
if (Dst != OrigDst) {
MI.getOperand(0).setReg(Dst);
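To spell out what the fallback in the legalizer hunk above does at the operand level, here is a short sketch. The operand indices are the editor's reading of the intrinsic signatures `llvm.amdgcn.s.buffer.load(rsrc, offset, cachepolicy)` and `llvm.amdgcn.raw.buffer.load(rsrc, voffset, soffset, cachepolicy)`, not lines quoted from the patch, and the snippet reuses the surrounding function's `MI`, `B`, `MF`, `MMO` and `S32` values rather than standing alone.

// Generic intrinsic operands before the rewrite:
//   [0] dst  [1] intrinsic-id  [2] rsrc  [3] offset  [4] cachepolicy
// llvm.amdgcn.raw.buffer.load expects:
//   [0] dst  [1] intrinsic-id  [2] rsrc  [3] voffset  [4] soffset  [5] cachepolicy
// Re-tag the intrinsic and insert soffset = 0 at index 4, so the original
// scalar offset is reused as the voffset; attach the MMO so instruction
// selection keeps the sub-dword size, alignment and invariance.
MI.getOperand(1).setIntrinsicID(Intrinsic::amdgcn_raw_buffer_load);
Register Zero = B.buildConstant(S32, 0).getReg(0);
MI.insert(MI.operands_begin() + 4,
          MachineOperand::CreateReg(Zero, /*isDef=*/false));
MI.addMemOperand(MF, MMO);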
41 changes: 31 additions & 10 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1406,16 +1406,37 @@ bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
if (i != 0)
BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);

B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
.addDef(LoadParts[i]) // vdata
.addUse(RSrc) // rsrc
.addUse(VIndex) // vindex
.addUse(VOffset) // voffset
.addUse(SOffset) // soffset
.addImm(ImmOffset + 16 * i) // offset(imm)
.addImm(0) // cachepolicy, swizzled buffer(imm)
.addImm(0) // idxen(imm)
.addMemOperand(MMO);
unsigned Opc;
switch (MI.getOpcode()) {
case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
break;
case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
break;
case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE;
break;
case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
break;
case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT:
Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT;
break;
default:
llvm_unreachable("Unexpected opcode");
}

B.buildInstr(Opc)
.addDef(LoadParts[i]) // vdata
.addUse(RSrc) // rsrc
.addUse(VIndex) // vindex
.addUse(VOffset) // voffset
.addUse(SOffset) // soffset
.addImm(ImmOffset + 16 * i) // offset(imm)
.addImm(0) // cachepolicy, swizzled buffer(imm)
.addImm(0) // idxen(imm)
.addMemOperand(MMO);
}

// TODO: If only the resource is a VGPR, it may be better to execute the
18 changes: 18 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -51,6 +51,11 @@ static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
cl::desc("Enable the use of AA during codegen."),
cl::init(true));

static cl::opt<bool> UseGFX12SubwordSBufferLoad(
"amdgpu-use-gfx12-subword-sbuffer-load",
cl::desc("Enable the use of s_buffer_load_(i/u)(8/16) instructions."),
cl::init(false));

static cl::opt<unsigned>
NSAThreshold("amdgpu-nsa-threshold",
cl::desc("Number of addresses from which to enable MIMG NSA."),
@@ -348,6 +353,19 @@ void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
}
}

bool GCNSubtarget::hasScalarSubwordBufferLoads() const {
Generation Gen = getGeneration();

// On gfx12, s_buffer_load_(i/u)(8/16) have a hw-bug that is triggered when:
// * the stride is not a multiple of 4, or
// * the stride is 0 and the num-records is not a multiple of 4
// Avoid these instructions unless the frontend explicitly specifies that the
// input buffers are known to not trigger the bug.
if (Gen == GFX12)
return UseGFX12SubwordSBufferLoad;
return hasScalarSubwordLoads();
}

bool GCNSubtarget::hasMadF16() const {
return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}
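As a compact restatement of the gating introduced above, the following standalone sketch (editor's illustration, not patch code) shows how the new hook is expected to resolve. `OptIn` stands for the `-amdgpu-use-gfx12-subword-sbuffer-load` flag and `Generation` is a simplified stand-in for `getGeneration()`.

#include <cassert>

enum class Generation { PreGFX12, GFX12 };

// Mirrors GCNSubtarget::hasScalarSubwordBufferLoads() over simplified inputs.
static bool canUseScalarSubwordSBufferLoads(Generation Gen, bool OptIn) {
  // s_buffer_load_{i,u}{8,16} do not exist before gfx12.
  if (Gen == Generation::PreGFX12)
    return false;
  // On gfx12 they exist but are avoided unless the frontend opts in, because
  // of the stride/num-records hardware bug described in the comment above.
  return OptIn;
}

int main() {
  assert(!canUseScalarSubwordSBufferLoads(Generation::PreGFX12, true));
  assert(!canUseScalarSubwordSBufferLoads(Generation::GFX12, /*OptIn=*/false));
  assert(canUseScalarSubwordSBufferLoads(Generation::GFX12, /*OptIn=*/true));
}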
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -466,6 +466,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,

bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }

bool hasScalarSubwordBufferLoads() const;

TrapHandlerAbi getTrapHandlerAbi() const {
return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
}
81 changes: 43 additions & 38 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6438,7 +6438,7 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
MachineMemOperand::MOInvariant,
VT.getStoreSize(), Alignment);
SDValue LoadVal;
if (!Offset->isDivergent()) {
if (!Offset->isDivergent() && Subtarget->hasScalarSubwordBufferLoads()) {
SDValue Ops[] = {Rsrc, // source register
Offset, CachePolicy};
SDValue BufferLoad =
@@ -8367,52 +8367,57 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
MachineMemOperand::MOInvariant,
VT.getStoreSize(), Alignment);

if (!Offset->isDivergent()) {
SDValue Ops[] = {Rsrc, Offset, CachePolicy};

// Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
// s_buffer_load_u16 instruction is emitted for both signed and unsigned
// loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
// and generates s_buffer_load_i16 (performSignExtendInRegCombine).
if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
SDValue BufferLoad =
DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
DAG.getVTList(MVT::i32), Ops, VT, MMO);
// We have a divergent offset. Emit a MUBUF buffer load instead. We can
// assume that the buffer is unswizzled.
SDValue BufferLoadOps[] = {
DAG.getEntryNode(), // Chain
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
{}, // voffset
{}, // soffset
{}, // offset
CachePolicy, // cachepolicy
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
};

if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
if (!Offset->isDivergent() && Subtarget->hasScalarSubwordBufferLoads()) {
// Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
// s_buffer_load_u16 instruction is emitted for both signed and unsigned
// loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
// and generates s_buffer_load_i16 (performSignExtendInRegCombine).
SDValue SBufferLoadOps[] = {Rsrc, Offset, CachePolicy};
SDValue BufferLoad = DAG.getMemIntrinsicNode(
AMDGPUISD::SBUFFER_LOAD_USHORT, DL, DAG.getVTList(MVT::i32),
SBufferLoadOps, VT, MMO);
return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
}

// If s_buffer_load_u16/u8 is not supported by the platform (gfx12, when we
// cannot ensure the buffer's num-records/stride is properly aligned),
// lower to a buffer_load_u8/u16.
setBufferOffsets(Offset, DAG, &BufferLoadOps[3], Align(4));
return handleByteShortBufferLoads(DAG, VT, DL, BufferLoadOps, MMO);
}

if (!Offset->isDivergent()) {
SDValue SBufferLoadOps[] = {Rsrc, Offset, CachePolicy};

// Widen vec3 load to vec4.
if (VT.isVector() && VT.getVectorNumElements() == 3 &&
!Subtarget->hasScalarDwordx3Loads()) {
EVT WidenedVT =
EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
auto WidenedOp = DAG.getMemIntrinsicNode(
AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), SBufferLoadOps,
WidenedVT, MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
DAG.getVectorIdxConstant(0, DL));
return Subvector;
}

return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
DAG.getVTList(VT), Ops, VT, MMO);
}

// We have a divergent offset. Emit a MUBUF buffer load instead. We can
// assume that the buffer is unswizzled.
SDValue Ops[] = {
DAG.getEntryNode(), // Chain
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
{}, // voffset
{}, // soffset
{}, // offset
CachePolicy, // cachepolicy
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
};
if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
DAG.getVTList(VT), SBufferLoadOps, VT, MMO);
}

SmallVector<SDValue, 4> Loads;
@@ -8431,14 +8436,14 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,

// Use the alignment to ensure that the required offsets will fit into the
// immediate offsets.
setBufferOffsets(Offset, DAG, &Ops[3],
setBufferOffsets(Offset, DAG, &BufferLoadOps[3],
NumLoads > 1 ? Align(16 * NumLoads) : Align(4));

uint64_t InstOffset = Ops[5]->getAsZExtVal();
uint64_t InstOffset = BufferLoadOps[5]->getAsZExtVal();
for (unsigned i = 0; i < NumLoads; ++i) {
Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
LoadVT, MMO, DAG));
BufferLoadOps[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
BufferLoadOps, LoadVT, MMO, DAG));
}

if (NumElts == 8 || NumElts == 16)
@@ -12680,7 +12685,7 @@ SITargetLowering::performSignExtendInRegCombine(SDNode *N,
VTSign->getVT() == MVT::i8) ||
(Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
VTSign->getVT() == MVT::i16))) {
assert(Subtarget->hasScalarSubwordLoads() &&
assert(Subtarget->hasScalarSubwordBufferLoads() &&
"s_buffer_load_{u8, i8} are supported "
"in GFX12 (or newer) architectures.");
EVT VT = Src.getValueType();
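The restructured `lowerSBuffer` above can be summarized by a small decision helper. This is the editor's sketch of the i16 path only, not code from the patch; `UniformOffset` and `HasScalarSubwordBufferLoads` mirror `!Offset->isDivergent()` and the new subtarget hook.

// Which node an i16 llvm.amdgcn.s.buffer.load lowers to after this change.
enum class I16SBufferLowering {
  ScalarSubwordLoad, // AMDGPUISD::SBUFFER_LOAD_USHORT (i32 result) + TRUNCATE;
                     // performSignExtendInRegCombine may later turn it into
                     // the signed SBUFFER_LOAD_SSHORT form.
  VectorSubwordLoad  // unswizzled MUBUF load via handleByteShortBufferLoads,
                     // with offsets set up by setBufferOffsets.
};

I16SBufferLowering
pickI16SBufferLowering(bool UniformOffset, bool HasScalarSubwordBufferLoads) {
  if (UniformOffset && HasScalarSubwordBufferLoads)
    return I16SBufferLowering::ScalarSubwordLoad;
  return I16SBufferLowering::VectorSubwordLoad;
}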