Skip to content

Commit 96e5eed

Browse files
authored
[AMDGPU] Select VMEM prefetch for llvm.prefetch on gfx1250 (#150493)
We have a choice between a scalar and a vector prefetch for a uniform pointer. Since we do not have scalar stores, our scalar cache is practically read-only. The rw argument of the prefetch intrinsic is used to force a vector operation even in the uniform case. On GFX12 a scalar prefetch will be used anyway; it is still useful, but it will only bring data to L2.
1 parent 7884c07 commit 96e5eed

13 files changed

+1016
-47
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,12 @@ def FeatureSafeSmemPrefetch : SubtargetFeature<"safe-smem-prefetch",
280280
"SMEM prefetches do not fail on illegal address"
281281
>;
282282

283+
def FeatureSafeCUPrefetch : SubtargetFeature<"safe-cu-prefetch",
284+
"HasSafeCUPrefetch",
285+
"true",
286+
"VMEM CU scope prefetches do not fail on illegal address"
287+
>;
288+
283289
def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard",
284290
"HasVcmpxExecWARHazard",
285291
"true",

llvm/lib/Target/AMDGPU/AMDGPUGISel.td

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -446,5 +446,8 @@ def gi_fp_pow2_to_exponent : GICustomOperandRenderer<"renderFPPow2ToExponent">,
446446
def gi_as_hw_round_mode : GICustomOperandRenderer<"renderRoundMode">,
447447
GISDNodeXFormEquiv<as_hw_round_mode>;
448448

449+
def gi_prefetch_loc : GICustomOperandRenderer<"renderPrefetchLoc">,
450+
GISDNodeXFormEquiv<PrefetchLoc>;
451+
449452
def gi_MFMALdScaleModifierOp : GICustomOperandRenderer<"renderScaledMAIIntrinsicOperand">,
450453
GISDNodeXFormEquiv<MFMALdScaleXForm>;

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "SIModeRegisterDefaults.h"
2020
#include "llvm/Analysis/ValueTracking.h"
2121
#include "llvm/CodeGen/SelectionDAGISel.h"
22+
#include "llvm/Support/AMDGPUAddrSpace.h"
2223
#include "llvm/Target/TargetMachine.h"
2324

2425
namespace llvm {

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7068,6 +7068,17 @@ void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
70687068
MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
70697069
}
70707070

7071+
void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
7072+
const MachineInstr &MI,
7073+
int OpIdx) const {
7074+
uint32_t V = MI.getOperand(2).getImm();
7075+
V = (AMDGPU::CPol::SCOPE_MASK - (V & AMDGPU::CPol::SCOPE_MASK))
7076+
<< AMDGPU::CPol::SCOPE_SHIFT;
7077+
if (!Subtarget->hasSafeCUPrefetch())
7078+
V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
7079+
MIB.addImm(V);
7080+
}
7081+
70717082
/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
70727083
void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
70737084
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -414,6 +414,10 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
414414

415415
void renderRoundMode(MachineInstrBuilder &MIB, const MachineInstr &MI,
416416
int OpIdx) const;
417+
418+
void renderPrefetchLoc(MachineInstrBuilder &MIB, const MachineInstr &MI,
419+
int OpIdx) const;
420+
417421
void renderScaledMAIIntrinsicOperand(MachineInstrBuilder &MIB,
418422
const MachineInstr &MI, int OpIdx) const;
419423

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3501,19 +3501,24 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
35013501
applyMappingMAD_64_32(B, OpdMapper);
35023502
return;
35033503
case AMDGPU::G_PREFETCH: {
3504-
if (!Subtarget.hasPrefetch() || !Subtarget.hasSafeSmemPrefetch()) {
3504+
if (!Subtarget.hasSafeSmemPrefetch() && !Subtarget.hasVmemPrefInsts()) {
35053505
MI.eraseFromParent();
35063506
return;
35073507
}
35083508
Register PtrReg = MI.getOperand(0).getReg();
35093509
unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID);
3510-
if (PtrBank == AMDGPU::VGPRRegBankID) {
3510+
if (PtrBank == AMDGPU::VGPRRegBankID &&
3511+
(!Subtarget.hasVmemPrefInsts() || !MI.getOperand(3).getImm())) {
3512+
// Cannot do I$ prefetch with divergent pointer.
35113513
MI.eraseFromParent();
35123514
return;
35133515
}
35143516
unsigned AS = MRI.getType(PtrReg).getAddressSpace();
3515-
if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
3516-
AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3517+
if ((!AMDGPU::isFlatGlobalAddrSpace(AS) &&
3518+
AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
3519+
(!Subtarget.hasSafeSmemPrefetch() &&
3520+
(AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
3521+
!MI.getOperand(3).getImm() /* I$ prefetch */))) {
35173522
MI.eraseFromParent();
35183523
return;
35193524
}

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2184,6 +2184,50 @@ defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2f
21842184

21852185
} // End OtherPredicates = [HasFlatScratchInsts,EnableFlatScratch]
21862186

2187+
def PrefetchLoc: SDNodeXForm<timm, [{
2188+
uint32_t V = N->getZExtValue();
2189+
V = (AMDGPU::CPol::SCOPE_MASK - (V & AMDGPU::CPol::SCOPE_MASK)) << AMDGPU::CPol::SCOPE_SHIFT;
2190+
if (!Subtarget->hasSafeCUPrefetch())
2191+
V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
2192+
return CurDAG->getTargetConstant(V, SDLoc(N), MVT::i32);
2193+
}]>;
2194+
2195+
def prefetch_flat : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type),
2196+
(prefetch node:$ptr, node:$rw, node:$loc, node:$type),
2197+
[{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS; }]> {
2198+
let GISelPredicateCode = [{
2199+
return (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
2200+
}];
2201+
}
2202+
2203+
def prefetch_global : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type),
2204+
(prefetch node:$ptr, node:$rw, node:$loc, node:$type),
2205+
[{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
2206+
(cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
2207+
!Subtarget->hasSafeSmemPrefetch()); }]> {
2208+
let GISelPredicateCode = [{
2209+
return (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
2210+
((*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
2211+
!Subtarget->hasSafeSmemPrefetch());
2212+
}];
2213+
}
2214+
2215+
multiclass FlatPrefetchPats<string inst, SDPatternOperator prefetch_kind, SDPatternOperator rw> {
2216+
def : GCNPat <
2217+
(prefetch_kind (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), rw, (i32 timm:$loc), i32imm_one),
2218+
(!cast<FLAT_Pseudo>(inst) $vaddr, $offset, (i32 (PrefetchLoc $loc)))
2219+
> {
2220+
let AddedComplexity = !if(!eq(rw, i32imm_zero), 0, 25);
2221+
}
2222+
2223+
def : GCNPat <
2224+
(prefetch_kind (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), rw, (i32 timm:$loc), i32imm_one),
2225+
(!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, (i32 (PrefetchLoc $loc)))
2226+
> {
2227+
let AddedComplexity = !if(!eq(rw, i32imm_zero), 11, 30);
2228+
}
2229+
}
2230+
21872231
multiclass FlatIntrPrefetchPats<string inst, SDPatternOperator intr> {
21882232
def : GCNPat <
21892233
(intr (FlatOffset i64:$vaddr, i32:$offset), timm:$cpol),
@@ -2198,6 +2242,14 @@ multiclass FlatIntrPrefetchPats<string inst, SDPatternOperator intr> {
21982242
}
21992243

22002244
let SubtargetPredicate = HasVmemPrefInsts in {
2245+
defm : FlatPrefetchPats<"FLAT_PREFETCH_B8", prefetch_flat, i32imm_zero>;
2246+
defm : FlatPrefetchPats<"GLOBAL_PREFETCH_B8", prefetch_global, i32imm_zero>;
2247+
2248+
// Patterns for forced vector prefetch with rw = 1.
2249+
defm : FlatPrefetchPats<"FLAT_PREFETCH_B8", prefetch_flat, i32imm_one>;
2250+
defm : FlatPrefetchPats<"GLOBAL_PREFETCH_B8", prefetch_global, i32imm_one>;
2251+
2252+
22012253
// Patterns for target intrinsics
22022254
defm : FlatIntrPrefetchPats<"FLAT_PREFETCH_B8", int_amdgcn_flat_prefetch>;
22032255
defm : FlatIntrPrefetchPats<"GLOBAL_PREFETCH_B8", int_amdgcn_global_prefetch>;

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
247247
bool HasInstFwdPrefetchBug = false;
248248
bool HasVmemPrefInsts = false;
249249
bool HasSafeSmemPrefetch = false;
250+
bool HasSafeCUPrefetch = false;
250251
bool HasVcmpxExecWARHazard = false;
251252
bool HasLdsBranchVmemWARHazard = false;
252253
bool HasNSAtoVMEMBug = false;
@@ -995,6 +996,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
995996

996997
bool hasSafeSmemPrefetch() const { return HasSafeSmemPrefetch; }
997998

999+
bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; }
1000+
9981001
// Has s_cmpk_* instructions.
9991002
bool hasSCmpK() const { return getGeneration() < GFX12; }
10001003

llvm/lib/Target/AMDGPU/SIDefines.h

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -392,11 +392,13 @@ enum CPol {
392392
TH_ATOMIC_CASCADE = 4, // Cascading vs regular
393393

394394
// Scope
395-
SCOPE = 0x3 << 3, // All Scope bits
396-
SCOPE_CU = 0 << 3,
397-
SCOPE_SE = 1 << 3,
398-
SCOPE_DEV = 2 << 3,
399-
SCOPE_SYS = 3 << 3,
395+
SCOPE_SHIFT = 3,
396+
SCOPE_MASK = 0x3,
397+
SCOPE = SCOPE_MASK << SCOPE_SHIFT, // All Scope bits
398+
SCOPE_CU = 0 << SCOPE_SHIFT,
399+
SCOPE_SE = 1 << SCOPE_SHIFT,
400+
SCOPE_DEV = 2 << SCOPE_SHIFT,
401+
SCOPE_SYS = 3 << SCOPE_SHIFT,
400402

401403
NV = 1 << 5, // Non-volatile bit
402404

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -882,7 +882,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
882882
if (Subtarget->hasMad64_32())
883883
setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);
884884

885-
if (Subtarget->hasPrefetch() && Subtarget->hasSafeSmemPrefetch())
885+
if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
886886
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
887887

888888
if (Subtarget->hasIEEEMinimumMaximumInsts()) {
@@ -4444,19 +4444,28 @@ SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
44444444
}
44454445

44464446
SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
4447-
if (Op->isDivergent())
4447+
if (Op->isDivergent() &&
4448+
(!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4449+
// Cannot do I$ prefetch with divergent pointer.
44484450
return SDValue();
44494451

44504452
switch (cast<MemSDNode>(Op)->getAddressSpace()) {
44514453
case AMDGPUAS::FLAT_ADDRESS:
44524454
case AMDGPUAS::GLOBAL_ADDRESS:
44534455
case AMDGPUAS::CONSTANT_ADDRESS:
4454-
case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
44554456
break;
4457+
case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
4458+
if (Subtarget->hasSafeSmemPrefetch())
4459+
break;
4460+
[[fallthrough]];
44564461
default:
44574462
return SDValue();
44584463
}
44594464

4465+
// I$ prefetch
4466+
if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4467+
return SDValue();
4468+
44604469
return Op;
44614470
}
44624471

0 commit comments

Comments
 (0)