87 changes: 77 additions & 10 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2271,6 +2271,9 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
? AMDGPU::SRC_SHARED_BASE
: AMDGPU::SRC_PRIVATE_BASE;
assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
!ST.hasGloballyAddressableScratch()) &&
"Cannot use src_private_base with globally addressable scratch!");
// FIXME: It would be more natural to emit a COPY here, but then copy
// coalescing would kick in and it would think it's okay to use the "HI"
// subregister (instead of extracting the HI 32 bits) which is an artificial
@@ -2396,11 +2399,30 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
ST.hasGloballyAddressableScratch()) {
// flat -> private with globally addressable scratch: subtract
// src_flat_scratch_base_lo.
const LLT S32 = LLT::scalar(32);
Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
Register FlatScratchBaseLo =
B.buildInstr(AMDGPU::S_MOV_B32, {S32},
{Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
.getReg(0);
MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
Register Sub = B.buildSub(S32, SrcLo, FlatScratchBaseLo).getReg(0);
return B.buildIntToPtr(Dst, Sub).getReg(0);
}

// Extract low 32-bits of the pointer.
return B.buildExtract(Dst, Src, 0).getReg(0);
};

// For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
// G_ADDRSPACE_CAST we need to guess.
if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
// Extract low 32-bits of the pointer.
B.buildExtract(Dst, Src, 0);
castFlatToLocalOrPrivate(Dst);
MI.eraseFromParent();
return true;
}
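With globally addressable scratch, flat -> private is no longer a plain truncate: the 64-bit flat address has to be rebased by subtracting the low half of FLAT_SCRATCH_BASE. Below is a minimal host-side sketch of the math the new `castFlatToLocalOrPrivate` lambda emits; `Flat` and `FlatScratchBaseLo` are hypothetical stand-ins for the values the generated MIR reads, so this is an illustration of the address arithmetic, not the lowering itself.

```cpp
#include <cstdint>

// Flat -> private under globally addressable scratch: take the low 32 bits
// of the flat pointer and subtract src_flat_scratch_base_lo. Without the
// feature, the conversion is just the truncation.
uint32_t flatToPrivate(uint64_t Flat, uint32_t FlatScratchBaseLo) {
  uint32_t Lo = static_cast<uint32_t>(Flat); // extract of the low half
  return Lo - FlatScratchBaseLo;             // subtract the scratch base
}
```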
@@ -2411,7 +2433,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
auto FlatNull = B.buildConstant(SrcTy, 0);

// Extract low 32-bits of the pointer.
auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);

auto CmpRes =
B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
@@ -2425,14 +2447,45 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
if (!ApertureReg.isValid())
return false;

// Coerce the type of the low half of the result so we can use
// merge_values.
Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
ST.hasGloballyAddressableScratch()) {
// For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
// For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
Register AllOnes = B.buildConstant(S32, -1).getReg(0);
Register ThreadID = B.buildConstant(S32, 0).getReg(0);
ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
.addUse(AllOnes)
.addUse(ThreadID)
.getReg(0);
if (ST.isWave64()) {
ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
.addUse(AllOnes)
.addUse(ThreadID)
.getReg(0);
}
Register ShAmt =
B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
Register CvtPtr =
B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
// Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
// 64-bit hi:lo value.
Register FlatScratchBase =
B.buildInstr(AMDGPU::S_MOV_B64, {S64},
{Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
.getReg(0);
MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
}

Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
if (!ApertureReg.isValid())
return false;

// TODO: Should we allow mismatched types but matching sizes in merges to
// avoid the ptrtoint?
return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
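The reverse direction composes the 64-bit flat address from three pieces: the 32-bit private offset in the low half, the lane id shifted into the top bits, and a 64-bit add of FLAT_SCRATCH_BASE. The shift constant `57 - 32 - getWavefrontSizeLog2()` works out to 20 for wave32 and 19 for wave64, which lands the lane id at bits 52..56 (or 51..56) of the full address, matching the comment in the block above. A sketch of that arithmetic in plain C++, with hypothetical names (the real code emits MIR):

```cpp
#include <cstdint>

// Private -> flat under globally addressable scratch. Tid is the lane id
// (0..W-1) as produced by the mbcnt_lo/mbcnt_hi pair; Log2W is 5 for wave32
// and 6 for wave64.
uint64_t privateToFlat(uint32_t PrivateAddr, uint32_t Tid,
                       uint64_t FlatScratchBase, unsigned Log2W) {
  uint32_t Hi = Tid << (57 - 32 - Log2W); // lane id into bits 52..56 / 51..56
  uint64_t CvtPtr = (uint64_t(Hi) << 32) | PrivateAddr; // merge {lo, hi}
  return CvtPtr + FlatScratchBase; // 64-bit base add (the ptradd above)
}
```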
@@ -5788,11 +5841,25 @@ bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B,
unsigned AddrSpace) const {
Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
const LLT S32 = LLT::scalar(32);
auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());
Register Hi32 = Unmerge.getReg(1);

B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS &&
ST.hasGloballyAddressableScratch()) {
Register FlatScratchBaseHi =
B.buildInstr(AMDGPU::S_MOV_B32, {S32},
{Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
.getReg(0);
MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
// Test bits 63..58 against the aperture address.
Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);
B.buildICmp(ICmpInst::ICMP_ULT, MI.getOperand(0), XOR,
B.buildConstant(S32, 1u << 26));
} else {
Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
}
MI.eraseFromParent();
return true;
}
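The `is_private` test changes accordingly: with every lane's scratch mapped into the flat aperture, the high word of a private pointer is no longer a single aperture constant, so an equality compare against src_private_base would be wrong. Only address bits 63..58 identify the scratch window; the bits below carry the swizzled lane id and offset. XORing the high words clears every bit that matches, so the result is below 2^26 exactly when those top six bits agree. A small sketch of the predicate, with hypothetical inputs:

```cpp
#include <cstdint>

// Equivalent of the XOR + unsigned-less-than emitted above. PtrHi and
// FlatScratchBaseHi are the high 32 bits of the tested pointer and of
// FLAT_SCRATCH_BASE. Bits 31..26 of the high word are address bits 63..58.
bool isPrivate(uint32_t PtrHi, uint32_t FlatScratchBaseHi) {
  return (PtrHi ^ FlatScratchBaseHi) < (1u << 26);
}
```

For example (made-up values), with a base high word of 0x5C000000, a pointer high word of 0x5C123456 XORs to 0x00123456 < 2^26 and is classified private, while 0x60123456 is not.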
91 changes: 78 additions & 13 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2098,10 +2098,17 @@ bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {

bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
// Flat -> private/local is a simple truncate.
// Flat -> global is no-op
if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
Subtarget->hasGloballyAddressableScratch()) {
// Flat -> private requires subtracting src_flat_scratch_base_lo.
return false;
}

// Flat -> private/local is a simple truncate.
// Flat -> global is no-op
return true;
}

const GCNTargetMachine &TM =
static_cast<const GCNTargetMachine &>(getTargetMachine());
@@ -7650,6 +7657,9 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
? AMDGPU::SRC_SHARED_BASE
: AMDGPU::SRC_PRIVATE_BASE;
assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
!Subtarget->hasGloballyAddressableScratch()) &&
"Cannot use src_private_base with globally addressable scratch!");
// Note: this feature (register) is broken. When used as a 32-bit operand,
// it returns a wrong value (all zeroes?). The real value is in the upper 32
// bits.
@@ -7760,6 +7770,18 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);

if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
Subtarget->hasGloballyAddressableScratch()) {
// flat -> private with globally addressable scratch: subtract
// src_flat_scratch_base_lo.
SDValue FlatScratchBaseLo(
DAG.getMachineNode(
AMDGPU::S_MOV_B32, SL, MVT::i32,
DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
0);
Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
}

if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
return Ptr;

@@ -7776,11 +7798,40 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {

SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
SDValue CvtPtr =
DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
SDValue CvtPtr;
if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
Subtarget->hasGloballyAddressableScratch()) {
// For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
// For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
ThreadID = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
AllOnes, ThreadID);
if (Subtarget->isWave64())
ThreadID = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
AllOnes, ThreadID);
SDValue ShAmt = DAG.getShiftAmountConstant(
57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
// Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
// 64-bit hi:lo value.
SDValue FlatScratchBase = {
DAG.getMachineNode(
AMDGPU::S_MOV_B64, SL, MVT::i64,
DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
0};
CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
} else {
SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
}

if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
return CvtPtr;
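Restating the formula from the comments in this hunk as one equation, with W the wavefront size and TID the lane id produced by the mbcnt pair:

$$\text{Addr} = (\text{TID} \bmod W)\cdot 2^{\,57-\log_2 W} + \text{FLAT\_SCRATCH\_BASE} + \text{privateAddr}, \qquad W \in \{32, 64\}$$

so the lane id occupies address bits 52..56 for wave32 and 51..56 for wave64, and the SelectionDAG path computes exactly what the GlobalISel path above does.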
@@ -9424,15 +9475,29 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_is_shared:
case Intrinsic::amdgcn_is_private: {
SDLoc SL(Op);
unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
? AMDGPUAS::LOCAL_ADDRESS
: AMDGPUAS::PRIVATE_ADDRESS;
SDValue Aperture = getSegmentAperture(AS, SL, DAG);
SDValue SrcVec =
DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));

SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
DAG.getConstant(1, SL, MVT::i32));

unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
? AMDGPUAS::LOCAL_ADDRESS
: AMDGPUAS::PRIVATE_ADDRESS;
if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
Subtarget->hasGloballyAddressableScratch()) {
SDValue FlatScratchBaseHi(
DAG.getMachineNode(
AMDGPU::S_MOV_B32, DL, MVT::i32,
DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
0);
// Test bits 63..58 against the aperture address.
return DAG.getSetCC(
SL, MVT::i1,
DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
}

SDValue Aperture = getSegmentAperture(AS, SL, DAG);
return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
}
case Intrinsic::amdgcn_perm:
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -866,7 +866,8 @@ def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16, v4bf16],

def SReg_64_XEXEC_XNULL : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16, v4bf16], 32,
(add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, SRC_SHARED_BASE,
SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, TTMP_64, TBA, TMA)> {
SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, TTMP_64, TBA, TMA,
SRC_FLAT_SCRATCH_BASE)> {
let CopyCost = 1;
let AllocationPriority = 1;
let HasSGPR = 1;