@@ -2098,10 +2098,17 @@ bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {
20982098
20992099bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
21002100 unsigned DestAS) const {
2101- // Flat -> private/local is a simple truncate.
2102- // Flat -> global is no-op
2103- if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
2101+ if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2102+ if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2103+ Subtarget->hasGloballyAddressableScratch()) {
2104+ // Flat -> private requires subtracting src_flat_scratch_base_lo.
2105+ return false;
2106+ }
2107+
2108+ // Flat -> private/local is a simple truncate.
2109+ // Flat -> global is no-op
21042110 return true;
2111+ }
21052112
21062113 const GCNTargetMachine &TM =
21072114 static_cast<const GCNTargetMachine &>(getTargetMachine());
@@ -7650,6 +7657,9 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
76507657 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
76517658 ? AMDGPU::SRC_SHARED_BASE
76527659 : AMDGPU::SRC_PRIVATE_BASE;
7660+ assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
7661+ !Subtarget->hasGloballyAddressableScratch()) &&
7662+ "Cannot use src_private_base with globally addressable scratch!");
76537663 // Note: this feature (register) is broken. When used as a 32-bit operand,
76547664 // it returns a wrong value (all zeroes?). The real value is in the upper 32
76557665 // bits.
@@ -7760,6 +7770,18 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
77607770 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
77617771 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
77627772
7773+ if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
7774+ Subtarget->hasGloballyAddressableScratch()) {
7775+ // flat -> private with globally addressable scratch: subtract
7776+ // src_flat_scratch_base_lo.
7777+ SDValue FlatScratchBaseLo(
7778+ DAG.getMachineNode(
7779+ AMDGPU::S_MOV_B32, SL, MVT::i32,
7780+ DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
7781+ 0);
7782+ Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
7783+ }
7784+
77637785 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
77647786 return Ptr;
77657787
@@ -7776,11 +7798,40 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
77767798 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
77777799 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
77787800 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
7779-
7780- SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
7781- SDValue CvtPtr =
7782- DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
7783- CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
7801+ SDValue CvtPtr;
7802+ if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
7803+ Subtarget->hasGloballyAddressableScratch()) {
7804+ // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
7805+ // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
7806+ SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
7807+ SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
7808+ ThreadID = DAG.getNode(
7809+ ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
7810+ DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
7811+ AllOnes, ThreadID);
7812+ if (Subtarget->isWave64())
7813+ ThreadID = DAG.getNode(
7814+ ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
7815+ DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
7816+ AllOnes, ThreadID);
7817+ SDValue ShAmt = DAG.getShiftAmountConstant(
7818+ 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
7819+ SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
7820+ CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
7821+ CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
7822+ // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
7823+ // 64-bit hi:lo value.
7824+ SDValue FlatScratchBase = {
7825+ DAG.getMachineNode(
7826+ AMDGPU::S_MOV_B64, SL, MVT::i64,
7827+ DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
7828+ 0};
7829+ CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
7830+ } else {
7831+ SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
7832+ CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
7833+ CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
7834+ }
77847835
77857836 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
77867837 return CvtPtr;
@@ -9424,15 +9475,29 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
94249475 case Intrinsic::amdgcn_is_shared:
94259476 case Intrinsic::amdgcn_is_private: {
94269477 SDLoc SL(Op);
9427- unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
9428- ? AMDGPUAS::LOCAL_ADDRESS
9429- : AMDGPUAS::PRIVATE_ADDRESS;
9430- SDValue Aperture = getSegmentAperture(AS, SL, DAG);
94319478 SDValue SrcVec =
94329479 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
9433-
94349480 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
94359481 DAG.getConstant(1, SL, MVT::i32));
9482+
9483+ unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
9484+ ? AMDGPUAS::LOCAL_ADDRESS
9485+ : AMDGPUAS::PRIVATE_ADDRESS;
9486+ if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
9487+ Subtarget->hasGloballyAddressableScratch()) {
9488+ SDValue FlatScratchBaseHi(
9489+ DAG.getMachineNode(
9490+ AMDGPU::S_MOV_B32, DL, MVT::i32,
9491+ DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
9492+ 0);
9493+ // Test bits 63..58 against the aperture address.
9494+ return DAG.getSetCC(
9495+ SL, MVT::i1,
9496+ DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
9497+ DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
9498+ }
9499+
9500+ SDValue Aperture = getSegmentAperture(AS, SL, DAG);
94369501 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
94379502 }
94389503 case Intrinsic::amdgcn_perm:
0 commit comments