Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 55 additions & 30 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1863,15 +1863,6 @@ bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
SIInstrFlags::FlatScratch);
}

// Returns x when Op is (zero_extend i32:x); otherwise returns an empty
// SDValue.
static SDValue matchZExtFromI32(SDValue Op) {
  if (Op.getOpcode() == ISD::ZERO_EXTEND) {
    SDValue Src = Op.getOperand(0);
    if (Src.getValueType() == MVT::i32)
      return Src;
  }

  return SDValue();
}

// If this matches *_extend i32:x, return x.
// Otherwise, if Op itself is an i32 value, return Op.
static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned,
Expand All @@ -1890,12 +1881,13 @@ static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned,
}

// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
SDValue Addr,
SDValue &SAddr,
SDValue &VOffset,
SDValue &Offset) const {
// or (64-bit SGPR base) + (sext vgpr offset) + sext(imm offset)
bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
SDValue &SAddr, SDValue &VOffset,
SDValue &Offset, bool &ScaleOffset,
bool NeedIOffset) const {
int64_t ImmOffset = 0;
ScaleOffset = false;

// Match the immediate offset first, which canonically is moved as low as
// possible.
Expand All @@ -1905,7 +1897,8 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
const SIInstrInfo *TII = Subtarget->getInstrInfo();

if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
if (NeedIOffset &&
TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
SIInstrFlags::FlatGlobal)) {
Addr = LHS;
ImmOffset = COffsetVal;
Expand All @@ -1915,11 +1908,14 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
// saddr + large_offset -> saddr +
// (voffset = large_offset & ~MaxOffset) +
// (large_offset & MaxOffset);
int64_t SplitImmOffset, RemainderOffset;
std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
int64_t SplitImmOffset = 0, RemainderOffset = COffsetVal;
if (NeedIOffset) {
std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
}

if (isUInt<32>(RemainderOffset)) {
if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
: isUInt<32>(RemainderOffset)) {
SDNode *VMov = CurDAG->getMachineNode(
AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
Expand All @@ -1946,21 +1942,26 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
// Match the variable offset.
if (Addr.getOpcode() == ISD::ADD) {
LHS = Addr.getOperand(0);
RHS = Addr.getOperand(1);

if (!LHS->isDivergent()) {
// add (i64 sgpr), (zero_extend (i32 vgpr))
if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
// add (i64 sgpr), (*_extend (i32 vgpr))
RHS = Addr.getOperand(1);
ScaleOffset = SelectScaleOffset(N, RHS, Subtarget->hasSignedGVSOffset());
if (SDValue ExtRHS = matchExtFromI32orI32(
RHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
SAddr = LHS;
VOffset = ZextRHS;
VOffset = ExtRHS;
}
}

RHS = Addr.getOperand(1);
if (!SAddr && !RHS->isDivergent()) {
// add (zero_extend (i32 vgpr)), (i64 sgpr)
if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
// add (*_extend (i32 vgpr)), (i64 sgpr)
ScaleOffset = SelectScaleOffset(N, LHS, Subtarget->hasSignedGVSOffset());
if (SDValue ExtLHS = matchExtFromI32orI32(
LHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
SAddr = RHS;
VOffset = ZextLHS;
VOffset = ExtLHS;
}
}

Expand All @@ -1970,6 +1971,27 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
}
}

if (Subtarget->hasScaleOffset() &&
(Addr.getOpcode() == (Subtarget->hasSignedGVSOffset()
? AMDGPUISD::MAD_I64_I32
: AMDGPUISD::MAD_U64_U32) ||
(Addr.getOpcode() == AMDGPUISD::MAD_U64_U32 &&
CurDAG->SignBitIsZero(Addr.getOperand(0)))) &&
Addr.getOperand(0)->isDivergent() &&
isa<ConstantSDNode>(Addr.getOperand(1)) &&
!Addr.getOperand(2)->isDivergent()) {
// mad_u64_u32 (i32 vgpr), (i32 c), (i64 sgpr)
unsigned Size =
(unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
ScaleOffset = Addr.getConstantOperandVal(1) == Size;
if (ScaleOffset) {
SAddr = Addr.getOperand(2);
VOffset = Addr.getOperand(0);
Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
return true;
}
}

if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
isa<ConstantSDNode>(Addr))
return false;
Expand All @@ -1989,21 +2011,24 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
SDValue &SAddr, SDValue &VOffset,
SDValue &Offset,
SDValue &CPol) const {
if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset))
bool ScaleOffset;
if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
return false;

CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
SDLoc(), MVT::i32);
return true;
}

bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
SDValue &SAddr, SDValue &VOffset,
SDValue &Offset,
SDValue &CPol) const {
if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset))
bool ScaleOffset;
if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
return false;

unsigned CPolVal = AMDGPU::CPol::GLC;
unsigned CPolVal = (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | AMDGPU::CPol::GLC;
CPol = CurDAG->getTargetConstant(CPolVal, SDLoc(), MVT::i32);
return true;
}
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool SelectScratchOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
SDValue &Offset) const;
bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &Offset) const;
SDValue &VOffset, SDValue &Offset, bool &ScaleOffset,
bool NeedIOffset = true) const;
bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &Offset,
SDValue &CPol) const;
Expand Down
63 changes: 50 additions & 13 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5616,7 +5616,8 @@ AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
// or (64-bit SGPR base) + (sext vgpr offset) + sext(imm offset)
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
unsigned CPolBits) const {
unsigned CPolBits,
bool NeedIOffset) const {
Register Addr = Root.getReg();
Register PtrBase;
int64_t ConstOffset;
Expand All @@ -5627,7 +5628,8 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);

if (ConstOffset != 0) {
if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
if (NeedIOffset &&
TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
SIInstrFlags::FlatGlobal)) {
Addr = PtrBase;
ImmOffset = ConstOffset;
Expand All @@ -5640,11 +5642,15 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
// saddr + large_offset -> saddr +
// (voffset = large_offset & ~MaxOffset) +
// (large_offset & MaxOffset);
int64_t SplitImmOffset, RemainderOffset;
std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset;
if (NeedIOffset) {
std::tie(SplitImmOffset, RemainderOffset) =
TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
SIInstrFlags::FlatGlobal);
}

if (isUInt<32>(RemainderOffset)) {
if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
: isUInt<32>(RemainderOffset)) {
MachineInstr *MI = Root.getParent();
MachineBasicBlock *MBB = MI->getParent();
Register HighBits =
Expand All @@ -5654,12 +5660,22 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
HighBits)
.addImm(RemainderOffset);

if (NeedIOffset)
return {{
[=](MachineInstrBuilder &MIB) {
MIB.addReg(PtrBase);
}, // saddr
[=](MachineInstrBuilder &MIB) {
MIB.addReg(HighBits);
}, // voffset
[=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
}};
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
[=](MachineInstrBuilder &MIB) {
MIB.addReg(HighBits);
}, // voffset
[=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
}};
}
Expand Down Expand Up @@ -5691,18 +5707,33 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,

// It's possible voffset is an SGPR here, but the copy to VGPR will be
// inserted later.
if (Register VOffset = matchZeroExtendFromS32(PtrBaseOffset)) {
bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset,
Subtarget->hasSignedGVSOffset());
if (Register VOffset = matchExtendFromS32OrS32(
PtrBaseOffset, Subtarget->hasSignedGVSOffset())) {
if (NeedIOffset)
return {{[=](MachineInstrBuilder &MIB) { // saddr
MIB.addReg(SAddr);
},
[=](MachineInstrBuilder &MIB) { // voffset
MIB.addReg(VOffset);
},
[=](MachineInstrBuilder &MIB) { // offset
MIB.addImm(ImmOffset);
},
[=](MachineInstrBuilder &MIB) { // cpol
MIB.addImm(CPolBits |
(ScaleOffset ? AMDGPU::CPol::SCAL : 0));
}}};
return {{[=](MachineInstrBuilder &MIB) { // saddr
MIB.addReg(SAddr);
},
[=](MachineInstrBuilder &MIB) { // voffset
MIB.addReg(VOffset);
},
[=](MachineInstrBuilder &MIB) { // offset
MIB.addImm(ImmOffset);
},
[=](MachineInstrBuilder &MIB) { // cpol
MIB.addImm(CPolBits);
MIB.addImm(CPolBits |
(ScaleOffset ? AMDGPU::CPol::SCAL : 0));
}}};
}
}
Expand All @@ -5723,10 +5754,16 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
.addImm(0);

if (NeedIOffset)
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
[=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
[=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
[=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
}};
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
[=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
[=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
[=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
}};
}
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
selectScratchOffset(MachineOperand &Root) const;

InstructionSelector::ComplexRendererFns
selectGlobalSAddr(MachineOperand &Root, unsigned CPolBits) const;
selectGlobalSAddr(MachineOperand &Root, unsigned CPolBits,
bool NeedIOffset = true) const;
InstructionSelector::ComplexRendererFns
selectGlobalSAddr(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -1167,6 +1167,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,

bool hasFlatGVSMode() const { return FlatGVSMode; }

// FLAT GLOBAL VOffset is signed.
// Returns the GFX1250Insts flag. Address selection uses this to decide
// whether the 32-bit VOffset is matched as a sign-extended (true) or
// zero-extended (false) value.
bool hasSignedGVSOffset() const { return GFX1250Insts; }

bool enableSIScheduler() const {
return EnableSIScheduler;
}
Expand Down
7 changes: 4 additions & 3 deletions llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
Original file line number Diff line number Diff line change
Expand Up @@ -145,12 +145,13 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 {
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: global_load_b32 v2, v0, s[0:1] scope:SCOPE_SYS
; GCN-NEXT: global_load_b32 v2, v0, s[0:1] scale_offset scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: s_wait_xcnt 0x0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
; GCN-NEXT: s_mov_b32 s0, exec_lo
; GCN-NEXT: v_cmpx_ne_u32_e32 0, v2
Expand Down
Loading
Loading