Skip to content

Commit 54e8a9c

Browse files
rampitec authored and mahesh-attarde committed
[AMDGPU] Select scale_offset for global instructions on gfx1250 (llvm#150107)
Also switches immediate offset to signed for the subtarget.
1 parent ddfdff8 commit 54e8a9c

10 files changed

+1003
-144
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 55 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -1863,15 +1863,6 @@ bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
18631863
SIInstrFlags::FlatScratch);
18641864
}
18651865

1866-
// If this matches zero_extend i32:x, return x
1867-
static SDValue matchZExtFromI32(SDValue Op) {
1868-
if (Op.getOpcode() != ISD::ZERO_EXTEND)
1869-
return SDValue();
1870-
1871-
SDValue ExtSrc = Op.getOperand(0);
1872-
return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
1873-
}
1874-
18751866
// If this matches *_extend i32:x, return x
18761867
// Otherwise if the value is I32 returns x.
18771868
static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned,
@@ -1890,12 +1881,13 @@ static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned,
18901881
}
18911882

18921883
// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
1893-
bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
1894-
SDValue Addr,
1895-
SDValue &SAddr,
1896-
SDValue &VOffset,
1897-
SDValue &Offset) const {
1884+
// or (64-bit SGPR base) + (sext vgpr offset) + sext(imm offset)
1885+
bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
1886+
SDValue &SAddr, SDValue &VOffset,
1887+
SDValue &Offset, bool &ScaleOffset,
1888+
bool NeedIOffset) const {
18981889
int64_t ImmOffset = 0;
1890+
ScaleOffset = false;
18991891

19001892
// Match the immediate offset first, which canonically is moved as low as
19011893
// possible.
@@ -1905,7 +1897,8 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
19051897
int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
19061898
const SIInstrInfo *TII = Subtarget->getInstrInfo();
19071899

1908-
if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
1900+
if (NeedIOffset &&
1901+
TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
19091902
SIInstrFlags::FlatGlobal)) {
19101903
Addr = LHS;
19111904
ImmOffset = COffsetVal;
@@ -1915,11 +1908,14 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
19151908
// saddr + large_offset -> saddr +
19161909
// (voffset = large_offset & ~MaxOffset) +
19171910
// (large_offset & MaxOffset);
1918-
int64_t SplitImmOffset, RemainderOffset;
1919-
std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1920-
COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
1911+
int64_t SplitImmOffset = 0, RemainderOffset = COffsetVal;
1912+
if (NeedIOffset) {
1913+
std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1914+
COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
1915+
}
19211916

1922-
if (isUInt<32>(RemainderOffset)) {
1917+
if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
1918+
: isUInt<32>(RemainderOffset)) {
19231919
SDNode *VMov = CurDAG->getMachineNode(
19241920
AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
19251921
CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
@@ -1946,21 +1942,26 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
19461942
// Match the variable offset.
19471943
if (Addr.getOpcode() == ISD::ADD) {
19481944
LHS = Addr.getOperand(0);
1949-
RHS = Addr.getOperand(1);
19501945

19511946
if (!LHS->isDivergent()) {
1952-
// add (i64 sgpr), (zero_extend (i32 vgpr))
1953-
if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
1947+
// add (i64 sgpr), (*_extend (i32 vgpr))
1948+
RHS = Addr.getOperand(1);
1949+
ScaleOffset = SelectScaleOffset(N, RHS, Subtarget->hasSignedGVSOffset());
1950+
if (SDValue ExtRHS = matchExtFromI32orI32(
1951+
RHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
19541952
SAddr = LHS;
1955-
VOffset = ZextRHS;
1953+
VOffset = ExtRHS;
19561954
}
19571955
}
19581956

1957+
RHS = Addr.getOperand(1);
19591958
if (!SAddr && !RHS->isDivergent()) {
1960-
// add (zero_extend (i32 vgpr)), (i64 sgpr)
1961-
if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
1959+
// add (*_extend (i32 vgpr)), (i64 sgpr)
1960+
ScaleOffset = SelectScaleOffset(N, LHS, Subtarget->hasSignedGVSOffset());
1961+
if (SDValue ExtLHS = matchExtFromI32orI32(
1962+
LHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
19621963
SAddr = RHS;
1963-
VOffset = ZextLHS;
1964+
VOffset = ExtLHS;
19641965
}
19651966
}
19661967

@@ -1970,6 +1971,27 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
19701971
}
19711972
}
19721973

1974+
if (Subtarget->hasScaleOffset() &&
1975+
(Addr.getOpcode() == (Subtarget->hasSignedGVSOffset()
1976+
? AMDGPUISD::MAD_I64_I32
1977+
: AMDGPUISD::MAD_U64_U32) ||
1978+
(Addr.getOpcode() == AMDGPUISD::MAD_U64_U32 &&
1979+
CurDAG->SignBitIsZero(Addr.getOperand(0)))) &&
1980+
Addr.getOperand(0)->isDivergent() &&
1981+
isa<ConstantSDNode>(Addr.getOperand(1)) &&
1982+
!Addr.getOperand(2)->isDivergent()) {
1983+
// mad_u64_u32 (i32 vgpr), (i32 c), (i64 sgpr)
1984+
unsigned Size =
1985+
(unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
1986+
ScaleOffset = Addr.getConstantOperandVal(1) == Size;
1987+
if (ScaleOffset) {
1988+
SAddr = Addr.getOperand(2);
1989+
VOffset = Addr.getOperand(0);
1990+
Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
1991+
return true;
1992+
}
1993+
}
1994+
19731995
if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
19741996
isa<ConstantSDNode>(Addr))
19751997
return false;
@@ -1989,21 +2011,24 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
19892011
SDValue &SAddr, SDValue &VOffset,
19902012
SDValue &Offset,
19912013
SDValue &CPol) const {
1992-
if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset))
2014+
bool ScaleOffset;
2015+
if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
19932016
return false;
19942017

1995-
CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2018+
CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2019+
SDLoc(), MVT::i32);
19962020
return true;
19972021
}
19982022

19992023
bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
20002024
SDValue &SAddr, SDValue &VOffset,
20012025
SDValue &Offset,
20022026
SDValue &CPol) const {
2003-
if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset))
2027+
bool ScaleOffset;
2028+
if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
20042029
return false;
20052030

2006-
unsigned CPolVal = AMDGPU::CPol::GLC;
2031+
unsigned CPolVal = (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | AMDGPU::CPol::GLC;
20072032
CPol = CurDAG->getTargetConstant(CPolVal, SDLoc(), MVT::i32);
20082033
return true;
20092034
}

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -162,7 +162,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
162162
bool SelectScratchOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
163163
SDValue &Offset) const;
164164
bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
165-
SDValue &VOffset, SDValue &Offset) const;
165+
SDValue &VOffset, SDValue &Offset, bool &ScaleOffset,
166+
bool NeedIOffset = true) const;
166167
bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
167168
SDValue &VOffset, SDValue &Offset,
168169
SDValue &CPol) const;

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 50 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -5616,7 +5616,8 @@ AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
56165616
// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
56175617
InstructionSelector::ComplexRendererFns
56185618
AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
5619-
unsigned CPolBits) const {
5619+
unsigned CPolBits,
5620+
bool NeedIOffset) const {
56205621
Register Addr = Root.getReg();
56215622
Register PtrBase;
56225623
int64_t ConstOffset;
@@ -5627,7 +5628,8 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
56275628
std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
56285629

56295630
if (ConstOffset != 0) {
5630-
if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5631+
if (NeedIOffset &&
5632+
TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
56315633
SIInstrFlags::FlatGlobal)) {
56325634
Addr = PtrBase;
56335635
ImmOffset = ConstOffset;
@@ -5640,11 +5642,15 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
56405642
// saddr + large_offset -> saddr +
56415643
// (voffset = large_offset & ~MaxOffset) +
56425644
// (large_offset & MaxOffset);
5643-
int64_t SplitImmOffset, RemainderOffset;
5644-
std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
5645-
ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
5645+
int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset;
5646+
if (NeedIOffset) {
5647+
std::tie(SplitImmOffset, RemainderOffset) =
5648+
TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5649+
SIInstrFlags::FlatGlobal);
5650+
}
56465651

5647-
if (isUInt<32>(RemainderOffset)) {
5652+
if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
5653+
: isUInt<32>(RemainderOffset)) {
56485654
MachineInstr *MI = Root.getParent();
56495655
MachineBasicBlock *MBB = MI->getParent();
56505656
Register HighBits =
@@ -5654,12 +5660,22 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
56545660
HighBits)
56555661
.addImm(RemainderOffset);
56565662

5663+
if (NeedIOffset)
5664+
return {{
5665+
[=](MachineInstrBuilder &MIB) {
5666+
MIB.addReg(PtrBase);
5667+
}, // saddr
5668+
[=](MachineInstrBuilder &MIB) {
5669+
MIB.addReg(HighBits);
5670+
}, // voffset
5671+
[=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
5672+
[=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
5673+
}};
56575674
return {{
56585675
[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
56595676
[=](MachineInstrBuilder &MIB) {
56605677
MIB.addReg(HighBits);
56615678
}, // voffset
5662-
[=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
56635679
[=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
56645680
}};
56655681
}
@@ -5691,18 +5707,33 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
56915707

56925708
// It's possible voffset is an SGPR here, but the copy to VGPR will be
56935709
// inserted later.
5694-
if (Register VOffset = matchZeroExtendFromS32(PtrBaseOffset)) {
5710+
bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset,
5711+
Subtarget->hasSignedGVSOffset());
5712+
if (Register VOffset = matchExtendFromS32OrS32(
5713+
PtrBaseOffset, Subtarget->hasSignedGVSOffset())) {
5714+
if (NeedIOffset)
5715+
return {{[=](MachineInstrBuilder &MIB) { // saddr
5716+
MIB.addReg(SAddr);
5717+
},
5718+
[=](MachineInstrBuilder &MIB) { // voffset
5719+
MIB.addReg(VOffset);
5720+
},
5721+
[=](MachineInstrBuilder &MIB) { // offset
5722+
MIB.addImm(ImmOffset);
5723+
},
5724+
[=](MachineInstrBuilder &MIB) { // cpol
5725+
MIB.addImm(CPolBits |
5726+
(ScaleOffset ? AMDGPU::CPol::SCAL : 0));
5727+
}}};
56955728
return {{[=](MachineInstrBuilder &MIB) { // saddr
56965729
MIB.addReg(SAddr);
56975730
},
56985731
[=](MachineInstrBuilder &MIB) { // voffset
56995732
MIB.addReg(VOffset);
57005733
},
5701-
[=](MachineInstrBuilder &MIB) { // offset
5702-
MIB.addImm(ImmOffset);
5703-
},
57045734
[=](MachineInstrBuilder &MIB) { // cpol
5705-
MIB.addImm(CPolBits);
5735+
MIB.addImm(CPolBits |
5736+
(ScaleOffset ? AMDGPU::CPol::SCAL : 0));
57065737
}}};
57075738
}
57085739
}
@@ -5723,10 +5754,16 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
57235754
BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
57245755
.addImm(0);
57255756

5757+
if (NeedIOffset)
5758+
return {{
5759+
[=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
5760+
[=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
5761+
[=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
5762+
[=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
5763+
}};
57265764
return {{
57275765
[=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
57285766
[=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
5729-
[=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
57305767
[=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
57315768
}};
57325769
}

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -256,7 +256,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
256256
selectScratchOffset(MachineOperand &Root) const;
257257

258258
InstructionSelector::ComplexRendererFns
259-
selectGlobalSAddr(MachineOperand &Root, unsigned CPolBits) const;
259+
selectGlobalSAddr(MachineOperand &Root, unsigned CPolBits,
260+
bool NeedIOffset = true) const;
260261
InstructionSelector::ComplexRendererFns
261262
selectGlobalSAddr(MachineOperand &Root) const;
262263
InstructionSelector::ComplexRendererFns

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1167,6 +1167,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
11671167

11681168
bool hasFlatGVSMode() const { return FlatGVSMode; }
11691169

1170+
// FLAT GLOBAL VOffset is signed
1171+
bool hasSignedGVSOffset() const { return GFX1250Insts; }
1172+
11701173
bool enableSIScheduler() const {
11711174
return EnableSIScheduler;
11721175
}

llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -145,12 +145,13 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 {
145145
; GCN: ; %bb.0: ; %bb
146146
; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
147147
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
148-
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
149-
; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
148+
; GCN-NEXT: v_mov_b32_e32 v1, 0
150149
; GCN-NEXT: s_wait_kmcnt 0x0
151-
; GCN-NEXT: global_load_b32 v2, v0, s[0:1] scope:SCOPE_SYS
150+
; GCN-NEXT: global_load_b32 v2, v0, s[0:1] scale_offset scope:SCOPE_SYS
152151
; GCN-NEXT: s_wait_loadcnt 0x0
153152
; GCN-NEXT: s_wait_xcnt 0x0
153+
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
154+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
154155
; GCN-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
155156
; GCN-NEXT: s_mov_b32 s0, exec_lo
156157
; GCN-NEXT: v_cmpx_ne_u32_e32 0, v2

0 commit comments

Comments
 (0)