Skip to content

Commit d08c297

Browse files
authored
[AMDGPU] Add MC support for new gfx1250 src_flat_scratch_base_lo/hi (#152203)
1 parent dbaa82b commit d08c297

File tree

9 files changed

+139
-42
lines changed

9 files changed

+139
-42
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1365,6 +1365,13 @@ def FeatureXF32Insts : SubtargetFeature<"xf32-insts",
13651365
"v_mfma_f32_16x16x8_xf32 and v_mfma_f32_32x32x4_xf32"
13661366
>;
13671367

1368+
def FeatureGloballyAddressableScratch : SubtargetFeature<
1369+
"globally-addressable-scratch",
1370+
"HasGloballyAddressableScratch",
1371+
"true",
1372+
"FLAT instructions can access scratch memory for any thread in any wave"
1373+
>;
1374+
13681375
// FIXME: Remove after all users are migrated to attribute.
13691376
def FeatureDynamicVGPR : SubtargetFeature <"dynamic-vgpr",
13701377
"DynamicVGPR",
@@ -2055,6 +2062,7 @@ def FeatureISAVersion12_50 : FeatureSet<
20552062
FeatureAtomicFMinFMaxF64FlatInsts,
20562063
FeatureFlatBufferGlobalAtomicFaddF64Inst,
20572064
FeatureMemoryAtomicFAddF32DenormalSupport,
2065+
FeatureGloballyAddressableScratch,
20582066
FeatureKernargPreload,
20592067
FeatureVmemPrefInsts,
20602068
FeatureLshlAddU64Inst,

llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

Lines changed: 49 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -1620,6 +1620,10 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
16201620
return getFeatureBits()[AMDGPU::FeaturePartialNSAEncoding];
16211621
}
16221622

1623+
bool hasGloballyAddressableScratch() const {
1624+
return getFeatureBits()[AMDGPU::FeatureGloballyAddressableScratch];
1625+
}
1626+
16231627
unsigned getNSAMaxSize(bool HasSampler = false) const {
16241628
return AMDGPU::getNSAMaxSize(getSTI(), HasSampler);
16251629
}
@@ -2759,46 +2763,48 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
27592763

27602764
static MCRegister getSpecialRegForName(StringRef RegName) {
27612765
return StringSwitch<unsigned>(RegName)
2762-
.Case("exec", AMDGPU::EXEC)
2763-
.Case("vcc", AMDGPU::VCC)
2764-
.Case("flat_scratch", AMDGPU::FLAT_SCR)
2765-
.Case("xnack_mask", AMDGPU::XNACK_MASK)
2766-
.Case("shared_base", AMDGPU::SRC_SHARED_BASE)
2767-
.Case("src_shared_base", AMDGPU::SRC_SHARED_BASE)
2768-
.Case("shared_limit", AMDGPU::SRC_SHARED_LIMIT)
2769-
.Case("src_shared_limit", AMDGPU::SRC_SHARED_LIMIT)
2770-
.Case("private_base", AMDGPU::SRC_PRIVATE_BASE)
2771-
.Case("src_private_base", AMDGPU::SRC_PRIVATE_BASE)
2772-
.Case("private_limit", AMDGPU::SRC_PRIVATE_LIMIT)
2773-
.Case("src_private_limit", AMDGPU::SRC_PRIVATE_LIMIT)
2774-
.Case("pops_exiting_wave_id", AMDGPU::SRC_POPS_EXITING_WAVE_ID)
2775-
.Case("src_pops_exiting_wave_id", AMDGPU::SRC_POPS_EXITING_WAVE_ID)
2776-
.Case("lds_direct", AMDGPU::LDS_DIRECT)
2777-
.Case("src_lds_direct", AMDGPU::LDS_DIRECT)
2778-
.Case("m0", AMDGPU::M0)
2779-
.Case("vccz", AMDGPU::SRC_VCCZ)
2780-
.Case("src_vccz", AMDGPU::SRC_VCCZ)
2781-
.Case("execz", AMDGPU::SRC_EXECZ)
2782-
.Case("src_execz", AMDGPU::SRC_EXECZ)
2783-
.Case("scc", AMDGPU::SRC_SCC)
2784-
.Case("src_scc", AMDGPU::SRC_SCC)
2785-
.Case("tba", AMDGPU::TBA)
2786-
.Case("tma", AMDGPU::TMA)
2787-
.Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
2788-
.Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
2789-
.Case("xnack_mask_lo", AMDGPU::XNACK_MASK_LO)
2790-
.Case("xnack_mask_hi", AMDGPU::XNACK_MASK_HI)
2791-
.Case("vcc_lo", AMDGPU::VCC_LO)
2792-
.Case("vcc_hi", AMDGPU::VCC_HI)
2793-
.Case("exec_lo", AMDGPU::EXEC_LO)
2794-
.Case("exec_hi", AMDGPU::EXEC_HI)
2795-
.Case("tma_lo", AMDGPU::TMA_LO)
2796-
.Case("tma_hi", AMDGPU::TMA_HI)
2797-
.Case("tba_lo", AMDGPU::TBA_LO)
2798-
.Case("tba_hi", AMDGPU::TBA_HI)
2799-
.Case("pc", AMDGPU::PC_REG)
2800-
.Case("null", AMDGPU::SGPR_NULL)
2801-
.Default(AMDGPU::NoRegister);
2766+
.Case("exec", AMDGPU::EXEC)
2767+
.Case("vcc", AMDGPU::VCC)
2768+
.Case("flat_scratch", AMDGPU::FLAT_SCR)
2769+
.Case("xnack_mask", AMDGPU::XNACK_MASK)
2770+
.Case("shared_base", AMDGPU::SRC_SHARED_BASE)
2771+
.Case("src_shared_base", AMDGPU::SRC_SHARED_BASE)
2772+
.Case("shared_limit", AMDGPU::SRC_SHARED_LIMIT)
2773+
.Case("src_shared_limit", AMDGPU::SRC_SHARED_LIMIT)
2774+
.Case("private_base", AMDGPU::SRC_PRIVATE_BASE)
2775+
.Case("src_private_base", AMDGPU::SRC_PRIVATE_BASE)
2776+
.Case("private_limit", AMDGPU::SRC_PRIVATE_LIMIT)
2777+
.Case("src_private_limit", AMDGPU::SRC_PRIVATE_LIMIT)
2778+
.Case("src_flat_scratch_base_lo", AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)
2779+
.Case("src_flat_scratch_base_hi", AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)
2780+
.Case("pops_exiting_wave_id", AMDGPU::SRC_POPS_EXITING_WAVE_ID)
2781+
.Case("src_pops_exiting_wave_id", AMDGPU::SRC_POPS_EXITING_WAVE_ID)
2782+
.Case("lds_direct", AMDGPU::LDS_DIRECT)
2783+
.Case("src_lds_direct", AMDGPU::LDS_DIRECT)
2784+
.Case("m0", AMDGPU::M0)
2785+
.Case("vccz", AMDGPU::SRC_VCCZ)
2786+
.Case("src_vccz", AMDGPU::SRC_VCCZ)
2787+
.Case("execz", AMDGPU::SRC_EXECZ)
2788+
.Case("src_execz", AMDGPU::SRC_EXECZ)
2789+
.Case("scc", AMDGPU::SRC_SCC)
2790+
.Case("src_scc", AMDGPU::SRC_SCC)
2791+
.Case("tba", AMDGPU::TBA)
2792+
.Case("tma", AMDGPU::TMA)
2793+
.Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
2794+
.Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
2795+
.Case("xnack_mask_lo", AMDGPU::XNACK_MASK_LO)
2796+
.Case("xnack_mask_hi", AMDGPU::XNACK_MASK_HI)
2797+
.Case("vcc_lo", AMDGPU::VCC_LO)
2798+
.Case("vcc_hi", AMDGPU::VCC_HI)
2799+
.Case("exec_lo", AMDGPU::EXEC_LO)
2800+
.Case("exec_hi", AMDGPU::EXEC_HI)
2801+
.Case("tma_lo", AMDGPU::TMA_LO)
2802+
.Case("tma_hi", AMDGPU::TMA_HI)
2803+
.Case("tba_lo", AMDGPU::TBA_LO)
2804+
.Case("tba_hi", AMDGPU::TBA_HI)
2805+
.Case("pc", AMDGPU::PC_REG)
2806+
.Case("null", AMDGPU::SGPR_NULL)
2807+
.Default(AMDGPU::NoRegister);
28022808
}
28032809

28042810
bool AMDGPUAsmParser::ParseRegister(MCRegister &RegNo, SMLoc &StartLoc,
@@ -6744,6 +6750,9 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
67446750
case SRC_PRIVATE_LIMIT_LO:
67456751
case SRC_PRIVATE_LIMIT:
67466752
return isGFX9Plus();
6753+
case SRC_FLAT_SCRATCH_BASE_LO:
6754+
case SRC_FLAT_SCRATCH_BASE_HI:
6755+
return hasGloballyAddressableScratch();
67476756
case SRC_POPS_EXITING_WAVE_ID:
67486757
return isGFX9Plus() && !isGFX11Plus();
67496758
case TBA:

llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1914,6 +1914,8 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
19141914
return isGFX11Plus() ? createRegOperand(M0) : createRegOperand(SGPR_NULL);
19151915
case 126: return createRegOperand(EXEC_LO);
19161916
case 127: return createRegOperand(EXEC_HI);
1917+
case 230: return createRegOperand(SRC_FLAT_SCRATCH_BASE_LO);
1918+
case 231: return createRegOperand(SRC_FLAT_SCRATCH_BASE_HI);
19171919
case 235: return createRegOperand(SRC_SHARED_BASE_LO);
19181920
case 236: return createRegOperand(SRC_SHARED_LIMIT_LO);
19191921
case 237: return createRegOperand(SRC_PRIVATE_BASE_LO);
@@ -1947,6 +1949,7 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
19471949
return createRegOperand(SGPR_NULL);
19481950
break;
19491951
case 126: return createRegOperand(EXEC);
1952+
case 230: return createRegOperand(SRC_FLAT_SCRATCH_BASE_LO);
19501953
case 235: return createRegOperand(SRC_SHARED_BASE);
19511954
case 236: return createRegOperand(SRC_SHARED_LIMIT);
19521955
case 237: return createRegOperand(SRC_PRIVATE_BASE);

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -281,6 +281,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
281281

282282
bool RequiresCOV6 = false;
283283
bool UseBlockVGPROpsForCSR = false;
284+
bool HasGloballyAddressableScratch = false;
284285

285286
// Dummy feature to use for assembler in tablegen.
286287
bool FeatureDisable = false;
@@ -1325,6 +1326,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
13251326

13261327
bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }
13271328

1329+
bool hasGloballyAddressableScratch() const {
1330+
return HasGloballyAddressableScratch;
1331+
}
1332+
13281333
bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
13291334

13301335
bool hasVALUReadSGPRHazard() const { return GFX12Insts && !GFX1250Insts; }

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -598,6 +598,8 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
598598
reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
599599
reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
600600
reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
601+
reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_LO);
602+
reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_HI);
601603

602604
// Reserve async counters pseudo registers
603605
reserveRegisterTuples(Reserved, AMDGPU::ASYNCcnt);

llvm/lib/Target/AMDGPU/SIRegisterInfo.td

Lines changed: 19 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -246,6 +246,22 @@ defm SRC_SHARED_LIMIT : ApertureRegister<"src_shared_limit", 236>;
246246
defm SRC_PRIVATE_BASE : ApertureRegister<"src_private_base", 237>;
247247
defm SRC_PRIVATE_LIMIT : ApertureRegister<"src_private_limit", 238>;
248248

249+
let isConstant = true in {
250+
defm SRC_FLAT_SCRATCH_BASE_LO : SIRegLoHi16<"src_flat_scratch_base_lo", 230>;
251+
defm SRC_FLAT_SCRATCH_BASE_HI : SIRegLoHi16<"src_flat_scratch_base_hi", 231>;
252+
253+
// Using src_flat_scratch_base_lo in a 64-bit context gets the full 64-bit
254+
// hi:lo value.
255+
def SRC_FLAT_SCRATCH_BASE :
256+
RegisterWithSubRegs<"src_flat_scratch_base_lo",
257+
[SRC_FLAT_SCRATCH_BASE_LO,
258+
SRC_FLAT_SCRATCH_BASE_HI]> {
259+
let Namespace = "AMDGPU";
260+
let SubRegIndices = [sub0, sub1];
261+
let HWEncoding = SRC_FLAT_SCRATCH_BASE_LO.HWEncoding;
262+
}
263+
}
264+
249265
defm SRC_POPS_EXITING_WAVE_ID : SIRegLoHi16<"src_pops_exiting_wave_id", 239>;
250266

251267
// Not addressable
@@ -765,7 +781,7 @@ def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i
765781
SGPR_NULL, SGPR_NULL_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE_LO,
766782
SRC_SHARED_LIMIT_LO, SRC_PRIVATE_BASE_LO, SRC_PRIVATE_LIMIT_LO, SRC_SHARED_BASE_HI,
767783
SRC_SHARED_LIMIT_HI, SRC_PRIVATE_BASE_HI, SRC_PRIVATE_LIMIT_HI, SRC_POPS_EXITING_WAVE_ID,
768-
SRC_VCCZ, SRC_EXECZ, SRC_SCC)> {
784+
SRC_VCCZ, SRC_EXECZ, SRC_SCC, SRC_FLAT_SCRATCH_BASE_LO, SRC_FLAT_SCRATCH_BASE_HI)> {
769785
let AllocationPriority = 0;
770786
}
771787

@@ -776,7 +792,8 @@ def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16,
776792
SRC_SHARED_LIMIT_LO_LO16, SRC_PRIVATE_BASE_LO_LO16, SRC_PRIVATE_LIMIT_LO_LO16,
777793
SRC_SHARED_BASE_HI_LO16, SRC_SHARED_LIMIT_HI_LO16, SRC_PRIVATE_BASE_HI_LO16,
778794
SRC_PRIVATE_LIMIT_HI_LO16, SRC_POPS_EXITING_WAVE_ID_LO16, SRC_VCCZ_LO16,
779-
SRC_EXECZ_LO16, SRC_SCC_LO16, EXEC_LO_LO16, EXEC_HI_LO16, M0_CLASS_LO16)> {
795+
SRC_EXECZ_LO16, SRC_SCC_LO16, EXEC_LO_LO16, EXEC_HI_LO16, M0_CLASS_LO16,
796+
SRC_FLAT_SCRATCH_BASE_LO_LO16, SRC_FLAT_SCRATCH_BASE_HI_LO16)> {
780797
let Size = 16;
781798
let isAllocatable = 0;
782799
let BaseClassOrder = 16;

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2654,6 +2654,8 @@ bool isInlineValue(unsigned Reg) {
26542654
case AMDGPU::SRC_PRIVATE_BASE:
26552655
case AMDGPU::SRC_PRIVATE_LIMIT_LO:
26562656
case AMDGPU::SRC_PRIVATE_LIMIT:
2657+
case AMDGPU::SRC_FLAT_SCRATCH_BASE_LO:
2658+
case AMDGPU::SRC_FLAT_SCRATCH_BASE_HI:
26572659
case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
26582660
return true;
26592661
case AMDGPU::SRC_VCCZ:
Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,29 @@
1+
// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefixes=GFX1200-ERR %s
2+
// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | FileCheck --check-prefix=GFX1250 %s
3+
4+
s_mov_b32 s0, src_flat_scratch_base_lo
5+
// GFX1200-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: src_flat_scratch_base_lo register not available on this GPU
6+
// GFX1250: encoding: [0xe6,0x00,0x80,0xbe]
7+
8+
s_mov_b32 s0, src_flat_scratch_base_hi
9+
// GFX1200-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: src_flat_scratch_base_hi register not available on this GPU
10+
// GFX1250: encoding: [0xe7,0x00,0x80,0xbe]
11+
12+
s_mov_b64 s[0:1], src_flat_scratch_base_lo
13+
// GFX1200-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: src_flat_scratch_base_lo register not available on this GPU
14+
// GFX1250: encoding: [0xe6,0x01,0x80,0xbe]
15+
16+
s_mov_b64 s[0:1], shared_base
17+
// GFX1250: encoding: [0xeb,0x01,0x80,0xbe]
18+
19+
s_mov_b64 s[0:1], src_shared_base
20+
// GFX1250: encoding: [0xeb,0x01,0x80,0xbe]
21+
22+
s_mov_b64 s[0:1], shared_limit
23+
// GFX1250: encoding: [0xec,0x01,0x80,0xbe]
24+
25+
s_mov_b64 s[0:1], src_shared_limit
26+
// GFX1250: encoding: [0xec,0x01,0x80,0xbe]
27+
28+
s_getreg_b32 s1, hwreg(33)
29+
// GFX1250: encoding: [0x21,0xf8,0x81,0xb8]
Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,22 @@
1+
# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX1250 %s
2+
3+
# GFX1250: s_mov_b32 s0, src_flat_scratch_base_lo ; encoding: [0xe6,0x00,0x80,0xbe]
4+
0xe6,0x00,0x80,0xbe
5+
6+
# GFX1250: s_mov_b32 s0, src_flat_scratch_base_hi ; encoding: [0xe7,0x00,0x80,0xbe]
7+
0xe7,0x00,0x80,0xbe
8+
9+
# GFX1250: s_mov_b64 s[0:1], src_flat_scratch_base_lo ; encoding: [0xe6,0x01,0x80,0xbe]
10+
0xe6,0x01,0x80,0xbe
11+
12+
# GFX1250: s_mov_b64 s[0:1], src_shared_base ; encoding: [0xeb,0x01,0x80,0xbe]
13+
0xeb,0x01,0x80,0xbe
14+
15+
# GFX1250: s_mov_b64 s[0:1], src_shared_base ; encoding: [0xeb,0x01,0x80,0xbe]
16+
0xeb,0x01,0x80,0xbe
17+
18+
# GFX1250: s_mov_b64 s[0:1], src_shared_limit ; encoding: [0xec,0x01,0x80,0xbe]
19+
0xec,0x01,0x80,0xbe
20+
21+
# GFX1250: s_mov_b64 s[0:1], src_shared_limit ; encoding: [0xec,0x01,0x80,0xbe]
22+
0xec,0x01,0x80,0xbe

0 commit comments

Comments
 (0)