Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -1365,6 +1365,13 @@ def FeatureXF32Insts : SubtargetFeature<"xf32-insts",
"v_mfma_f32_16x16x8_xf32 and v_mfma_f32_32x32x4_xf32"
>;

def FeatureGloballyAddressableScratch : SubtargetFeature<
"globally-addressable-scratch",
"HasGloballyAddressableScratch",
"true",
"FLAT instructions can access scratch memory for any thread in any wave"
>;

// FIXME: Remove after all users are migrated to attribute.
def FeatureDynamicVGPR : SubtargetFeature <"dynamic-vgpr",
"DynamicVGPR",
Expand Down Expand Up @@ -2055,6 +2062,7 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureAtomicFMinFMaxF64FlatInsts,
FeatureFlatBufferGlobalAtomicFaddF64Inst,
FeatureMemoryAtomicFAddF32DenormalSupport,
FeatureGloballyAddressableScratch,
FeatureKernargPreload,
FeatureVmemPrefInsts,
FeatureLshlAddU64Inst,
Expand Down
89 changes: 49 additions & 40 deletions llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1620,6 +1620,10 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
return getFeatureBits()[AMDGPU::FeaturePartialNSAEncoding];
}

bool hasGloballyAddressableScratch() const {
return getFeatureBits()[AMDGPU::FeatureGloballyAddressableScratch];
}

unsigned getNSAMaxSize(bool HasSampler = false) const {
return AMDGPU::getNSAMaxSize(getSTI(), HasSampler);
}
Expand Down Expand Up @@ -2759,46 +2763,48 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {

static MCRegister getSpecialRegForName(StringRef RegName) {
return StringSwitch<unsigned>(RegName)
.Case("exec", AMDGPU::EXEC)
.Case("vcc", AMDGPU::VCC)
.Case("flat_scratch", AMDGPU::FLAT_SCR)
.Case("xnack_mask", AMDGPU::XNACK_MASK)
.Case("shared_base", AMDGPU::SRC_SHARED_BASE)
.Case("src_shared_base", AMDGPU::SRC_SHARED_BASE)
.Case("shared_limit", AMDGPU::SRC_SHARED_LIMIT)
.Case("src_shared_limit", AMDGPU::SRC_SHARED_LIMIT)
.Case("private_base", AMDGPU::SRC_PRIVATE_BASE)
.Case("src_private_base", AMDGPU::SRC_PRIVATE_BASE)
.Case("private_limit", AMDGPU::SRC_PRIVATE_LIMIT)
.Case("src_private_limit", AMDGPU::SRC_PRIVATE_LIMIT)
.Case("pops_exiting_wave_id", AMDGPU::SRC_POPS_EXITING_WAVE_ID)
.Case("src_pops_exiting_wave_id", AMDGPU::SRC_POPS_EXITING_WAVE_ID)
.Case("lds_direct", AMDGPU::LDS_DIRECT)
.Case("src_lds_direct", AMDGPU::LDS_DIRECT)
.Case("m0", AMDGPU::M0)
.Case("vccz", AMDGPU::SRC_VCCZ)
.Case("src_vccz", AMDGPU::SRC_VCCZ)
.Case("execz", AMDGPU::SRC_EXECZ)
.Case("src_execz", AMDGPU::SRC_EXECZ)
.Case("scc", AMDGPU::SRC_SCC)
.Case("src_scc", AMDGPU::SRC_SCC)
.Case("tba", AMDGPU::TBA)
.Case("tma", AMDGPU::TMA)
.Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
.Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
.Case("xnack_mask_lo", AMDGPU::XNACK_MASK_LO)
.Case("xnack_mask_hi", AMDGPU::XNACK_MASK_HI)
.Case("vcc_lo", AMDGPU::VCC_LO)
.Case("vcc_hi", AMDGPU::VCC_HI)
.Case("exec_lo", AMDGPU::EXEC_LO)
.Case("exec_hi", AMDGPU::EXEC_HI)
.Case("tma_lo", AMDGPU::TMA_LO)
.Case("tma_hi", AMDGPU::TMA_HI)
.Case("tba_lo", AMDGPU::TBA_LO)
.Case("tba_hi", AMDGPU::TBA_HI)
.Case("pc", AMDGPU::PC_REG)
.Case("null", AMDGPU::SGPR_NULL)
.Default(AMDGPU::NoRegister);
.Case("exec", AMDGPU::EXEC)
.Case("vcc", AMDGPU::VCC)
.Case("flat_scratch", AMDGPU::FLAT_SCR)
.Case("xnack_mask", AMDGPU::XNACK_MASK)
.Case("shared_base", AMDGPU::SRC_SHARED_BASE)
.Case("src_shared_base", AMDGPU::SRC_SHARED_BASE)
.Case("shared_limit", AMDGPU::SRC_SHARED_LIMIT)
.Case("src_shared_limit", AMDGPU::SRC_SHARED_LIMIT)
.Case("private_base", AMDGPU::SRC_PRIVATE_BASE)
.Case("src_private_base", AMDGPU::SRC_PRIVATE_BASE)
.Case("private_limit", AMDGPU::SRC_PRIVATE_LIMIT)
.Case("src_private_limit", AMDGPU::SRC_PRIVATE_LIMIT)
.Case("src_flat_scratch_base_lo", AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)
.Case("src_flat_scratch_base_hi", AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)
.Case("pops_exiting_wave_id", AMDGPU::SRC_POPS_EXITING_WAVE_ID)
.Case("src_pops_exiting_wave_id", AMDGPU::SRC_POPS_EXITING_WAVE_ID)
.Case("lds_direct", AMDGPU::LDS_DIRECT)
.Case("src_lds_direct", AMDGPU::LDS_DIRECT)
.Case("m0", AMDGPU::M0)
.Case("vccz", AMDGPU::SRC_VCCZ)
.Case("src_vccz", AMDGPU::SRC_VCCZ)
.Case("execz", AMDGPU::SRC_EXECZ)
.Case("src_execz", AMDGPU::SRC_EXECZ)
.Case("scc", AMDGPU::SRC_SCC)
.Case("src_scc", AMDGPU::SRC_SCC)
.Case("tba", AMDGPU::TBA)
.Case("tma", AMDGPU::TMA)
.Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
.Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
.Case("xnack_mask_lo", AMDGPU::XNACK_MASK_LO)
.Case("xnack_mask_hi", AMDGPU::XNACK_MASK_HI)
.Case("vcc_lo", AMDGPU::VCC_LO)
.Case("vcc_hi", AMDGPU::VCC_HI)
.Case("exec_lo", AMDGPU::EXEC_LO)
.Case("exec_hi", AMDGPU::EXEC_HI)
.Case("tma_lo", AMDGPU::TMA_LO)
.Case("tma_hi", AMDGPU::TMA_HI)
.Case("tba_lo", AMDGPU::TBA_LO)
.Case("tba_hi", AMDGPU::TBA_HI)
.Case("pc", AMDGPU::PC_REG)
.Case("null", AMDGPU::SGPR_NULL)
.Default(AMDGPU::NoRegister);
}

bool AMDGPUAsmParser::ParseRegister(MCRegister &RegNo, SMLoc &StartLoc,
Expand Down Expand Up @@ -6744,6 +6750,9 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
case SRC_PRIVATE_LIMIT_LO:
case SRC_PRIVATE_LIMIT:
return isGFX9Plus();
case SRC_FLAT_SCRATCH_BASE_LO:
case SRC_FLAT_SCRATCH_BASE_HI:
return hasGloballyAddressableScratch();
case SRC_POPS_EXITING_WAVE_ID:
return isGFX9Plus() && !isGFX11Plus();
case TBA:
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1914,6 +1914,8 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
return isGFX11Plus() ? createRegOperand(M0) : createRegOperand(SGPR_NULL);
case 126: return createRegOperand(EXEC_LO);
case 127: return createRegOperand(EXEC_HI);
case 230: return createRegOperand(SRC_FLAT_SCRATCH_BASE_LO);
case 231: return createRegOperand(SRC_FLAT_SCRATCH_BASE_HI);
case 235: return createRegOperand(SRC_SHARED_BASE_LO);
case 236: return createRegOperand(SRC_SHARED_LIMIT_LO);
case 237: return createRegOperand(SRC_PRIVATE_BASE_LO);
Expand Down Expand Up @@ -1947,6 +1949,7 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
return createRegOperand(SGPR_NULL);
break;
case 126: return createRegOperand(EXEC);
case 230: return createRegOperand(SRC_FLAT_SCRATCH_BASE_LO);
case 235: return createRegOperand(SRC_SHARED_BASE);
case 236: return createRegOperand(SRC_SHARED_LIMIT);
case 237: return createRegOperand(SRC_PRIVATE_BASE);
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,

bool RequiresCOV6 = false;
bool UseBlockVGPROpsForCSR = false;
bool HasGloballyAddressableScratch = false;

// Dummy feature to use for assembler in tablegen.
bool FeatureDisable = false;
Expand Down Expand Up @@ -1325,6 +1326,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,

bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }

bool hasGloballyAddressableScratch() const {
return HasGloballyAddressableScratch;
}

bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }

bool hasVALUReadSGPRHazard() const { return GFX12Insts && !GFX1250Insts; }
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -598,6 +598,8 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_LO);
reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_HI);

// Reserve async counters pseudo registers
reserveRegisterTuples(Reserved, AMDGPU::ASYNCcnt);
Expand Down
21 changes: 19 additions & 2 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,22 @@ defm SRC_SHARED_LIMIT : ApertureRegister<"src_shared_limit", 236>;
defm SRC_PRIVATE_BASE : ApertureRegister<"src_private_base", 237>;
defm SRC_PRIVATE_LIMIT : ApertureRegister<"src_private_limit", 238>;

let isConstant = true in {
defm SRC_FLAT_SCRATCH_BASE_LO : SIRegLoHi16<"src_flat_scratch_base_lo", 230>;
defm SRC_FLAT_SCRATCH_BASE_HI : SIRegLoHi16<"src_flat_scratch_base_hi", 231>;

// Using src_flat_scratch_base_lo in a 64-bit context gets the full 64-bit
// hi:lo value.
def SRC_FLAT_SCRATCH_BASE :
RegisterWithSubRegs<"src_flat_scratch_base_lo",
[SRC_FLAT_SCRATCH_BASE_LO,
SRC_FLAT_SCRATCH_BASE_HI]> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
let HWEncoding = SRC_FLAT_SCRATCH_BASE_LO.HWEncoding;
}
}

defm SRC_POPS_EXITING_WAVE_ID : SIRegLoHi16<"src_pops_exiting_wave_id", 239>;

// Not addressable
Expand Down Expand Up @@ -765,7 +781,7 @@ def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i
SGPR_NULL, SGPR_NULL_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE_LO,
SRC_SHARED_LIMIT_LO, SRC_PRIVATE_BASE_LO, SRC_PRIVATE_LIMIT_LO, SRC_SHARED_BASE_HI,
SRC_SHARED_LIMIT_HI, SRC_PRIVATE_BASE_HI, SRC_PRIVATE_LIMIT_HI, SRC_POPS_EXITING_WAVE_ID,
SRC_VCCZ, SRC_EXECZ, SRC_SCC)> {
SRC_VCCZ, SRC_EXECZ, SRC_SCC, SRC_FLAT_SCRATCH_BASE_LO, SRC_FLAT_SCRATCH_BASE_HI)> {
let AllocationPriority = 0;
}

Expand All @@ -776,7 +792,8 @@ def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16,
SRC_SHARED_LIMIT_LO_LO16, SRC_PRIVATE_BASE_LO_LO16, SRC_PRIVATE_LIMIT_LO_LO16,
SRC_SHARED_BASE_HI_LO16, SRC_SHARED_LIMIT_HI_LO16, SRC_PRIVATE_BASE_HI_LO16,
SRC_PRIVATE_LIMIT_HI_LO16, SRC_POPS_EXITING_WAVE_ID_LO16, SRC_VCCZ_LO16,
SRC_EXECZ_LO16, SRC_SCC_LO16, EXEC_LO_LO16, EXEC_HI_LO16, M0_CLASS_LO16)> {
SRC_EXECZ_LO16, SRC_SCC_LO16, EXEC_LO_LO16, EXEC_HI_LO16, M0_CLASS_LO16,
SRC_FLAT_SCRATCH_BASE_LO_LO16, SRC_FLAT_SCRATCH_BASE_HI_LO16)> {
let Size = 16;
let isAllocatable = 0;
let BaseClassOrder = 16;
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2654,6 +2654,8 @@ bool isInlineValue(unsigned Reg) {
case AMDGPU::SRC_PRIVATE_BASE:
case AMDGPU::SRC_PRIVATE_LIMIT_LO:
case AMDGPU::SRC_PRIVATE_LIMIT:
case AMDGPU::SRC_FLAT_SCRATCH_BASE_LO:
case AMDGPU::SRC_FLAT_SCRATCH_BASE_HI:
case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
return true;
case AMDGPU::SRC_VCCZ:
Expand Down
29 changes: 29 additions & 0 deletions llvm/test/MC/AMDGPU/gfx1250_asm_operands.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefixes=GFX1200-ERR %s
// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | FileCheck --check-prefix=GFX1250 %s

s_mov_b32 s0, src_flat_scratch_base_lo
// GFX1200-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: src_flat_scratch_base_lo register not available on this GPU
// GFX1250: encoding: [0xe6,0x00,0x80,0xbe]

s_mov_b32 s0, src_flat_scratch_base_hi
// GFX1200-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: src_flat_scratch_base_hi register not available on this GPU
// GFX1250: encoding: [0xe7,0x00,0x80,0xbe]

s_mov_b64 s[0:1], src_flat_scratch_base_lo
// GFX1200-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: src_flat_scratch_base_lo register not available on this GPU
// GFX1250: encoding: [0xe6,0x01,0x80,0xbe]

s_mov_b64 s[0:1], shared_base
// GFX1250: encoding: [0xeb,0x01,0x80,0xbe]

s_mov_b64 s[0:1], src_shared_base
// GFX1250: encoding: [0xeb,0x01,0x80,0xbe]

s_mov_b64 s[0:1], shared_limit
// GFX1250: encoding: [0xec,0x01,0x80,0xbe]

s_mov_b64 s[0:1], src_shared_limit
// GFX1250: encoding: [0xec,0x01,0x80,0xbe]

s_getreg_b32 s1, hwreg(33)
// GFX1250: encoding: [0x21,0xf8,0x81,0xb8]
22 changes: 22 additions & 0 deletions llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_operands.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX1250 %s

# GFX1250: s_mov_b32 s0, src_flat_scratch_base_lo ; encoding: [0xe6,0x00,0x80,0xbe]
0xe6,0x00,0x80,0xbe

# GFX1250: s_mov_b32 s0, src_flat_scratch_base_hi ; encoding: [0xe7,0x00,0x80,0xbe]
0xe7,0x00,0x80,0xbe

# GFX1250: s_mov_b64 s[0:1], src_flat_scratch_base_lo ; encoding: [0xe6,0x01,0x80,0xbe]
0xe6,0x01,0x80,0xbe

# GFX1250: s_mov_b64 s[0:1], src_shared_base ; encoding: [0xeb,0x01,0x80,0xbe]
0xeb,0x01,0x80,0xbe

# GFX1250: s_mov_b64 s[0:1], src_shared_base ; encoding: [0xeb,0x01,0x80,0xbe]
0xeb,0x01,0x80,0xbe

# GFX1250: s_mov_b64 s[0:1], src_shared_limit ; encoding: [0xec,0x01,0x80,0xbe]
0xec,0x01,0x80,0xbe

# GFX1250: s_mov_b64 s[0:1], src_shared_limit ; encoding: [0xec,0x01,0x80,0xbe]
0xec,0x01,0x80,0xbe
Loading