diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 18f3c4761748a..d84f512f4976d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1365,6 +1365,13 @@ def FeatureXF32Insts : SubtargetFeature<"xf32-insts", "v_mfma_f32_16x16x8_xf32 and v_mfma_f32_32x32x4_xf32" >; +def FeatureGloballyAddressableScratch : SubtargetFeature< + "globally-addressable-scratch", + "HasGloballyAddressableScratch", + "true", + "FLAT instructions can access scratch memory for any thread in any wave" +>; + // FIXME: Remove after all users are migrated to attribute. def FeatureDynamicVGPR : SubtargetFeature <"dynamic-vgpr", "DynamicVGPR", @@ -2055,6 +2062,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureAtomicFMinFMaxF64FlatInsts, FeatureFlatBufferGlobalAtomicFaddF64Inst, FeatureMemoryAtomicFAddF32DenormalSupport, + FeatureGloballyAddressableScratch, FeatureKernargPreload, FeatureVmemPrefInsts, FeatureLshlAddU64Inst, diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index d33765db9cc7d..ff8efd2debc21 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1620,6 +1620,10 @@ class AMDGPUAsmParser : public MCTargetAsmParser { return getFeatureBits()[AMDGPU::FeaturePartialNSAEncoding]; } + bool hasGloballyAddressableScratch() const { + return getFeatureBits()[AMDGPU::FeatureGloballyAddressableScratch]; + } + unsigned getNSAMaxSize(bool HasSampler = false) const { return AMDGPU::getNSAMaxSize(getSTI(), HasSampler); } @@ -2759,46 +2763,48 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) { static MCRegister getSpecialRegForName(StringRef RegName) { return StringSwitch(RegName) - .Case("exec", AMDGPU::EXEC) - .Case("vcc", AMDGPU::VCC) - .Case("flat_scratch", AMDGPU::FLAT_SCR) - .Case("xnack_mask", AMDGPU::XNACK_MASK) - .Case("shared_base", AMDGPU::SRC_SHARED_BASE) - .Case("src_shared_base", AMDGPU::SRC_SHARED_BASE) - .Case("shared_limit", AMDGPU::SRC_SHARED_LIMIT) - .Case("src_shared_limit", AMDGPU::SRC_SHARED_LIMIT) - .Case("private_base", AMDGPU::SRC_PRIVATE_BASE) - .Case("src_private_base", AMDGPU::SRC_PRIVATE_BASE) - .Case("private_limit", AMDGPU::SRC_PRIVATE_LIMIT) - .Case("src_private_limit", AMDGPU::SRC_PRIVATE_LIMIT) - .Case("pops_exiting_wave_id", AMDGPU::SRC_POPS_EXITING_WAVE_ID) - .Case("src_pops_exiting_wave_id", AMDGPU::SRC_POPS_EXITING_WAVE_ID) - .Case("lds_direct", AMDGPU::LDS_DIRECT) - .Case("src_lds_direct", AMDGPU::LDS_DIRECT) - .Case("m0", AMDGPU::M0) - .Case("vccz", AMDGPU::SRC_VCCZ) - .Case("src_vccz", AMDGPU::SRC_VCCZ) - .Case("execz", AMDGPU::SRC_EXECZ) - .Case("src_execz", AMDGPU::SRC_EXECZ) - .Case("scc", AMDGPU::SRC_SCC) - .Case("src_scc", AMDGPU::SRC_SCC) - .Case("tba", AMDGPU::TBA) - .Case("tma", AMDGPU::TMA) - .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO) - .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI) - .Case("xnack_mask_lo", AMDGPU::XNACK_MASK_LO) - .Case("xnack_mask_hi", AMDGPU::XNACK_MASK_HI) - .Case("vcc_lo", AMDGPU::VCC_LO) - .Case("vcc_hi", AMDGPU::VCC_HI) - .Case("exec_lo", AMDGPU::EXEC_LO) - .Case("exec_hi", AMDGPU::EXEC_HI) - .Case("tma_lo", AMDGPU::TMA_LO) - .Case("tma_hi", AMDGPU::TMA_HI) - .Case("tba_lo", AMDGPU::TBA_LO) - .Case("tba_hi", AMDGPU::TBA_HI) - .Case("pc", AMDGPU::PC_REG) - .Case("null", AMDGPU::SGPR_NULL) - .Default(AMDGPU::NoRegister); + .Case("exec", AMDGPU::EXEC) + .Case("vcc", AMDGPU::VCC) + .Case("flat_scratch", AMDGPU::FLAT_SCR) + .Case("xnack_mask", AMDGPU::XNACK_MASK) + .Case("shared_base", AMDGPU::SRC_SHARED_BASE) + .Case("src_shared_base", AMDGPU::SRC_SHARED_BASE) + .Case("shared_limit", AMDGPU::SRC_SHARED_LIMIT) + .Case("src_shared_limit", AMDGPU::SRC_SHARED_LIMIT) + .Case("private_base", AMDGPU::SRC_PRIVATE_BASE) + .Case("src_private_base", AMDGPU::SRC_PRIVATE_BASE) + .Case("private_limit", AMDGPU::SRC_PRIVATE_LIMIT) + .Case("src_private_limit", AMDGPU::SRC_PRIVATE_LIMIT) + .Case("src_flat_scratch_base_lo", AMDGPU::SRC_FLAT_SCRATCH_BASE_LO) + .Case("src_flat_scratch_base_hi", AMDGPU::SRC_FLAT_SCRATCH_BASE_HI) + .Case("pops_exiting_wave_id", AMDGPU::SRC_POPS_EXITING_WAVE_ID) + .Case("src_pops_exiting_wave_id", AMDGPU::SRC_POPS_EXITING_WAVE_ID) + .Case("lds_direct", AMDGPU::LDS_DIRECT) + .Case("src_lds_direct", AMDGPU::LDS_DIRECT) + .Case("m0", AMDGPU::M0) + .Case("vccz", AMDGPU::SRC_VCCZ) + .Case("src_vccz", AMDGPU::SRC_VCCZ) + .Case("execz", AMDGPU::SRC_EXECZ) + .Case("src_execz", AMDGPU::SRC_EXECZ) + .Case("scc", AMDGPU::SRC_SCC) + .Case("src_scc", AMDGPU::SRC_SCC) + .Case("tba", AMDGPU::TBA) + .Case("tma", AMDGPU::TMA) + .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO) + .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI) + .Case("xnack_mask_lo", AMDGPU::XNACK_MASK_LO) + .Case("xnack_mask_hi", AMDGPU::XNACK_MASK_HI) + .Case("vcc_lo", AMDGPU::VCC_LO) + .Case("vcc_hi", AMDGPU::VCC_HI) + .Case("exec_lo", AMDGPU::EXEC_LO) + .Case("exec_hi", AMDGPU::EXEC_HI) + .Case("tma_lo", AMDGPU::TMA_LO) + .Case("tma_hi", AMDGPU::TMA_HI) + .Case("tba_lo", AMDGPU::TBA_LO) + .Case("tba_hi", AMDGPU::TBA_HI) + .Case("pc", AMDGPU::PC_REG) + .Case("null", AMDGPU::SGPR_NULL) + .Default(AMDGPU::NoRegister); } bool AMDGPUAsmParser::ParseRegister(MCRegister &RegNo, SMLoc &StartLoc, @@ -6744,6 +6750,9 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, case SRC_PRIVATE_LIMIT_LO: case SRC_PRIVATE_LIMIT: return isGFX9Plus(); + case SRC_FLAT_SCRATCH_BASE_LO: + case SRC_FLAT_SCRATCH_BASE_HI: + return hasGloballyAddressableScratch(); case SRC_POPS_EXITING_WAVE_ID: return isGFX9Plus() && !isGFX11Plus(); case TBA: diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index fef0d7eb45a8c..fb7d634e62272 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -1914,6 +1914,8 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const { return isGFX11Plus() ? createRegOperand(M0) : createRegOperand(SGPR_NULL); case 126: return createRegOperand(EXEC_LO); case 127: return createRegOperand(EXEC_HI); + case 230: return createRegOperand(SRC_FLAT_SCRATCH_BASE_LO); + case 231: return createRegOperand(SRC_FLAT_SCRATCH_BASE_HI); case 235: return createRegOperand(SRC_SHARED_BASE_LO); case 236: return createRegOperand(SRC_SHARED_LIMIT_LO); case 237: return createRegOperand(SRC_PRIVATE_BASE_LO); @@ -1947,6 +1949,7 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const { return createRegOperand(SGPR_NULL); break; case 126: return createRegOperand(EXEC); + case 230: return createRegOperand(SRC_FLAT_SCRATCH_BASE_LO); case 235: return createRegOperand(SRC_SHARED_BASE); case 236: return createRegOperand(SRC_SHARED_LIMIT); case 237: return createRegOperand(SRC_PRIVATE_BASE); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index c84ba1a0a9d47..5530886831cae 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -281,6 +281,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool RequiresCOV6 = false; bool UseBlockVGPROpsForCSR = false; + bool HasGloballyAddressableScratch = false; // Dummy feature to use for assembler in tablegen. bool FeatureDisable = false; @@ -1325,6 +1326,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; } + bool hasGloballyAddressableScratch() const { + return HasGloballyAddressableScratch; + } + bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; } bool hasVALUReadSGPRHazard() const { return GFX12Insts && !GFX1250Insts; } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index f3acc5c2ea159..ae0f304ea3041 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -598,6 +598,8 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT); reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE); reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT); + reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_LO); + reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_HI); // Reserve async counters pseudo registers reserveRegisterTuples(Reserved, AMDGPU::ASYNCcnt); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 08d07c927e4c4..ed6b973580502 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -246,6 +246,22 @@ defm SRC_SHARED_LIMIT : ApertureRegister<"src_shared_limit", 236>; defm SRC_PRIVATE_BASE : ApertureRegister<"src_private_base", 237>; defm SRC_PRIVATE_LIMIT : ApertureRegister<"src_private_limit", 238>; +let isConstant = true in { + defm SRC_FLAT_SCRATCH_BASE_LO : SIRegLoHi16<"src_flat_scratch_base_lo", 230>; + defm SRC_FLAT_SCRATCH_BASE_HI : SIRegLoHi16<"src_flat_scratch_base_hi", 231>; + + // Using src_flat_scratch_base_lo in a 64-bit context gets the full 64-bit + // hi:lo value. + def SRC_FLAT_SCRATCH_BASE : + RegisterWithSubRegs<"src_flat_scratch_base_lo", + [SRC_FLAT_SCRATCH_BASE_LO, + SRC_FLAT_SCRATCH_BASE_HI]> { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = SRC_FLAT_SCRATCH_BASE_LO.HWEncoding; + } +} + defm SRC_POPS_EXITING_WAVE_ID : SIRegLoHi16<"src_pops_exiting_wave_id", 239>; // Not addressable @@ -765,7 +781,7 @@ def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i SGPR_NULL, SGPR_NULL_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE_LO, SRC_SHARED_LIMIT_LO, SRC_PRIVATE_BASE_LO, SRC_PRIVATE_LIMIT_LO, SRC_SHARED_BASE_HI, SRC_SHARED_LIMIT_HI, SRC_PRIVATE_BASE_HI, SRC_PRIVATE_LIMIT_HI, SRC_POPS_EXITING_WAVE_ID, - SRC_VCCZ, SRC_EXECZ, SRC_SCC)> { + SRC_VCCZ, SRC_EXECZ, SRC_SCC, SRC_FLAT_SCRATCH_BASE_LO, SRC_FLAT_SCRATCH_BASE_HI)> { let AllocationPriority = 0; } @@ -776,7 +792,8 @@ def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16, SRC_SHARED_LIMIT_LO_LO16, SRC_PRIVATE_BASE_LO_LO16, SRC_PRIVATE_LIMIT_LO_LO16, SRC_SHARED_BASE_HI_LO16, SRC_SHARED_LIMIT_HI_LO16, SRC_PRIVATE_BASE_HI_LO16, SRC_PRIVATE_LIMIT_HI_LO16, SRC_POPS_EXITING_WAVE_ID_LO16, SRC_VCCZ_LO16, - SRC_EXECZ_LO16, SRC_SCC_LO16, EXEC_LO_LO16, EXEC_HI_LO16, M0_CLASS_LO16)> { + SRC_EXECZ_LO16, SRC_SCC_LO16, EXEC_LO_LO16, EXEC_HI_LO16, M0_CLASS_LO16, + SRC_FLAT_SCRATCH_BASE_LO_LO16, SRC_FLAT_SCRATCH_BASE_HI_LO16)> { let Size = 16; let isAllocatable = 0; let BaseClassOrder = 16; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 65fa0884b11c9..00dcb9b52d4bd 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -2654,6 +2654,8 @@ bool isInlineValue(unsigned Reg) { case AMDGPU::SRC_PRIVATE_BASE: case AMDGPU::SRC_PRIVATE_LIMIT_LO: case AMDGPU::SRC_PRIVATE_LIMIT: + case AMDGPU::SRC_FLAT_SCRATCH_BASE_LO: + case AMDGPU::SRC_FLAT_SCRATCH_BASE_HI: case AMDGPU::SRC_POPS_EXITING_WAVE_ID: return true; case AMDGPU::SRC_VCCZ: diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_operands.s b/llvm/test/MC/AMDGPU/gfx1250_asm_operands.s new file mode 100644 index 0000000000000..8b7465b5df574 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_operands.s @@ -0,0 +1,29 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefixes=GFX1200-ERR %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | FileCheck --check-prefix=GFX1250 %s + +s_mov_b32 s0, src_flat_scratch_base_lo +// GFX1200-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: src_flat_scratch_base_lo register not available on this GPU +// GFX1250: encoding: [0xe6,0x00,0x80,0xbe] + +s_mov_b32 s0, src_flat_scratch_base_hi +// GFX1200-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: src_flat_scratch_base_hi register not available on this GPU +// GFX1250: encoding: [0xe7,0x00,0x80,0xbe] + +s_mov_b64 s[0:1], src_flat_scratch_base_lo +// GFX1200-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: src_flat_scratch_base_lo register not available on this GPU +// GFX1250: encoding: [0xe6,0x01,0x80,0xbe] + +s_mov_b64 s[0:1], shared_base +// GFX1250: encoding: [0xeb,0x01,0x80,0xbe] + +s_mov_b64 s[0:1], src_shared_base +// GFX1250: encoding: [0xeb,0x01,0x80,0xbe] + +s_mov_b64 s[0:1], shared_limit +// GFX1250: encoding: [0xec,0x01,0x80,0xbe] + +s_mov_b64 s[0:1], src_shared_limit +// GFX1250: encoding: [0xec,0x01,0x80,0xbe] + +s_getreg_b32 s1, hwreg(33) +// GFX1250: encoding: [0x21,0xf8,0x81,0xb8] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_operands.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_operands.txt new file mode 100644 index 0000000000000..a3e7e570ab9b6 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_operands.txt @@ -0,0 +1,22 @@ +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX1250 %s + +# GFX1250: s_mov_b32 s0, src_flat_scratch_base_lo ; encoding: [0xe6,0x00,0x80,0xbe] +0xe6,0x00,0x80,0xbe + +# GFX1250: s_mov_b32 s0, src_flat_scratch_base_hi ; encoding: [0xe7,0x00,0x80,0xbe] +0xe7,0x00,0x80,0xbe + +# GFX1250: s_mov_b64 s[0:1], src_flat_scratch_base_lo ; encoding: [0xe6,0x01,0x80,0xbe] +0xe6,0x01,0x80,0xbe + +# GFX1250: s_mov_b64 s[0:1], src_shared_base ; encoding: [0xeb,0x01,0x80,0xbe] +0xeb,0x01,0x80,0xbe + +# GFX1250: s_mov_b64 s[0:1], src_shared_base ; encoding: [0xeb,0x01,0x80,0xbe] +0xeb,0x01,0x80,0xbe + +# GFX1250: s_mov_b64 s[0:1], src_shared_limit ; encoding: [0xec,0x01,0x80,0xbe] +0xec,0x01,0x80,0xbe + +# GFX1250: s_mov_b64 s[0:1], src_shared_limit ; encoding: [0xec,0x01,0x80,0xbe] +0xec,0x01,0x80,0xbe