diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index b7fd131e76056..402147abd8891 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -5521,11 +5521,18 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root, Register PtrBase; int64_t ConstOffset; - std::tie(PtrBase, ConstOffset) = + bool IsInBounds; + std::tie(PtrBase, ConstOffset, IsInBounds) = getPtrBaseWithConstantOffset(Root.getReg(), *MRI); - if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch && - !isFlatScratchBaseLegal(Root.getReg()))) + // Adding the offset to the base address with an immediate in a FLAT + // instruction must not change the memory aperture in which the address falls. + // Therefore we can only fold offsets from inbounds GEPs into FLAT + // instructions. + if (ConstOffset == 0 || + (FlatVariant == SIInstrFlags::FlatScratch && + !isFlatScratchBaseLegal(Root.getReg())) || + (FlatVariant == SIInstrFlags::FLAT && !IsInBounds)) return Default; unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace(); @@ -5577,7 +5584,8 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root, // Match the immediate offset first, which canonically is moved as low as // possible. - std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); + std::tie(PtrBase, ConstOffset, std::ignore) = + getPtrBaseWithConstantOffset(Addr, *MRI); if (ConstOffset != 0) { if (NeedIOffset && @@ -5760,7 +5768,8 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { // Match the immediate offset first, which canonically is moved as low as // possible. 
- std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); + std::tie(PtrBase, ConstOffset, std::ignore) = + getPtrBaseWithConstantOffset(Addr, *MRI); if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) && TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, @@ -5836,7 +5845,8 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const { // Match the immediate offset first, which canonically is moved as low as // possible. - std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); + std::tie(PtrBase, ConstOffset, std::ignore) = + getPtrBaseWithConstantOffset(Addr, *MRI); Register OrigAddr = Addr; if (ConstOffset != 0 && @@ -5942,7 +5952,8 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); Register PtrBase; int64_t ConstOffset; - std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI); + std::tie(PtrBase, ConstOffset, std::ignore) = + getPtrBaseWithConstantOffset(VAddr, *MRI); if (ConstOffset != 0) { if (TII.isLegalMUBUFImmOffset(ConstOffset) && (!STI.privateMemoryResourceIsRangeChecked() || @@ -6181,8 +6192,8 @@ AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const Register PtrBase; int64_t Offset; - std::tie(PtrBase, Offset) = - getPtrBaseWithConstantOffset(Root.getReg(), *MRI); + std::tie(PtrBase, Offset, std::ignore) = + getPtrBaseWithConstantOffset(Root.getReg(), *MRI); if (Offset) { if (isDSOffsetLegal(PtrBase, Offset)) { @@ -6243,8 +6254,8 @@ AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root, Register PtrBase; int64_t Offset; - std::tie(PtrBase, Offset) = - getPtrBaseWithConstantOffset(Root.getReg(), *MRI); + std::tie(PtrBase, Offset, std::ignore) = + getPtrBaseWithConstantOffset(Root.getReg(), *MRI); if (Offset) { int64_t OffsetValue0 = Offset; @@ -6265,22 +6276,25 @@ AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand 
&Root, } /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return -/// the base value with the constant offset. There may be intervening copies -/// between \p Root and the identified constant. Returns \p Root, 0 if this does -/// not match the pattern. -std::pair<Register, int64_t> +/// the base value with the constant offset, and whether the offset computation is +/// known to be inbounds. There may be intervening copies between \p Root and +/// the identified constant. Returns \p Root, 0, false if this does not match +/// the pattern. +std::tuple<Register, int64_t, bool> AMDGPUInstructionSelector::getPtrBaseWithConstantOffset( - Register Root, const MachineRegisterInfo &MRI) const { + Register Root, const MachineRegisterInfo &MRI) const { MachineInstr *RootI = getDefIgnoringCopies(Root, MRI); if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD) - return {Root, 0}; + return {Root, 0, false}; MachineOperand &RHS = RootI->getOperand(2); std::optional<ValueAndVReg> MaybeOffset = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI); if (!MaybeOffset) - return {Root, 0}; - return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()}; + return {Root, 0, false}; + bool IsInBounds = RootI->getFlag(MachineInstr::MIFlag::InBounds); + return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue(), + IsInBounds}; } static void addZeroImm(MachineInstrBuilder &MIB) { @@ -6358,7 +6372,8 @@ AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const { Register PtrBase; int64_t Offset; - std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI); + std::tie(PtrBase, Offset, std::ignore) = + getPtrBaseWithConstantOffset(Src, *MRI); if (isUInt<32>(Offset)) { Data.N0 = PtrBase; Data.Offset = Offset; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index c9da419846ee5..31f5ba1dd5040 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -295,7 +295,7 @@
class AMDGPUInstructionSelector final : public InstructionSelector { InstructionSelector::ComplexRendererFns selectDSReadWrite2(MachineOperand &Root, unsigned size) const; - std::pair<Register, int64_t> + std::tuple<Register, int64_t, bool> getPtrBaseWithConstantOffset(Register Root, const MachineRegisterInfo &MRI) const; diff --git a/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll b/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll index 3e2680f55832d..6bb68e1e26a14 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll @@ -1,12 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-MUBUF %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-FLATSCR %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-MUBUF %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-FLATSCR %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-SDAG,GFX90A-MUBUF,GFX90A-SDAG-MUBUF %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-SDAG,GFX90A-FLATSCR,GFX90A-SDAG-FLATSCR %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-SDAG,GFX10-MUBUF,GFX10-SDAG-MUBUF %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+enable-flat-scratch < %s | FileCheck
--check-prefixes=GFX10,GFX10-SDAG,GFX10-FLATSCR,GFX10-SDAG-FLATSCR %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11,GFX11-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-SDAG %s + +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-GISEL,GFX90A-MUBUF,GFX90A-GISEL-MUBUF %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-GISEL,GFX90A-FLATSCR,GFX90A-GISEL-FLATSCR %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-GISEL,GFX10-MUBUF,GFX10-GISEL-MUBUF %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-GISEL,GFX10-FLATSCR,GFX10-GISEL-FLATSCR %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-GISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11,GFX11-GISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-GISEL %s ; This test checks memory addresses with constant offset components that should ; not be folded into memory accesses with immediate offsets. @@ -19,67 +27,146 @@ ; FIXME the offset here should not be folded: if %p points to the beginning of ; scratch or LDS and %i is -1, a folded offset crashes the program. 
define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) { -; GFX90A-LABEL: flat_offset_maybe_oob: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX90A-NEXT: flat_load_dword v0, v[0:1] offset:12 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: flat_offset_maybe_oob: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo -; GFX10-NEXT: flat_load_dword v0, v[0:1] offset:12 -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: flat_offset_maybe_oob: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1] -; GFX942-NEXT: flat_load_dword v0, v[0:1] offset:12 -; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX90A-SDAG-LABEL: flat_offset_maybe_oob: +; GFX90A-SDAG: ; %bb.0: +; GFX90A-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX90A-SDAG-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX90A-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX90A-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX90A-SDAG-NEXT: flat_load_dword v0, v[0:1] offset:12 +; GFX90A-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_offset_maybe_oob: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo -; GFX11-NEXT: flat_load_b32 v0, v[0:1] offset:12 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX10-SDAG-LABEL: flat_offset_maybe_oob: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX10-SDAG-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX10-SDAG-NEXT: flat_load_dword v0, v[0:1] offset:12 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-SDAG-LABEL: flat_offset_maybe_oob: +; GFX942-SDAG: ; %bb.0: +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1] +; GFX942-SDAG-NEXT: flat_load_dword v0, v[0:1] offset:12 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX12-LABEL: flat_offset_maybe_oob: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] -; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo -; GFX12-NEXT: flat_load_b32 v0, v[0:1] offset:12 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: 
s_setpc_b64 s[30:31] +; GFX11-SDAG-LABEL: flat_offset_maybe_oob: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX11-SDAG-NEXT: flat_load_b32 v0, v[0:1] offset:12 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: flat_offset_maybe_oob: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] +; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX12-SDAG-NEXT: flat_load_b32 v0, v[0:1] offset:12 +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-GISEL-LABEL: flat_offset_maybe_oob: +; GFX90A-GISEL: ; %bb.0: +; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX90A-GISEL-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 12, v0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-GISEL-NEXT: flat_load_dword v0, v[0:1] +; 
GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: flat_offset_maybe_oob: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX10-GISEL-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX10-GISEL-NEXT: flat_load_dword v0, v[0:1] +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-GISEL-LABEL: flat_offset_maybe_oob: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX942-GISEL-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 12, v0 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX942-GISEL-NEXT: flat_load_dword v0, v[0:1] +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: flat_offset_maybe_oob: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12 +; GFX11-GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-GISEL-NEXT: flat_load_b32 v0, v[0:1] +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: flat_offset_maybe_oob: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] +; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-GISEL-NEXT: flat_load_b32 v0, v[0:1] +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %idx = add nsw i32 %i, 3 %arrayidx = getelementptr inbounds i32, ptr %p, i32 %idx %l = load i32, ptr %arrayidx @@ -88,13 +175,13 @@ define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) { ; For MUBUF and for GFX12, folding the offset is okay. 
define i32 @private_offset_maybe_oob(ptr addrspace(5) %p, i32 %i) { -; GFX90A-MUBUF-LABEL: private_offset_maybe_oob: -; GFX90A-MUBUF: ; %bb.0: -; GFX90A-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-MUBUF-NEXT: v_lshl_add_u32 v0, v1, 2, v0 -; GFX90A-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:12 -; GFX90A-MUBUF-NEXT: s_waitcnt vmcnt(0) -; GFX90A-MUBUF-NEXT: s_setpc_b64 s[30:31] +; GFX90A-SDAG-MUBUF-LABEL: private_offset_maybe_oob: +; GFX90A-SDAG-MUBUF: ; %bb.0: +; GFX90A-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-SDAG-MUBUF-NEXT: v_lshl_add_u32 v0, v1, 2, v0 +; GFX90A-SDAG-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:12 +; GFX90A-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX90A-SDAG-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-FLATSCR-LABEL: private_offset_maybe_oob: ; GFX90A-FLATSCR: ; %bb.0: @@ -105,13 +192,13 @@ define i32 @private_offset_maybe_oob(ptr addrspace(5) %p, i32 %i) { ; GFX90A-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX90A-FLATSCR-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-MUBUF-LABEL: private_offset_maybe_oob: -; GFX10-MUBUF: ; %bb.0: -; GFX10-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-MUBUF-NEXT: v_lshl_add_u32 v0, v1, 2, v0 -; GFX10-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:12 -; GFX10-MUBUF-NEXT: s_waitcnt vmcnt(0) -; GFX10-MUBUF-NEXT: s_setpc_b64 s[30:31] +; GFX10-SDAG-MUBUF-LABEL: private_offset_maybe_oob: +; GFX10-SDAG-MUBUF: ; %bb.0: +; GFX10-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-MUBUF-NEXT: v_lshl_add_u32 v0, v1, 2, v0 +; GFX10-SDAG-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:12 +; GFX10-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-FLATSCR-LABEL: private_offset_maybe_oob: ; GFX10-FLATSCR: ; %bb.0: @@ -141,19 +228,61 @@ define i32 @private_offset_maybe_oob(ptr addrspace(5) %p, i32 %i) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] 
; -; GFX12-LABEL: private_offset_maybe_oob: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshl_add_u32 v0, v1, 2, v0 -; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:12 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-SDAG-LABEL: private_offset_maybe_oob: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_lshl_add_u32 v0, v1, 2, v0 +; GFX12-SDAG-NEXT: scratch_load_b32 v0, v0, off offset:12 +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-GISEL-MUBUF-LABEL: private_offset_maybe_oob: +; GFX90A-GISEL-MUBUF: ; %bb.0: +; GFX90A-GISEL-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-GISEL-MUBUF-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX90A-GISEL-MUBUF-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX90A-GISEL-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:12 +; GFX90A-GISEL-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX90A-GISEL-MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-MUBUF-LABEL: private_offset_maybe_oob: +; GFX10-GISEL-MUBUF: ; %bb.0: +; GFX10-GISEL-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-MUBUF-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX10-GISEL-MUBUF-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-GISEL-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:12 +; GFX10-GISEL-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: private_offset_maybe_oob: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; 
GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX12-GISEL-NEXT: scratch_load_b32 v0, v0, off offset:12 +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %idx = add nsw i32 %i, 3 %arrayidx = getelementptr inbounds i32, ptr addrspace(5) %p, i32 %idx %l = load i32, ptr addrspace(5) %arrayidx ret i32 %l } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX10: {{.*}} +; GFX10-GISEL-FLATSCR: {{.*}} +; GFX10-MUBUF: {{.*}} +; GFX10-SDAG-FLATSCR: {{.*}} +; GFX12: {{.*}} +; GFX90A: {{.*}} +; GFX90A-GISEL-FLATSCR: {{.*}} +; GFX90A-MUBUF: {{.*}} +; GFX90A-SDAG-FLATSCR: {{.*}}