Skip to content

Commit 0e3862b

Browse files
committed
AMDGPU: Always use AV spill pseudos on targets with AGPRs
This increases allocator freedom to inflate register classes to the AV class; we don't need to introduce a new restriction by basing the opcode on the current virtual register class. Ideally we would avoid this when the function has no allocatable AGPRs, but it probably doesn't make much difference in the end result if they are excluded from the final allocation order.
1 parent d7b09e7 commit 0e3862b

7 files changed

+149
-267
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 18 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -1642,48 +1642,6 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size, bool NeedsCFI) {
16421642
}
16431643
}
16441644

1645-
static unsigned getAGPRSpillSaveOpcode(unsigned Size, bool NeedsCFI) {
1646-
switch (Size) {
1647-
case 4:
1648-
return NeedsCFI ? AMDGPU::SI_SPILL_A32_CFI_SAVE : AMDGPU::SI_SPILL_A32_SAVE;
1649-
case 8:
1650-
return NeedsCFI ? AMDGPU::SI_SPILL_A64_CFI_SAVE : AMDGPU::SI_SPILL_A64_SAVE;
1651-
case 12:
1652-
return NeedsCFI ? AMDGPU::SI_SPILL_A96_CFI_SAVE : AMDGPU::SI_SPILL_A96_SAVE;
1653-
case 16:
1654-
return NeedsCFI ? AMDGPU::SI_SPILL_A128_CFI_SAVE
1655-
: AMDGPU::SI_SPILL_A128_SAVE;
1656-
case 20:
1657-
return NeedsCFI ? AMDGPU::SI_SPILL_A160_CFI_SAVE
1658-
: AMDGPU::SI_SPILL_A160_SAVE;
1659-
case 24:
1660-
return NeedsCFI ? AMDGPU::SI_SPILL_A192_CFI_SAVE
1661-
: AMDGPU::SI_SPILL_A192_SAVE;
1662-
case 28:
1663-
return NeedsCFI ? AMDGPU::SI_SPILL_A224_CFI_SAVE
1664-
: AMDGPU::SI_SPILL_A224_SAVE;
1665-
case 32:
1666-
return NeedsCFI ? AMDGPU::SI_SPILL_A256_CFI_SAVE
1667-
: AMDGPU::SI_SPILL_A256_SAVE;
1668-
case 36:
1669-
return AMDGPU::SI_SPILL_A288_SAVE;
1670-
case 40:
1671-
return AMDGPU::SI_SPILL_A320_SAVE;
1672-
case 44:
1673-
return AMDGPU::SI_SPILL_A352_SAVE;
1674-
case 48:
1675-
return AMDGPU::SI_SPILL_A384_SAVE;
1676-
case 64:
1677-
return NeedsCFI ? AMDGPU::SI_SPILL_A512_CFI_SAVE
1678-
: AMDGPU::SI_SPILL_A512_SAVE;
1679-
case 128:
1680-
return NeedsCFI ? AMDGPU::SI_SPILL_A1024_CFI_SAVE
1681-
: AMDGPU::SI_SPILL_A1024_SAVE;
1682-
default:
1683-
llvm_unreachable("unknown register size");
1684-
}
1685-
}
1686-
16871645
static unsigned getAVSpillSaveOpcode(unsigned Size, bool NeedsCFI) {
16881646
switch (Size) {
16891647
case 4:
@@ -1738,23 +1696,20 @@ static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
17381696
return AMDGPU::SI_SPILL_WWM_V32_SAVE;
17391697
}
17401698

1741-
static unsigned getVectorRegSpillSaveOpcode(Register Reg,
1742-
const TargetRegisterClass *RC,
1743-
unsigned Size,
1744-
const SIRegisterInfo &TRI,
1745-
const SIMachineFunctionInfo &MFI,
1746-
bool NeedsCFI) {
1747-
bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1699+
unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
1700+
Register Reg, const TargetRegisterClass *RC, unsigned Size,
1701+
const SIMachineFunctionInfo &MFI, bool NeedsCFI) const {
1702+
bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
17481703

17491704
// Choose the right opcode if spilling a WWM register.
17501705
if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
17511706
return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
17521707

1753-
if (IsVectorSuperClass)
1708+
// TODO: Check if AGPRs are available
1709+
if (ST.hasMAIInsts())
17541710
return getAVSpillSaveOpcode(Size, NeedsCFI);
17551711

1756-
return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size, NeedsCFI)
1757-
: getVGPRSpillSaveOpcode(Size, NeedsCFI);
1712+
return getVGPRSpillSaveOpcode(Size, NeedsCFI);
17581713
}
17591714

17601715
void SIInstrInfo::storeRegToStackSlotImpl(
@@ -1804,7 +1759,7 @@ void SIInstrInfo::storeRegToStackSlotImpl(
18041759
}
18051760

18061761
unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
1807-
SpillSize, RI, *MFI, NeedsCFI);
1762+
SpillSize, *MFI, NeedsCFI);
18081763
MFI->setHasSpilledVGPRs();
18091764

18101765
BuildMI(MBB, MI, DL, get(Opcode))
@@ -1906,41 +1861,6 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
19061861
}
19071862
}
19081863

1909-
static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
1910-
switch (Size) {
1911-
case 4:
1912-
return AMDGPU::SI_SPILL_A32_RESTORE;
1913-
case 8:
1914-
return AMDGPU::SI_SPILL_A64_RESTORE;
1915-
case 12:
1916-
return AMDGPU::SI_SPILL_A96_RESTORE;
1917-
case 16:
1918-
return AMDGPU::SI_SPILL_A128_RESTORE;
1919-
case 20:
1920-
return AMDGPU::SI_SPILL_A160_RESTORE;
1921-
case 24:
1922-
return AMDGPU::SI_SPILL_A192_RESTORE;
1923-
case 28:
1924-
return AMDGPU::SI_SPILL_A224_RESTORE;
1925-
case 32:
1926-
return AMDGPU::SI_SPILL_A256_RESTORE;
1927-
case 36:
1928-
return AMDGPU::SI_SPILL_A288_RESTORE;
1929-
case 40:
1930-
return AMDGPU::SI_SPILL_A320_RESTORE;
1931-
case 44:
1932-
return AMDGPU::SI_SPILL_A352_RESTORE;
1933-
case 48:
1934-
return AMDGPU::SI_SPILL_A384_RESTORE;
1935-
case 64:
1936-
return AMDGPU::SI_SPILL_A512_RESTORE;
1937-
case 128:
1938-
return AMDGPU::SI_SPILL_A1024_RESTORE;
1939-
default:
1940-
llvm_unreachable("unknown register size");
1941-
}
1942-
}
1943-
19441864
static unsigned getAVSpillRestoreOpcode(unsigned Size) {
19451865
switch (Size) {
19461866
case 4:
@@ -1982,27 +1902,27 @@ static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
19821902
if (Size != 4)
19831903
llvm_unreachable("unknown wwm register spill size");
19841904

1985-
if (IsVectorSuperClass)
1905+
if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
19861906
return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
19871907

19881908
return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
19891909
}
19901910

1991-
static unsigned
1992-
getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
1993-
unsigned Size, const SIRegisterInfo &TRI,
1994-
const SIMachineFunctionInfo &MFI) {
1995-
bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1911+
unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode(
1912+
Register Reg, const TargetRegisterClass *RC, unsigned Size,
1913+
const SIMachineFunctionInfo &MFI) const {
1914+
bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
19961915

19971916
// Choose the right opcode if restoring a WWM register.
19981917
if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
19991918
return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
20001919

2001-
if (IsVectorSuperClass)
1920+
// TODO: Check if AGPRs are available
1921+
if (ST.hasMAIInsts())
20021922
return getAVSpillRestoreOpcode(Size);
20031923

2004-
return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size)
2005-
: getVGPRSpillRestoreOpcode(Size);
1924+
assert(!RI.isAGPRClass(RC));
1925+
return getVGPRSpillRestoreOpcode(Size);
20061926
}
20071927

20081928
void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
@@ -2050,7 +1970,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
20501970
}
20511971

20521972
unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
2053-
SpillSize, RI, *MFI);
1973+
SpillSize, *MFI);
20541974
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
20551975
.addFrameIndex(FrameIndex) // vaddr
20561976
.addReg(MFI->getStackPtrOffsetReg()) // scratch_offset

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ class LiveVariables;
3333
class MachineDominatorTree;
3434
class MachineRegisterInfo;
3535
class RegScavenger;
36+
class SIMachineFunctionInfo;
3637
class TargetRegisterClass;
3738
class ScheduleHazardRecognizer;
3839

@@ -298,10 +299,20 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
298299
bool isKill, int FrameIndex,
299300
const TargetRegisterClass *RC,
300301
const TargetRegisterInfo *TRI) const;
301-
302+
302303
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg,
303304
int64_t &ImmVal) const override;
304305

306+
unsigned getVectorRegSpillSaveOpcode(Register Reg,
307+
const TargetRegisterClass *RC,
308+
unsigned Size,
309+
const SIMachineFunctionInfo &MFI,
310+
bool NeedsCFI) const;
311+
unsigned
312+
getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
313+
unsigned Size,
314+
const SIMachineFunctionInfo &MFI) const;
315+
305316
void storeRegToStackSlot(
306317
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
307318
bool isKill, int FrameIndex, const TargetRegisterClass *RC,

llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll

Lines changed: 31 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -444,14 +444,6 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
444444
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen
445445
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v62, s[8:11], 0 offen offset:16
446446
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:32
447-
; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1
448-
; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1
449-
; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
450-
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
451-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a0, v13 ; Reload Reuse
452-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a1, v12 ; Reload Reuse
453-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a2, v11 ; Reload Reuse
454-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a3, v10 ; Reload Reuse
455447
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v62, s[8:11], 0 offen offset:48
456448
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v62, s[8:11], 0 offen offset:64
457449
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v62, s[8:11], 0 offen offset:80
@@ -464,20 +456,15 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
464456
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v62, s[8:11], 0 offen offset:192
465457
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v62, s[8:11], 0 offen offset:208
466458
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v62, s[8:11], 0 offen offset:224
467-
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:240
468-
; GISEL-GFX942-NEXT: s_nop 0
459+
; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v62, s[8:11], 0 offen offset:240
460+
; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1
461+
; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1
462+
; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
463+
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
464+
; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill
469465
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen
470466
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16
471-
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(2)
472-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a4, v13 ; Reload Reuse
473-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
474-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse
475-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse
476-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse
477-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a5, v12 ; Reload Reuse
478-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a6, v11 ; Reload Reuse
479-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a7, v10 ; Reload Reuse
480-
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:32
467+
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32
481468
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48
482469
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64
483470
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80
@@ -490,10 +477,8 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
490477
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192
491478
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208
492479
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224
493-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a4 ; Reload Reuse
494-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a5 ; Reload Reuse
495-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a6 ; Reload Reuse
496-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a7 ; Reload Reuse
480+
; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload
481+
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
497482
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240
498483
; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB0_1
499484
; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split
@@ -822,14 +807,6 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
822807
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v1, s[4:7], 0 offen
823808
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v1, s[4:7], 0 offen offset:16
824809
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[4:7], 0 offen offset:32
825-
; SDAG-GFX942-NEXT: v_add_u32_e32 v62, s8, v0
826-
; SDAG-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
827-
; SDAG-GFX942-NEXT: s_and_b64 vcc, exec, vcc
828-
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
829-
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a0, v13 ; Reload Reuse
830-
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a1, v12 ; Reload Reuse
831-
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a2, v11 ; Reload Reuse
832-
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a3, v10 ; Reload Reuse
833810
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v1, s[4:7], 0 offen offset:48
834811
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v1, s[4:7], 0 offen offset:64
835812
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v1, s[4:7], 0 offen offset:80
@@ -842,20 +819,16 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
842819
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v1, s[4:7], 0 offen offset:192
843820
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[4:7], 0 offen offset:208
844821
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[4:7], 0 offen offset:224
845-
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[4:7], 0 offen offset:240
846-
; SDAG-GFX942-NEXT: s_nop 0
822+
; SDAG-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v1, s[4:7], 0 offen offset:240
823+
; SDAG-GFX942-NEXT: v_add_u32_e32 v62, s8, v0
824+
; SDAG-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
825+
; SDAG-GFX942-NEXT: s_and_b64 vcc, exec, vcc
826+
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
827+
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v63, a3 ; Reload Reuse
828+
; SDAG-GFX942-NEXT: scratch_store_dwordx3 off, a[0:2], off ; 12-byte Folded Spill
847829
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen
848830
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[12:15], 0 offen offset:16
849-
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(2)
850-
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a4, v13 ; Reload Reuse
851-
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
852-
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse
853-
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse
854-
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse
855-
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a5, v12 ; Reload Reuse
856-
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a6, v11 ; Reload Reuse
857-
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a7, v10 ; Reload Reuse
858-
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen offset:32
831+
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v62, s[12:15], 0 offen offset:32
859832
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[12:15], 0 offen offset:48
860833
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[12:15], 0 offen offset:64
861834
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[12:15], 0 offen offset:80
@@ -868,10 +841,8 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
868841
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[12:15], 0 offen offset:192
869842
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[12:15], 0 offen offset:208
870843
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[12:15], 0 offen offset:224
871-
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a4 ; Reload Reuse
872-
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a5 ; Reload Reuse
873-
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a6 ; Reload Reuse
874-
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a7 ; Reload Reuse
844+
; SDAG-GFX942-NEXT: scratch_load_dwordx3 v[2:4], off, off ; 12-byte Folded Reload
845+
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
875846
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen offset:240
876847
; SDAG-GFX942-NEXT: s_cbranch_vccnz .LBB1_1
877848
; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split
@@ -993,16 +964,6 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
993964
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v1, s[8:11], 0 offen
994965
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v1, s[8:11], 0 offen offset:16
995966
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[8:11], 0 offen offset:32
996-
; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s12, v0
997-
; GISEL-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
998-
; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], vcc, -1
999-
; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], s[2:3], -1
1000-
; GISEL-GFX942-NEXT: s_and_b64 vcc, s[2:3], exec
1001-
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
1002-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a0, v13 ; Reload Reuse
1003-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a1, v12 ; Reload Reuse
1004-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a2, v11 ; Reload Reuse
1005-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a3, v10 ; Reload Reuse
1006967
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v1, s[8:11], 0 offen offset:48
1007968
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v1, s[8:11], 0 offen offset:64
1008969
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v1, s[8:11], 0 offen offset:80
@@ -1015,20 +976,18 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
1015976
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v1, s[8:11], 0 offen offset:192
1016977
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[8:11], 0 offen offset:208
1017978
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[8:11], 0 offen offset:224
1018-
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[8:11], 0 offen offset:240
1019-
; GISEL-GFX942-NEXT: s_nop 0
979+
; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v1, s[8:11], 0 offen offset:240
980+
; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s12, v0
981+
; GISEL-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
982+
; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], vcc, -1
983+
; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], s[2:3], -1
984+
; GISEL-GFX942-NEXT: s_and_b64 vcc, s[2:3], exec
985+
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
986+
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v63, a3 ; Reload Reuse
987+
; GISEL-GFX942-NEXT: scratch_store_dwordx3 off, a[0:2], off ; 12-byte Folded Spill
1020988
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen
1021989
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[4:7], 0 offen offset:16
1022-
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(2)
1023-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a4, v13 ; Reload Reuse
1024-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
1025-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse
1026-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse
1027-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse
1028-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a5, v12 ; Reload Reuse
1029-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a6, v11 ; Reload Reuse
1030-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a7, v10 ; Reload Reuse
1031-
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen offset:32
990+
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v62, s[4:7], 0 offen offset:32
1032991
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[4:7], 0 offen offset:48
1033992
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[4:7], 0 offen offset:64
1034993
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[4:7], 0 offen offset:80
@@ -1041,10 +1000,8 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
10411000
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[4:7], 0 offen offset:192
10421001
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[4:7], 0 offen offset:208
10431002
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[4:7], 0 offen offset:224
1044-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a4 ; Reload Reuse
1045-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a5 ; Reload Reuse
1046-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a6 ; Reload Reuse
1047-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a7 ; Reload Reuse
1003+
; GISEL-GFX942-NEXT: scratch_load_dwordx3 v[2:4], off, off ; 12-byte Folded Reload
1004+
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
10481005
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen offset:240
10491006
; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB1_1
10501007
; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split

0 commit comments

Comments
 (0)