Skip to content

Commit b5ecec4

Browse files
authored
AMDGPU: Always use AV spill pseudos on targets with AGPRs (llvm#3125)
2 parents a6fb574 + 0e3862b commit b5ecec4

7 files changed, with 149 additions and 267 deletions.

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 18 additions & 98 deletions
Original file line number | Diff line number | Diff line change
@@ -1642,48 +1642,6 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size, bool NeedsCFI) {
16421642
}
16431643
}
16441644

1645-
static unsigned getAGPRSpillSaveOpcode(unsigned Size, bool NeedsCFI) {
1646-
switch (Size) {
1647-
case 4:
1648-
return NeedsCFI ? AMDGPU::SI_SPILL_A32_CFI_SAVE : AMDGPU::SI_SPILL_A32_SAVE;
1649-
case 8:
1650-
return NeedsCFI ? AMDGPU::SI_SPILL_A64_CFI_SAVE : AMDGPU::SI_SPILL_A64_SAVE;
1651-
case 12:
1652-
return NeedsCFI ? AMDGPU::SI_SPILL_A96_CFI_SAVE : AMDGPU::SI_SPILL_A96_SAVE;
1653-
case 16:
1654-
return NeedsCFI ? AMDGPU::SI_SPILL_A128_CFI_SAVE
1655-
: AMDGPU::SI_SPILL_A128_SAVE;
1656-
case 20:
1657-
return NeedsCFI ? AMDGPU::SI_SPILL_A160_CFI_SAVE
1658-
: AMDGPU::SI_SPILL_A160_SAVE;
1659-
case 24:
1660-
return NeedsCFI ? AMDGPU::SI_SPILL_A192_CFI_SAVE
1661-
: AMDGPU::SI_SPILL_A192_SAVE;
1662-
case 28:
1663-
return NeedsCFI ? AMDGPU::SI_SPILL_A224_CFI_SAVE
1664-
: AMDGPU::SI_SPILL_A224_SAVE;
1665-
case 32:
1666-
return NeedsCFI ? AMDGPU::SI_SPILL_A256_CFI_SAVE
1667-
: AMDGPU::SI_SPILL_A256_SAVE;
1668-
case 36:
1669-
return AMDGPU::SI_SPILL_A288_SAVE;
1670-
case 40:
1671-
return AMDGPU::SI_SPILL_A320_SAVE;
1672-
case 44:
1673-
return AMDGPU::SI_SPILL_A352_SAVE;
1674-
case 48:
1675-
return AMDGPU::SI_SPILL_A384_SAVE;
1676-
case 64:
1677-
return NeedsCFI ? AMDGPU::SI_SPILL_A512_CFI_SAVE
1678-
: AMDGPU::SI_SPILL_A512_SAVE;
1679-
case 128:
1680-
return NeedsCFI ? AMDGPU::SI_SPILL_A1024_CFI_SAVE
1681-
: AMDGPU::SI_SPILL_A1024_SAVE;
1682-
default:
1683-
llvm_unreachable("unknown register size");
1684-
}
1685-
}
1686-
16871645
static unsigned getAVSpillSaveOpcode(unsigned Size, bool NeedsCFI) {
16881646
switch (Size) {
16891647
case 4:
@@ -1738,23 +1696,20 @@ static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
17381696
return AMDGPU::SI_SPILL_WWM_V32_SAVE;
17391697
}
17401698

1741-
static unsigned getVectorRegSpillSaveOpcode(Register Reg,
1742-
const TargetRegisterClass *RC,
1743-
unsigned Size,
1744-
const SIRegisterInfo &TRI,
1745-
const SIMachineFunctionInfo &MFI,
1746-
bool NeedsCFI) {
1747-
bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1699+
unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
1700+
Register Reg, const TargetRegisterClass *RC, unsigned Size,
1701+
const SIMachineFunctionInfo &MFI, bool NeedsCFI) const {
1702+
bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
17481703

17491704
// Choose the right opcode if spilling a WWM register.
17501705
if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
17511706
return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
17521707

1753-
if (IsVectorSuperClass)
1708+
// TODO: Check if AGPRs are available
1709+
if (ST.hasMAIInsts())
17541710
return getAVSpillSaveOpcode(Size, NeedsCFI);
17551711

1756-
return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size, NeedsCFI)
1757-
: getVGPRSpillSaveOpcode(Size, NeedsCFI);
1712+
return getVGPRSpillSaveOpcode(Size, NeedsCFI);
17581713
}
17591714

17601715
void SIInstrInfo::storeRegToStackSlotImpl(
@@ -1804,7 +1759,7 @@ void SIInstrInfo::storeRegToStackSlotImpl(
18041759
}
18051760

18061761
unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
1807-
SpillSize, RI, *MFI, NeedsCFI);
1762+
SpillSize, *MFI, NeedsCFI);
18081763
MFI->setHasSpilledVGPRs();
18091764

18101765
BuildMI(MBB, MI, DL, get(Opcode))
@@ -1906,41 +1861,6 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
19061861
}
19071862
}
19081863

1909-
static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
1910-
switch (Size) {
1911-
case 4:
1912-
return AMDGPU::SI_SPILL_A32_RESTORE;
1913-
case 8:
1914-
return AMDGPU::SI_SPILL_A64_RESTORE;
1915-
case 12:
1916-
return AMDGPU::SI_SPILL_A96_RESTORE;
1917-
case 16:
1918-
return AMDGPU::SI_SPILL_A128_RESTORE;
1919-
case 20:
1920-
return AMDGPU::SI_SPILL_A160_RESTORE;
1921-
case 24:
1922-
return AMDGPU::SI_SPILL_A192_RESTORE;
1923-
case 28:
1924-
return AMDGPU::SI_SPILL_A224_RESTORE;
1925-
case 32:
1926-
return AMDGPU::SI_SPILL_A256_RESTORE;
1927-
case 36:
1928-
return AMDGPU::SI_SPILL_A288_RESTORE;
1929-
case 40:
1930-
return AMDGPU::SI_SPILL_A320_RESTORE;
1931-
case 44:
1932-
return AMDGPU::SI_SPILL_A352_RESTORE;
1933-
case 48:
1934-
return AMDGPU::SI_SPILL_A384_RESTORE;
1935-
case 64:
1936-
return AMDGPU::SI_SPILL_A512_RESTORE;
1937-
case 128:
1938-
return AMDGPU::SI_SPILL_A1024_RESTORE;
1939-
default:
1940-
llvm_unreachable("unknown register size");
1941-
}
1942-
}
1943-
19441864
static unsigned getAVSpillRestoreOpcode(unsigned Size) {
19451865
switch (Size) {
19461866
case 4:
@@ -1982,27 +1902,27 @@ static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
19821902
if (Size != 4)
19831903
llvm_unreachable("unknown wwm register spill size");
19841904

1985-
if (IsVectorSuperClass)
1905+
if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
19861906
return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
19871907

19881908
return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
19891909
}
19901910

1991-
static unsigned
1992-
getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
1993-
unsigned Size, const SIRegisterInfo &TRI,
1994-
const SIMachineFunctionInfo &MFI) {
1995-
bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1911+
unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode(
1912+
Register Reg, const TargetRegisterClass *RC, unsigned Size,
1913+
const SIMachineFunctionInfo &MFI) const {
1914+
bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
19961915

19971916
// Choose the right opcode if restoring a WWM register.
19981917
if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
19991918
return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
20001919

2001-
if (IsVectorSuperClass)
1920+
// TODO: Check if AGPRs are available
1921+
if (ST.hasMAIInsts())
20021922
return getAVSpillRestoreOpcode(Size);
20031923

2004-
return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size)
2005-
: getVGPRSpillRestoreOpcode(Size);
1924+
assert(!RI.isAGPRClass(RC));
1925+
return getVGPRSpillRestoreOpcode(Size);
20061926
}
20071927

20081928
void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
@@ -2050,7 +1970,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
20501970
}
20511971

20521972
unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
2053-
SpillSize, RI, *MFI);
1973+
SpillSize, *MFI);
20541974
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
20551975
.addFrameIndex(FrameIndex) // vaddr
20561976
.addReg(MFI->getStackPtrOffsetReg()) // scratch_offset

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,7 @@ class LiveVariables;
3333
class MachineDominatorTree;
3434
class MachineRegisterInfo;
3535
class RegScavenger;
36+
class SIMachineFunctionInfo;
3637
class TargetRegisterClass;
3738
class ScheduleHazardRecognizer;
3839

@@ -298,10 +299,20 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
298299
bool isKill, int FrameIndex,
299300
const TargetRegisterClass *RC,
300301
const TargetRegisterInfo *TRI) const;
301-
302+
302303
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg,
303304
int64_t &ImmVal) const override;
304305

306+
unsigned getVectorRegSpillSaveOpcode(Register Reg,
307+
const TargetRegisterClass *RC,
308+
unsigned Size,
309+
const SIMachineFunctionInfo &MFI,
310+
bool NeedsCFI) const;
311+
unsigned
312+
getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
313+
unsigned Size,
314+
const SIMachineFunctionInfo &MFI) const;
315+
305316
void storeRegToStackSlot(
306317
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
307318
bool isKill, int FrameIndex, const TargetRegisterClass *RC,

llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll

Lines changed: 31 additions & 74 deletions
Original file line number | Diff line number | Diff line change
@@ -444,14 +444,6 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
444444
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen
445445
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v62, s[8:11], 0 offen offset:16
446446
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:32
447-
; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1
448-
; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1
449-
; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
450-
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
451-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a0, v13 ; Reload Reuse
452-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a1, v12 ; Reload Reuse
453-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a2, v11 ; Reload Reuse
454-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a3, v10 ; Reload Reuse
455447
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v62, s[8:11], 0 offen offset:48
456448
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v62, s[8:11], 0 offen offset:64
457449
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v62, s[8:11], 0 offen offset:80
@@ -464,20 +456,15 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
464456
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v62, s[8:11], 0 offen offset:192
465457
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v62, s[8:11], 0 offen offset:208
466458
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v62, s[8:11], 0 offen offset:224
467-
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:240
468-
; GISEL-GFX942-NEXT: s_nop 0
459+
; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v62, s[8:11], 0 offen offset:240
460+
; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1
461+
; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1
462+
; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
463+
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
464+
; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill
469465
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen
470466
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16
471-
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(2)
472-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a4, v13 ; Reload Reuse
473-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
474-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse
475-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse
476-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse
477-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a5, v12 ; Reload Reuse
478-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a6, v11 ; Reload Reuse
479-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a7, v10 ; Reload Reuse
480-
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:32
467+
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32
481468
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48
482469
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64
483470
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80
@@ -490,10 +477,8 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
490477
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192
491478
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208
492479
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224
493-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a4 ; Reload Reuse
494-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a5 ; Reload Reuse
495-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a6 ; Reload Reuse
496-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a7 ; Reload Reuse
480+
; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload
481+
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
497482
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240
498483
; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB0_1
499484
; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split
@@ -822,14 +807,6 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
822807
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v1, s[4:7], 0 offen
823808
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v1, s[4:7], 0 offen offset:16
824809
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[4:7], 0 offen offset:32
825-
; SDAG-GFX942-NEXT: v_add_u32_e32 v62, s8, v0
826-
; SDAG-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
827-
; SDAG-GFX942-NEXT: s_and_b64 vcc, exec, vcc
828-
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
829-
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a0, v13 ; Reload Reuse
830-
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a1, v12 ; Reload Reuse
831-
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a2, v11 ; Reload Reuse
832-
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a3, v10 ; Reload Reuse
833810
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v1, s[4:7], 0 offen offset:48
834811
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v1, s[4:7], 0 offen offset:64
835812
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v1, s[4:7], 0 offen offset:80
@@ -842,20 +819,16 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
842819
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v1, s[4:7], 0 offen offset:192
843820
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[4:7], 0 offen offset:208
844821
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[4:7], 0 offen offset:224
845-
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[4:7], 0 offen offset:240
846-
; SDAG-GFX942-NEXT: s_nop 0
822+
; SDAG-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v1, s[4:7], 0 offen offset:240
823+
; SDAG-GFX942-NEXT: v_add_u32_e32 v62, s8, v0
824+
; SDAG-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
825+
; SDAG-GFX942-NEXT: s_and_b64 vcc, exec, vcc
826+
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
827+
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v63, a3 ; Reload Reuse
828+
; SDAG-GFX942-NEXT: scratch_store_dwordx3 off, a[0:2], off ; 12-byte Folded Spill
847829
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen
848830
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[12:15], 0 offen offset:16
849-
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(2)
850-
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a4, v13 ; Reload Reuse
851-
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
852-
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse
853-
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse
854-
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse
855-
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a5, v12 ; Reload Reuse
856-
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a6, v11 ; Reload Reuse
857-
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a7, v10 ; Reload Reuse
858-
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen offset:32
831+
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v62, s[12:15], 0 offen offset:32
859832
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[12:15], 0 offen offset:48
860833
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[12:15], 0 offen offset:64
861834
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[12:15], 0 offen offset:80
@@ -868,10 +841,8 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
868841
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[12:15], 0 offen offset:192
869842
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[12:15], 0 offen offset:208
870843
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[12:15], 0 offen offset:224
871-
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a4 ; Reload Reuse
872-
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a5 ; Reload Reuse
873-
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a6 ; Reload Reuse
874-
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a7 ; Reload Reuse
844+
; SDAG-GFX942-NEXT: scratch_load_dwordx3 v[2:4], off, off ; 12-byte Folded Reload
845+
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
875846
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen offset:240
876847
; SDAG-GFX942-NEXT: s_cbranch_vccnz .LBB1_1
877848
; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split
@@ -993,16 +964,6 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
993964
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v1, s[8:11], 0 offen
994965
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v1, s[8:11], 0 offen offset:16
995966
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[8:11], 0 offen offset:32
996-
; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s12, v0
997-
; GISEL-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
998-
; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], vcc, -1
999-
; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], s[2:3], -1
1000-
; GISEL-GFX942-NEXT: s_and_b64 vcc, s[2:3], exec
1001-
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
1002-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a0, v13 ; Reload Reuse
1003-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a1, v12 ; Reload Reuse
1004-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a2, v11 ; Reload Reuse
1005-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a3, v10 ; Reload Reuse
1006967
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v1, s[8:11], 0 offen offset:48
1007968
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v1, s[8:11], 0 offen offset:64
1008969
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v1, s[8:11], 0 offen offset:80
@@ -1015,20 +976,18 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
1015976
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v1, s[8:11], 0 offen offset:192
1016977
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[8:11], 0 offen offset:208
1017978
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[8:11], 0 offen offset:224
1018-
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[8:11], 0 offen offset:240
1019-
; GISEL-GFX942-NEXT: s_nop 0
979+
; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v1, s[8:11], 0 offen offset:240
980+
; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s12, v0
981+
; GISEL-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
982+
; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], vcc, -1
983+
; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], s[2:3], -1
984+
; GISEL-GFX942-NEXT: s_and_b64 vcc, s[2:3], exec
985+
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
986+
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v63, a3 ; Reload Reuse
987+
; GISEL-GFX942-NEXT: scratch_store_dwordx3 off, a[0:2], off ; 12-byte Folded Spill
1020988
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen
1021989
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[4:7], 0 offen offset:16
1022-
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(2)
1023-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a4, v13 ; Reload Reuse
1024-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
1025-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse
1026-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse
1027-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse
1028-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a5, v12 ; Reload Reuse
1029-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a6, v11 ; Reload Reuse
1030-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a7, v10 ; Reload Reuse
1031-
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen offset:32
990+
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v62, s[4:7], 0 offen offset:32
1032991
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[4:7], 0 offen offset:48
1033992
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[4:7], 0 offen offset:64
1034993
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[4:7], 0 offen offset:80
@@ -1041,10 +1000,8 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
10411000
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[4:7], 0 offen offset:192
10421001
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[4:7], 0 offen offset:208
10431002
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[4:7], 0 offen offset:224
1044-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a4 ; Reload Reuse
1045-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a5 ; Reload Reuse
1046-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a6 ; Reload Reuse
1047-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a7 ; Reload Reuse
1003+
; GISEL-GFX942-NEXT: scratch_load_dwordx3 v[2:4], off, off ; 12-byte Folded Reload
1004+
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
10481005
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen offset:240
10491006
; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB1_1
10501007
; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split

0 commit comments

Comments
 (0)