Skip to content

Commit 1614c3b

Browse files
authored
AMDGPU: Always use AV spill pseudos on targets with AGPRs (#149099)
This increases allocator freedom to inflate register classes to the AV class; we don't need to introduce a new restriction by basing the opcode on the current virtual register class. Ideally we would avoid this if we don't have any allocatable AGPRs for the function, but it probably doesn't make much difference in the end result if they are excluded from the final allocation order.
1 parent 0652807 commit 1614c3b

8 files changed

+159
-270
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 19 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -1625,41 +1625,6 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
16251625
}
16261626
}
16271627

1628-
static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
1629-
switch (Size) {
1630-
case 4:
1631-
return AMDGPU::SI_SPILL_A32_SAVE;
1632-
case 8:
1633-
return AMDGPU::SI_SPILL_A64_SAVE;
1634-
case 12:
1635-
return AMDGPU::SI_SPILL_A96_SAVE;
1636-
case 16:
1637-
return AMDGPU::SI_SPILL_A128_SAVE;
1638-
case 20:
1639-
return AMDGPU::SI_SPILL_A160_SAVE;
1640-
case 24:
1641-
return AMDGPU::SI_SPILL_A192_SAVE;
1642-
case 28:
1643-
return AMDGPU::SI_SPILL_A224_SAVE;
1644-
case 32:
1645-
return AMDGPU::SI_SPILL_A256_SAVE;
1646-
case 36:
1647-
return AMDGPU::SI_SPILL_A288_SAVE;
1648-
case 40:
1649-
return AMDGPU::SI_SPILL_A320_SAVE;
1650-
case 44:
1651-
return AMDGPU::SI_SPILL_A352_SAVE;
1652-
case 48:
1653-
return AMDGPU::SI_SPILL_A384_SAVE;
1654-
case 64:
1655-
return AMDGPU::SI_SPILL_A512_SAVE;
1656-
case 128:
1657-
return AMDGPU::SI_SPILL_A1024_SAVE;
1658-
default:
1659-
llvm_unreachable("unknown register size");
1660-
}
1661-
}
1662-
16631628
static unsigned getAVSpillSaveOpcode(unsigned Size) {
16641629
switch (Size) {
16651630
case 4:
@@ -1707,22 +1672,20 @@ static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
17071672
return AMDGPU::SI_SPILL_WWM_V32_SAVE;
17081673
}
17091674

1710-
static unsigned getVectorRegSpillSaveOpcode(Register Reg,
1711-
const TargetRegisterClass *RC,
1712-
unsigned Size,
1713-
const SIRegisterInfo &TRI,
1714-
const SIMachineFunctionInfo &MFI) {
1715-
bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1675+
unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
1676+
Register Reg, const TargetRegisterClass *RC, unsigned Size,
1677+
const SIMachineFunctionInfo &MFI) const {
1678+
bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
17161679

17171680
// Choose the right opcode if spilling a WWM register.
17181681
if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
17191682
return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
17201683

1721-
if (IsVectorSuperClass)
1684+
// TODO: Check if AGPRs are available
1685+
if (ST.hasMAIInsts())
17221686
return getAVSpillSaveOpcode(Size);
17231687

1724-
return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size)
1725-
: getVGPRSpillSaveOpcode(Size);
1688+
return getVGPRSpillSaveOpcode(Size);
17261689
}
17271690

17281691
void SIInstrInfo::storeRegToStackSlot(
@@ -1770,8 +1733,8 @@ void SIInstrInfo::storeRegToStackSlot(
17701733
return;
17711734
}
17721735

1773-
unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
1774-
SpillSize, RI, *MFI);
1736+
unsigned Opcode =
1737+
getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
17751738
MFI->setHasSpilledVGPRs();
17761739

17771740
BuildMI(MBB, MI, DL, get(Opcode))
@@ -1854,41 +1817,6 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
18541817
}
18551818
}
18561819

1857-
static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
1858-
switch (Size) {
1859-
case 4:
1860-
return AMDGPU::SI_SPILL_A32_RESTORE;
1861-
case 8:
1862-
return AMDGPU::SI_SPILL_A64_RESTORE;
1863-
case 12:
1864-
return AMDGPU::SI_SPILL_A96_RESTORE;
1865-
case 16:
1866-
return AMDGPU::SI_SPILL_A128_RESTORE;
1867-
case 20:
1868-
return AMDGPU::SI_SPILL_A160_RESTORE;
1869-
case 24:
1870-
return AMDGPU::SI_SPILL_A192_RESTORE;
1871-
case 28:
1872-
return AMDGPU::SI_SPILL_A224_RESTORE;
1873-
case 32:
1874-
return AMDGPU::SI_SPILL_A256_RESTORE;
1875-
case 36:
1876-
return AMDGPU::SI_SPILL_A288_RESTORE;
1877-
case 40:
1878-
return AMDGPU::SI_SPILL_A320_RESTORE;
1879-
case 44:
1880-
return AMDGPU::SI_SPILL_A352_RESTORE;
1881-
case 48:
1882-
return AMDGPU::SI_SPILL_A384_RESTORE;
1883-
case 64:
1884-
return AMDGPU::SI_SPILL_A512_RESTORE;
1885-
case 128:
1886-
return AMDGPU::SI_SPILL_A1024_RESTORE;
1887-
default:
1888-
llvm_unreachable("unknown register size");
1889-
}
1890-
}
1891-
18921820
static unsigned getAVSpillRestoreOpcode(unsigned Size) {
18931821
switch (Size) {
18941822
case 4:
@@ -1930,27 +1858,27 @@ static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
19301858
if (Size != 4)
19311859
llvm_unreachable("unknown wwm register spill size");
19321860

1933-
if (IsVectorSuperClass)
1861+
if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
19341862
return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
19351863

19361864
return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
19371865
}
19381866

1939-
static unsigned
1940-
getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
1941-
unsigned Size, const SIRegisterInfo &TRI,
1942-
const SIMachineFunctionInfo &MFI) {
1943-
bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1867+
unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode(
1868+
Register Reg, const TargetRegisterClass *RC, unsigned Size,
1869+
const SIMachineFunctionInfo &MFI) const {
1870+
bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
19441871

19451872
// Choose the right opcode if restoring a WWM register.
19461873
if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
19471874
return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
19481875

1949-
if (IsVectorSuperClass)
1876+
// TODO: Check if AGPRs are available
1877+
if (ST.hasMAIInsts())
19501878
return getAVSpillRestoreOpcode(Size);
19511879

1952-
return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size)
1953-
: getVGPRSpillRestoreOpcode(Size);
1880+
assert(!RI.isAGPRClass(RC));
1881+
return getVGPRSpillRestoreOpcode(Size);
19541882
}
19551883

19561884
void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
@@ -1998,7 +1926,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
19981926
}
19991927

20001928
unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
2001-
SpillSize, RI, *MFI);
1929+
SpillSize, *MFI);
20021930
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
20031931
.addFrameIndex(FrameIndex) // vaddr
20041932
.addReg(MFI->getStackPtrOffsetReg()) // scratch_offset

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ class LiveVariables;
3333
class MachineDominatorTree;
3434
class MachineRegisterInfo;
3535
class RegScavenger;
36+
class SIMachineFunctionInfo;
3637
class TargetRegisterClass;
3738
class ScheduleHazardRecognizer;
3839

@@ -287,6 +288,15 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
287288
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg,
288289
int64_t &ImmVal) const override;
289290

291+
unsigned getVectorRegSpillSaveOpcode(Register Reg,
292+
const TargetRegisterClass *RC,
293+
unsigned Size,
294+
const SIMachineFunctionInfo &MFI) const;
295+
unsigned
296+
getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
297+
unsigned Size,
298+
const SIMachineFunctionInfo &MFI) const;
299+
290300
void storeRegToStackSlot(
291301
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
292302
bool isKill, int FrameIndex, const TargetRegisterClass *RC,

llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll

Lines changed: 31 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -444,14 +444,6 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
444444
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen
445445
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v62, s[8:11], 0 offen offset:16
446446
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:32
447-
; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1
448-
; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1
449-
; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
450-
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
451-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a0, v13 ; Reload Reuse
452-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a1, v12 ; Reload Reuse
453-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a2, v11 ; Reload Reuse
454-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a3, v10 ; Reload Reuse
455447
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v62, s[8:11], 0 offen offset:48
456448
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v62, s[8:11], 0 offen offset:64
457449
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v62, s[8:11], 0 offen offset:80
@@ -464,20 +456,15 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
464456
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v62, s[8:11], 0 offen offset:192
465457
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v62, s[8:11], 0 offen offset:208
466458
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v62, s[8:11], 0 offen offset:224
467-
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:240
468-
; GISEL-GFX942-NEXT: s_nop 0
459+
; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v62, s[8:11], 0 offen offset:240
460+
; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1
461+
; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1
462+
; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
463+
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
464+
; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill
469465
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen
470466
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16
471-
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(2)
472-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a4, v13 ; Reload Reuse
473-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
474-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse
475-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse
476-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse
477-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a5, v12 ; Reload Reuse
478-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a6, v11 ; Reload Reuse
479-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a7, v10 ; Reload Reuse
480-
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:32
467+
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32
481468
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48
482469
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64
483470
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80
@@ -490,10 +477,8 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
490477
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192
491478
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208
492479
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224
493-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a4 ; Reload Reuse
494-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a5 ; Reload Reuse
495-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a6 ; Reload Reuse
496-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a7 ; Reload Reuse
480+
; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload
481+
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
497482
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240
498483
; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB0_1
499484
; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split
@@ -822,14 +807,6 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
822807
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v1, s[4:7], 0 offen
823808
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v1, s[4:7], 0 offen offset:16
824809
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[4:7], 0 offen offset:32
825-
; SDAG-GFX942-NEXT: v_add_u32_e32 v62, s8, v0
826-
; SDAG-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
827-
; SDAG-GFX942-NEXT: s_and_b64 vcc, exec, vcc
828-
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
829-
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a0, v13 ; Reload Reuse
830-
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a1, v12 ; Reload Reuse
831-
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a2, v11 ; Reload Reuse
832-
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a3, v10 ; Reload Reuse
833810
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v1, s[4:7], 0 offen offset:48
834811
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v1, s[4:7], 0 offen offset:64
835812
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v1, s[4:7], 0 offen offset:80
@@ -842,20 +819,16 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
842819
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v1, s[4:7], 0 offen offset:192
843820
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[4:7], 0 offen offset:208
844821
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[4:7], 0 offen offset:224
845-
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[4:7], 0 offen offset:240
846-
; SDAG-GFX942-NEXT: s_nop 0
822+
; SDAG-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v1, s[4:7], 0 offen offset:240
823+
; SDAG-GFX942-NEXT: v_add_u32_e32 v62, s8, v0
824+
; SDAG-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
825+
; SDAG-GFX942-NEXT: s_and_b64 vcc, exec, vcc
826+
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
827+
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v63, a3 ; Reload Reuse
828+
; SDAG-GFX942-NEXT: scratch_store_dwordx3 off, a[0:2], off ; 12-byte Folded Spill
847829
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen
848830
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[12:15], 0 offen offset:16
849-
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(2)
850-
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a4, v13 ; Reload Reuse
851-
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
852-
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse
853-
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse
854-
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse
855-
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a5, v12 ; Reload Reuse
856-
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a6, v11 ; Reload Reuse
857-
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a7, v10 ; Reload Reuse
858-
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen offset:32
831+
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v62, s[12:15], 0 offen offset:32
859832
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[12:15], 0 offen offset:48
860833
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[12:15], 0 offen offset:64
861834
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[12:15], 0 offen offset:80
@@ -868,10 +841,8 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
868841
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[12:15], 0 offen offset:192
869842
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[12:15], 0 offen offset:208
870843
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[12:15], 0 offen offset:224
871-
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a4 ; Reload Reuse
872-
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a5 ; Reload Reuse
873-
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a6 ; Reload Reuse
874-
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a7 ; Reload Reuse
844+
; SDAG-GFX942-NEXT: scratch_load_dwordx3 v[2:4], off, off ; 12-byte Folded Reload
845+
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
875846
; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen offset:240
876847
; SDAG-GFX942-NEXT: s_cbranch_vccnz .LBB1_1
877848
; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split
@@ -993,16 +964,6 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
993964
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v1, s[8:11], 0 offen
994965
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v1, s[8:11], 0 offen offset:16
995966
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[8:11], 0 offen offset:32
996-
; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s12, v0
997-
; GISEL-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
998-
; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], vcc, -1
999-
; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], s[2:3], -1
1000-
; GISEL-GFX942-NEXT: s_and_b64 vcc, s[2:3], exec
1001-
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
1002-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a0, v13 ; Reload Reuse
1003-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a1, v12 ; Reload Reuse
1004-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a2, v11 ; Reload Reuse
1005-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a3, v10 ; Reload Reuse
1006967
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v1, s[8:11], 0 offen offset:48
1007968
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v1, s[8:11], 0 offen offset:64
1008969
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v1, s[8:11], 0 offen offset:80
@@ -1015,20 +976,18 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
1015976
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v1, s[8:11], 0 offen offset:192
1016977
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[8:11], 0 offen offset:208
1017978
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[8:11], 0 offen offset:224
1018-
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[8:11], 0 offen offset:240
1019-
; GISEL-GFX942-NEXT: s_nop 0
979+
; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v1, s[8:11], 0 offen offset:240
980+
; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s12, v0
981+
; GISEL-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
982+
; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], vcc, -1
983+
; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], s[2:3], -1
984+
; GISEL-GFX942-NEXT: s_and_b64 vcc, s[2:3], exec
985+
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
986+
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v63, a3 ; Reload Reuse
987+
; GISEL-GFX942-NEXT: scratch_store_dwordx3 off, a[0:2], off ; 12-byte Folded Spill
1020988
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen
1021989
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[4:7], 0 offen offset:16
1022-
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(2)
1023-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a4, v13 ; Reload Reuse
1024-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
1025-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse
1026-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse
1027-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse
1028-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a5, v12 ; Reload Reuse
1029-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a6, v11 ; Reload Reuse
1030-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a7, v10 ; Reload Reuse
1031-
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen offset:32
990+
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v62, s[4:7], 0 offen offset:32
1032991
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[4:7], 0 offen offset:48
1033992
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[4:7], 0 offen offset:64
1034993
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[4:7], 0 offen offset:80
@@ -1041,10 +1000,8 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
10411000
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[4:7], 0 offen offset:192
10421001
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[4:7], 0 offen offset:208
10431002
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[4:7], 0 offen offset:224
1044-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a4 ; Reload Reuse
1045-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a5 ; Reload Reuse
1046-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a6 ; Reload Reuse
1047-
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a7 ; Reload Reuse
1003+
; GISEL-GFX942-NEXT: scratch_load_dwordx3 v[2:4], off, off ; 12-byte Folded Reload
1004+
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
10481005
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen offset:240
10491006
; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB1_1
10501007
; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split

0 commit comments

Comments
 (0)