Skip to content

Commit 1c2298f

Browse files
committed
Use shouldSkipRematerializationForReturn
1 parent e911b1f commit 1c2298f

File tree

2 files changed

+85
-84
lines changed

2 files changed

+85
-84
lines changed

llvm/lib/CodeGen/RegisterCoalescer.cpp

Lines changed: 84 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,14 @@ class RegisterCoalescer : private LiveRangeEdit::Delegate {
300300
bool reMaterializeDef(const CoalescerPair &CP, MachineInstr *CopyMI,
301301
bool &IsDefCopy);
302302

303+
/// Check if rematerialization should be skipped for a physical register
304+
/// used as a return value. This helps enable better coalescing by avoiding
305+
/// rematerialization when a copy to a return register can be coalesced.
306+
/// Returns true if rematerialization should be skipped.
307+
bool shouldSkipRematerializationForReturn(Register DstReg, Register SrcReg,
308+
MachineInstr *DefMI,
309+
MachineInstr *CopyMI);
310+
303311
/// Return true if a copy involving a physreg should be joined.
304312
bool canJoinPhys(const CoalescerPair &CP);
305313

@@ -1284,6 +1292,80 @@ bool RegisterCoalescer::removePartialRedundancy(const CoalescerPair &CP,
12841292

12851293
/// Returns true if @p MI defines the full vreg @p Reg, as opposed to just
12861294
/// defining a subregister.
1295+
bool RegisterCoalescer::shouldSkipRematerializationForReturn(
1296+
Register DstReg, Register SrcReg, MachineInstr *DefMI,
1297+
MachineInstr *CopyMI) {
1298+
MachineBasicBlock *MBB = CopyMI->getParent();
1299+
if (!DstReg.isPhysical() || DefMI->getParent() != MBB || MBB->empty())
1300+
return false;
1301+
1302+
// Check if the last instruction is a return using DstReg
1303+
const MachineInstr &LastInstr = MBB->back();
1304+
if (!LastInstr.isReturn() || !LastInstr.readsRegister(DstReg, TRI))
1305+
return false;
1306+
1307+
// Exception: Allow rematerialization for zero-idiom instructions
1308+
// (e.g., xorps %xmm0, %xmm0) because rematerialization produces
1309+
// independent zero-latency instructions, which is better than copying
1310+
const TargetSubtargetInfo &STI = MF->getSubtarget();
1311+
APInt Mask;
1312+
if (STI.isZeroIdiom(DefMI, Mask)) {
1313+
LLVM_DEBUG(dbgs() << "\tAllow remat: zero-idiom instruction\n");
1314+
return false;
1315+
}
1316+
1317+
// Check for duplicate DefMI before CopyMI
1318+
bool HasDuplicateDef = false;
1319+
for (MachineBasicBlock::iterator I = MBB->begin(); &*I != CopyMI; ++I) {
1320+
if (&*I != DefMI && I->isIdenticalTo(*DefMI, MachineInstr::IgnoreDefs)) {
1321+
HasDuplicateDef = true;
1322+
break;
1323+
}
1324+
}
1325+
1326+
// Check if register is redefined after CopyMI
1327+
bool RegRedefinedAfterCopy = false;
1328+
for (auto I = std::next(CopyMI->getIterator()); I != MBB->end(); ++I) {
1329+
if (I->modifiesRegister(DstReg, TRI)) {
1330+
RegRedefinedAfterCopy = true;
1331+
break;
1332+
}
1333+
if (I->isReturn())
1334+
break;
1335+
}
1336+
1337+
// Only consider skipping remat if there's no duplicate def and register
1338+
// is not redefined after the copy
1339+
if (!HasDuplicateDef && !RegRedefinedAfterCopy) {
1340+
// Exception: Allow remat for constant moves with restricted usage
1341+
if (DefMI->isMoveImmediate()) {
1342+
// Single use is fine - allow remat
1343+
if (MRI->hasOneNonDBGUse(SrcReg))
1344+
return false;
1345+
1346+
// Multiple uses: only allow if all uses are copies
1347+
bool OnlyUsedByCopies = true;
1348+
for (const MachineOperand &MO : MRI->use_operands(SrcReg)) {
1349+
const MachineInstr *UseMI = MO.getParent();
1350+
if (!UseMI->isCopy() && !UseMI->isSubregToReg()) {
1351+
OnlyUsedByCopies = false;
1352+
break;
1353+
}
1354+
}
1355+
1356+
if (OnlyUsedByCopies)
1357+
return false;
1358+
}
1359+
1360+
// Skip remat for return register to enable better coalescing
1361+
LLVM_DEBUG(dbgs() << "\tSkip remat for return register: "
1362+
<< printReg(DstReg, TRI) << '\n');
1363+
return true;
1364+
}
1365+
1366+
return false;
1367+
}
1368+
12871369
static bool definesFullReg(const MachineInstr &MI, Register Reg) {
12881370
assert(!Reg.isPhysical() && "This code cannot handle physreg aliasing");
12891371

@@ -1326,78 +1408,8 @@ bool RegisterCoalescer::reMaterializeDef(const CoalescerPair &CP,
13261408
if (!TII->isAsCheapAsAMove(*DefMI))
13271409
return false;
13281410

1329-
// Skip rematerialization for physical registers used as return values within
1330-
// the same basic block to enable better coalescing.
1331-
if (DstReg.isPhysical()) {
1332-
MachineBasicBlock *MBB = CopyMI->getParent();
1333-
if (DefMI->getParent() == MBB && !MBB->empty()) {
1334-
// Quick check: is the last instruction a return using DstReg?
1335-
const MachineInstr &LastInstr = MBB->back();
1336-
if (LastInstr.isReturn() && LastInstr.readsRegister(DstReg, TRI)) {
1337-
// This is a return register, perform checks
1338-
1339-
// Exception: allow rematerialization for zero-idiom instructions
1340-
// (e.g., xorps %xmm0, %xmm0) because rematerialization produces
1341-
// independent zero-latency instructions, which is better than copying
1342-
const TargetSubtargetInfo &STI = MF->getSubtarget();
1343-
APInt Mask;
1344-
if (STI.isZeroIdiom(DefMI, Mask)) {
1345-
LLVM_DEBUG(dbgs() << "\tAllow remat: zero-idiom instruction\n");
1346-
} else {
1347-
// Check for duplicate DefMI before CopyMI
1348-
bool HasDuplicateDef = false;
1349-
for (MachineBasicBlock::iterator I = MBB->begin(); &*I != CopyMI;
1350-
++I) {
1351-
if (&*I != DefMI &&
1352-
I->isIdenticalTo(*DefMI, MachineInstr::IgnoreDefs)) {
1353-
HasDuplicateDef = true;
1354-
break;
1355-
}
1356-
}
1357-
1358-
// Check if register is redefined after CopyMI
1359-
bool RegRedefinedAfterCopy = false;
1360-
for (MachineBasicBlock::iterator I = std::next(CopyMI->getIterator());
1361-
I != MBB->end(); ++I) {
1362-
if (I->modifiesRegister(DstReg, TRI)) {
1363-
RegRedefinedAfterCopy = true;
1364-
break;
1365-
}
1366-
if (I->isReturn())
1367-
break;
1368-
}
1369-
1370-
// Skip remat only if: no duplicate def AND reg not redefined
1371-
if (!HasDuplicateDef && !RegRedefinedAfterCopy) {
1372-
// Exception: allow remat for constant moves with limited uses
1373-
if (DefMI->isMoveImmediate()) {
1374-
if (!MRI->hasOneNonDBGUse(SrcReg)) {
1375-
// Check if all uses are copies
1376-
bool OnlyUsedByCopies = true;
1377-
for (const MachineOperand &MO : MRI->use_operands(SrcReg)) {
1378-
const MachineInstr *UseMI = MO.getParent();
1379-
if (!UseMI->isCopy() && !UseMI->isSubregToReg()) {
1380-
OnlyUsedByCopies = false;
1381-
break;
1382-
}
1383-
}
1384-
1385-
if (!OnlyUsedByCopies || MRI->use_empty(SrcReg)) {
1386-
LLVM_DEBUG(dbgs() << "\tSkip remat for return register: "
1387-
<< printReg(DstReg, TRI) << '\n');
1388-
return false;
1389-
}
1390-
}
1391-
} else {
1392-
LLVM_DEBUG(dbgs() << "\tSkip remat for return register: "
1393-
<< printReg(DstReg, TRI) << '\n');
1394-
return false;
1395-
}
1396-
}
1397-
}
1398-
}
1399-
}
1400-
}
1411+
if (shouldSkipRematerializationForReturn(DstReg, SrcReg, DefMI, CopyMI))
1412+
return false;
14011413

14021414
if (!TII->isReMaterializable(*DefMI))
14031415
return false;

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -365,18 +365,6 @@ define { i32, <3 x i32> } @global_load_tr6_b96_vaddr_no_align2_requirement(ptr a
365365
}
366366

367367
define { i32, <3 x i32> } @global_load_tr6_b96_saddr_no_align2_requirement(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
368-
; GFX1250-LABEL: global_load_tr6_b96_saddr_no_align2_requirement:
369-
; GFX1250: ; %bb.0:
370-
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
371-
; GFX1250-NEXT: s_wait_kmcnt 0x0
372-
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
373-
; GFX1250-NEXT: global_load_tr6_b96 v[2:4], v0, s[0:1] offset:32
374-
; GFX1250-NEXT: s_wait_loadcnt 0x0
375-
; GFX1250-NEXT: s_wait_xcnt 0x0
376-
; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, v2
377-
; GFX1250-NEXT: v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4
378-
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
379-
;
380368
; GFX1250-SDAG-LABEL: global_load_tr6_b96_saddr_no_align2_requirement:
381369
; GFX1250-SDAG: ; %bb.0:
382370
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -395,6 +383,7 @@ define { i32, <3 x i32> } @global_load_tr6_b96_saddr_no_align2_requirement(ptr a
395383
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0
396384
; GFX1250-GISEL-NEXT: global_load_tr6_b96 v[2:4], v0, s[0:1] offset:32
397385
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
386+
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
398387
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, v2
399388
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4
400389
; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31]

0 commit comments

Comments
 (0)