-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[RegAlloc] Relax the split constrain on MBB prolog #168259
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
|
@llvm/pr-subscribers-backend-x86 @llvm/pr-subscribers-llvm-regalloc — Author: Luo Yuanke (LuoYuanke). Changes: https://reviews.llvm.org/D52052 is to prevent register split on the MBB […]. Patch is 553.45 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/168259.diff — 11 Files Affected:
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 1bc7607890328..a6a65b444b466 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -773,8 +773,7 @@ bool RAGreedy::addSplitConstraints(InterferenceCache::Cursor Intf,
// Abort if the spill cannot be inserted at the MBB' start
if (((BC.Entry == SpillPlacement::MustSpill) ||
(BC.Entry == SpillPlacement::PrefSpill)) &&
- SlotIndex::isEarlierInstr(BI.FirstInstr,
- SA->getFirstSplitPoint(BC.Number)))
+ !SA->canSplitBeforeProlog(BC.Number))
return false;
}
@@ -829,11 +828,7 @@ bool RAGreedy::addThroughConstraints(InterferenceCache::Cursor Intf,
BCS[B].Number = Number;
// Abort if the spill cannot be inserted at the MBB' start
- MachineBasicBlock *MBB = MF->getBlockNumbered(Number);
- auto FirstNonDebugInstr = MBB->getFirstNonDebugInstr();
- if (FirstNonDebugInstr != MBB->end() &&
- SlotIndex::isEarlierInstr(LIS->getInstructionIndex(*FirstNonDebugInstr),
- SA->getFirstSplitPoint(Number)))
+ if (!SA->canSplitBeforeProlog(Number))
return false;
// Interference for the live-in value.
if (Intf.first() <= Indexes->getMBBStartIdx(Number))
diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp
index 8ec4bfbb5a330..f87c5f43ccaaa 100644
--- a/llvm/lib/CodeGen/SplitKit.cpp
+++ b/llvm/lib/CodeGen/SplitKit.cpp
@@ -147,6 +147,34 @@ InsertPointAnalysis::getLastInsertPointIter(const LiveInterval &CurLI,
return LIS.getInstructionFromIndex(LIP);
}
+bool InsertPointAnalysis::canSplitBeforeProlog(const LiveInterval &CurLI,
+ const MachineBasicBlock &MBB) {
+ const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
+
+ for (auto &MI : MBB) {
+ if (MI.isPHI() || MI.isPosition() || MI.isDebugInstr() ||
+ MI.isPseudoProbe())
+ continue;
+
+ if (!TII->isBasicBlockPrologue(MI))
+ return true;
+
+ for (auto &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.isDef() || !MO.getReg().isVirtual())
+ continue;
+
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+ const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
+ const TargetRegisterClass *CurRC = MRI.getRegClass(CurLI.reg());
+ if (TRI->getCommonSubClass(RC, CurRC))
+ return false;
+ }
+ }
+
+ return true;
+}
+
//===----------------------------------------------------------------------===//
// Split Analysis
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/SplitKit.h b/llvm/lib/CodeGen/SplitKit.h
index de255911268f2..a9fc921534d0e 100644
--- a/llvm/lib/CodeGen/SplitKit.h
+++ b/llvm/lib/CodeGen/SplitKit.h
@@ -89,6 +89,9 @@ class LLVM_LIBRARY_VISIBILITY InsertPointAnalysis {
return Res;
}
+ /// Return true if we can split \p CurLI before \p MBB's prolog.
+ bool canSplitBeforeProlog(const LiveInterval &CurLI,
+ const MachineBasicBlock &MBB);
};
/// SplitAnalysis - Analyze a LiveInterval, looking for live range splitting
@@ -247,6 +250,11 @@ class LLVM_LIBRARY_VISIBILITY SplitAnalysis {
SlotIndex getFirstSplitPoint(unsigned Num) {
return IPA.getFirstInsertPoint(*MF.getBlockNumbered(Num));
}
+
+ bool canSplitBeforeProlog(unsigned Num) {
+ MachineBasicBlock *BB = MF.getBlockNumbered(Num);
+ return IPA.canSplitBeforeProlog(*CurLI, *BB);
+ }
};
/// SplitEditor - Edit machine code and LiveIntervals for live range
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index d3ebd92f0677b..5ff30224f87ea 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -154717,13 +154717,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:308
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304
; SI-NEXT: ; implicit-def: $vgpr44 : SGPR spill to VGPR lane
-; SI-NEXT: s_mov_b32 s73, s21
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_writelane_b32 v41, s30, 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v44, s19, 0
; SI-NEXT: v_writelane_b32 v44, s18, 1
; SI-NEXT: v_writelane_b32 v44, s17, 2
; SI-NEXT: v_writelane_b32 v44, s16, 3
-; SI-NEXT: v_writelane_b32 v41, s30, 0
; SI-NEXT: v_writelane_b32 v41, s31, 1
; SI-NEXT: v_writelane_b32 v41, s34, 2
; SI-NEXT: v_writelane_b32 v41, s35, 3
@@ -154747,9 +154747,8 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_writelane_b32 v41, s69, 21
; SI-NEXT: v_writelane_b32 v41, s70, 22
; SI-NEXT: v_writelane_b32 v41, s71, 23
-; SI-NEXT: s_mov_b32 s74, s29
-; SI-NEXT: s_mov_b32 s78, s28
-; SI-NEXT: s_mov_b32 s76, s27
+; SI-NEXT: s_mov_b32 s57, s28
+; SI-NEXT: s_mov_b32 s47, s27
; SI-NEXT: v_writelane_b32 v41, s80, 24
; SI-NEXT: v_writelane_b32 v41, s81, 25
; SI-NEXT: v_writelane_b32 v41, s82, 26
@@ -154759,7 +154758,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_writelane_b32 v41, s86, 30
; SI-NEXT: v_writelane_b32 v41, s87, 31
; SI-NEXT: v_writelane_b32 v41, s96, 32
-; SI-NEXT: s_mov_b32 s47, s26
; SI-NEXT: v_writelane_b32 v41, s97, 33
; SI-NEXT: v_writelane_b32 v41, s98, 34
; SI-NEXT: v_writelane_b32 v41, s99, 35
@@ -154769,95 +154767,101 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152
+; SI-NEXT: v_readfirstlane_b32 s89, v3
+; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
+; SI-NEXT: v_readfirstlane_b32 s90, v9
+; SI-NEXT: v_writelane_b32 v42, s89, 0
+; SI-NEXT: v_readfirstlane_b32 s91, v10
+; SI-NEXT: v_writelane_b32 v42, s90, 1
+; SI-NEXT: v_readfirstlane_b32 s92, v8
+; SI-NEXT: v_writelane_b32 v42, s91, 2
+; SI-NEXT: v_readfirstlane_b32 s93, v7
+; SI-NEXT: v_writelane_b32 v42, s92, 3
+; SI-NEXT: v_readfirstlane_b32 s94, v13
+; SI-NEXT: v_writelane_b32 v42, s93, 4
+; SI-NEXT: v_readfirstlane_b32 s95, v14
+; SI-NEXT: v_writelane_b32 v42, s94, 5
+; SI-NEXT: v_writelane_b32 v42, s95, 6
+; SI-NEXT: v_readfirstlane_b32 s30, v17
+; SI-NEXT: v_readfirstlane_b32 s31, v18
+; SI-NEXT: v_readfirstlane_b32 s34, v16
+; SI-NEXT: v_readfirstlane_b32 s35, v15
+; SI-NEXT: v_readfirstlane_b32 s36, v21
; SI-NEXT: v_readfirstlane_b32 s37, v22
-; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
; SI-NEXT: v_readfirstlane_b32 s38, v20
-; SI-NEXT: v_writelane_b32 v43, s37, 0
; SI-NEXT: v_readfirstlane_b32 s39, v19
-; SI-NEXT: v_writelane_b32 v43, s38, 1
; SI-NEXT: v_readfirstlane_b32 s48, v25
-; SI-NEXT: v_writelane_b32 v43, s39, 2
; SI-NEXT: v_readfirstlane_b32 s49, v26
-; SI-NEXT: v_writelane_b32 v43, s48, 3
; SI-NEXT: v_readfirstlane_b32 s50, v24
-; SI-NEXT: v_writelane_b32 v43, s49, 4
; SI-NEXT: v_readfirstlane_b32 s51, v23
-; SI-NEXT: v_writelane_b32 v43, s50, 5
; SI-NEXT: v_readfirstlane_b32 s52, v29
-; SI-NEXT: v_writelane_b32 v43, s51, 6
; SI-NEXT: v_readfirstlane_b32 s53, v30
-; SI-NEXT: v_writelane_b32 v43, s52, 7
-; SI-NEXT: v_readfirstlane_b32 s54, v28
-; SI-NEXT: v_writelane_b32 v43, s53, 8
-; SI-NEXT: v_readfirstlane_b32 s55, v27
-; SI-NEXT: v_writelane_b32 v43, s54, 9
-; SI-NEXT: v_writelane_b32 v43, s55, 10
-; SI-NEXT: s_mov_b32 s57, s24
-; SI-NEXT: v_readfirstlane_b32 s16, v1
-; SI-NEXT: v_readfirstlane_b32 s17, v2
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s6, v31
+; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:296
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:292
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280
+; SI-NEXT: v_writelane_b32 v44, s4, 4
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_readfirstlane_b32 s4, v32
-; SI-NEXT: v_writelane_b32 v44, s4, 4
+; SI-NEXT: v_writelane_b32 v44, s4, 5
; SI-NEXT: v_readfirstlane_b32 s4, v33
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272
-; SI-NEXT: v_writelane_b32 v44, s4, 5
-; SI-NEXT: v_readfirstlane_b32 s4, v34
; SI-NEXT: v_writelane_b32 v44, s4, 6
-; SI-NEXT: v_readfirstlane_b32 s4, v35
+; SI-NEXT: v_readfirstlane_b32 s4, v34
; SI-NEXT: v_writelane_b32 v44, s4, 7
+; SI-NEXT: v_readfirstlane_b32 s4, v35
+; SI-NEXT: v_writelane_b32 v44, s4, 8
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_readfirstlane_b32 s4, v36
-; SI-NEXT: v_writelane_b32 v44, s4, 8
+; SI-NEXT: v_writelane_b32 v44, s4, 9
; SI-NEXT: v_readfirstlane_b32 s4, v37
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256
-; SI-NEXT: v_writelane_b32 v44, s4, 9
+; SI-NEXT: v_writelane_b32 v44, s4, 10
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_readfirstlane_b32 s4, v38
-; SI-NEXT: v_writelane_b32 v44, s4, 10
+; SI-NEXT: v_writelane_b32 v44, s4, 11
+; SI-NEXT: v_readfirstlane_b32 s54, v28
+; SI-NEXT: v_readfirstlane_b32 s55, v27
+; SI-NEXT: s_mov_b32 s6, s23
+; SI-NEXT: s_mov_b32 s23, s21
+; SI-NEXT: s_mov_b32 s58, s26
+; SI-NEXT: s_mov_b32 s40, s25
+; SI-NEXT: s_mov_b32 s25, s24
+; SI-NEXT: v_readfirstlane_b32 s16, v1
+; SI-NEXT: v_readfirstlane_b32 s17, v2
; SI-NEXT: v_readfirstlane_b32 s18, v5
; SI-NEXT: v_readfirstlane_b32 s19, v6
; SI-NEXT: v_readfirstlane_b32 s77, v4
-; SI-NEXT: v_readfirstlane_b32 s89, v3
-; SI-NEXT: v_readfirstlane_b32 s90, v9
-; SI-NEXT: v_readfirstlane_b32 s91, v10
-; SI-NEXT: v_readfirstlane_b32 s92, v8
-; SI-NEXT: v_readfirstlane_b32 s93, v7
-; SI-NEXT: v_readfirstlane_b32 s94, v13
-; SI-NEXT: v_readfirstlane_b32 s95, v14
-; SI-NEXT: v_readfirstlane_b32 s30, v17
-; SI-NEXT: v_readfirstlane_b32 s31, v18
-; SI-NEXT: v_readfirstlane_b32 s34, v16
-; SI-NEXT: v_readfirstlane_b32 s35, v15
-; SI-NEXT: v_readfirstlane_b32 s36, v21
+; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
+; SI-NEXT: v_readfirstlane_b32 s26, v53
+; SI-NEXT: v_readfirstlane_b32 s46, v54
+; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: v_readfirstlane_b32 s61, v55
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s24, v40
+; SI-NEXT: v_readfirstlane_b32 s62, v40
; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s4, v31
-; SI-NEXT: v_writelane_b32 v44, s4, 11
+; SI-NEXT: v_writelane_b32 v44, s4, 12
; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_readfirstlane_b32 s4, v39
-; SI-NEXT: v_writelane_b32 v44, s4, 12
+; SI-NEXT: v_writelane_b32 v44, s4, 13
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_readfirstlane_b32 s4, v48
-; SI-NEXT: v_writelane_b32 v44, s4, 13
+; SI-NEXT: v_writelane_b32 v44, s4, 14
; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_readfirstlane_b32 s4, v49
-; SI-NEXT: v_writelane_b32 v44, s4, 14
+; SI-NEXT: v_writelane_b32 v44, s4, 15
; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_readfirstlane_b32 s4, v50
-; SI-NEXT: v_writelane_b32 v44, s4, 15
+; SI-NEXT: v_writelane_b32 v44, s4, 16
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_readfirstlane_b32 s4, v51
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252
@@ -154867,40 +154871,51 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:236
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:228
+; SI-NEXT: v_writelane_b32 v44, s4, 17
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s75, v32
+; SI-NEXT: v_readfirstlane_b32 s4, v32
+; SI-NEXT: v_writelane_b32 v44, s4, 18
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s21, v33
+; SI-NEXT: v_readfirstlane_b32 s4, v33
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220
-; SI-NEXT: v_writelane_b32 v44, s4, 16
+; SI-NEXT: v_writelane_b32 v44, s4, 19
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s4, v34
+; SI-NEXT: v_writelane_b32 v44, s4, 20
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s40, v35
+; SI-NEXT: v_readfirstlane_b32 s4, v35
+; SI-NEXT: v_writelane_b32 v44, s4, 21
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_readfirstlane_b32 s61, v36
+; SI-NEXT: v_readfirstlane_b32 s4, v36
+; SI-NEXT: v_writelane_b32 v44, s4, 22
; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_readfirstlane_b32 s63, v37
+; SI-NEXT: v_readfirstlane_b32 s4, v37
+; SI-NEXT: v_writelane_b32 v44, s4, 23
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:216
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:212
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:208
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:204
-; SI-NEXT: v_writelane_b32 v44, s4, 17
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s59, v31
+; SI-NEXT: v_readfirstlane_b32 s4, v31
+; SI-NEXT: v_writelane_b32 v44, s4, 24
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s56, v38
+; SI-NEXT: v_readfirstlane_b32 s4, v38
+; SI-NEXT: v_writelane_b32 v44, s4, 25
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_readfirstlane_b32 s43, v39
+; SI-NEXT: v_readfirstlane_b32 s4, v39
+; SI-NEXT: v_writelane_b32 v44, s4, 26
; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_readfirstlane_b32 s46, v48
+; SI-NEXT: v_readfirstlane_b32 s4, v48
+; SI-NEXT: v_writelane_b32 v44, s4, 27
; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_readfirstlane_b32 s42, v49
+; SI-NEXT: v_readfirstlane_b32 s4, v49
+; SI-NEXT: v_writelane_b32 v44, s4, 28
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_readfirstlane_b32 s13, v50
+; SI-NEXT: v_readfirstlane_b32 s4, v50
+; SI-NEXT: v_writelane_b32 v44, s4, 29
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_readfirstlane_b32 s45, v51
+; SI-NEXT: v_readfirstlane_b32 s4, v51
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:196
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192
@@ -154908,45 +154923,47 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:184
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176
+; SI-NEXT: v_writelane_b32 v44, s4, 30
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s88, v32
+; SI-NEXT: v_readfirstlane_b32 s4, v32
+; SI-NEXT: v_writelane_b32 v44, s4, 31
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s79, v33
+; SI-NEXT: v_readfirstlane_b32 s4, v33
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172
; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168
+; SI-NEXT: v_writelane_b32 v44, s4, 32
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s4, v34
-; SI-NEXT: v_writelane_b32 v44, s4, 18
+; SI-NEXT: v_writelane_b32 v44, s4, 33
; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s4, v35
-; SI-NEXT: v_writelane_b32 v44, s4, 19
+; SI-NEXT: v_writelane_b32 v44, s4, 34
; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_readfirstlane_b32 s4, v36
-; SI-NEXT: v_writelane_b32 v44, s4, 20
+; SI-NEXT: v_writelane_b32 v44, s4, 35
; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_readfirstlane_b32 s4, v37
-; SI-NEXT: v_writelane_b32 v44, s4, 21
+; SI-NEXT: v_readfirstlane_b32 s43, v37
; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_readfirstlane_b32 s4, v31
-; SI-NEXT: v_writelane_b32 v44, s4, 22
+; SI-NEXT: v_writelane_b32 v44, s4, 36
; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_readfirstlane_b32 s4, v38
-; SI-NEXT: v_writelane_b32 v44, s4, 23
+; SI-NEXT: v_writelane_b32 v44, s4, 37
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_readfirstlane_b32 s4, v39
-; SI-NEXT: v_writelane_b32 v44, s4, 24
+; SI-NEXT: v_writelane_b32 v44, s4, 38
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_readfirstlane_b32 s4, v48
-; SI-NEXT: v_writelane_b32 v44, s4, 25
+; SI-NEXT: v_writelane_b32 v44, s4, 39
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_readfirstlane_b32 s4, v49
-; SI-NEXT: v_writelane_b32 v44, s4, 26
+; SI-NEXT: v_writelane_b32 v44, s4, 40
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_readfirstlane_b32 s4, v50
-; SI-NEXT: v_writelane_b32 v44, s4, 27
+; SI-NEXT: v_writelane_b32 v44, s4, 41
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_readfirstlane_b32 s4, v51
-; SI-NEXT: v_writelane_b32 v44, s4, 28
+; SI-NEXT: v_writelane_b32 v44, s4, 42
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144
; SI-NEXT: s_waitcnt vmcnt(3)
@@ -154962,41 +154979,31 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104
-; SI-NEXT: v_writelane_b32 v44, s4, 29
+; SI-NEXT: v_writelane_b32 v44, s4, 43
+; SI-NEXT: v_writelane_b32 v44, s22, 44
+; SI-NEXT: v_writelane_b32 v44, s6, 45
+; SI-NEXT: v_writelane_b32 v44, s23, 46
+; SI-NEXT: v_writelane_b32 v44, s20, 47
+; SI-NEXT: v_writelane_b32 v44, s58, 48
+; SI-NEXT: v_writelane_b32 v44, s47, 49
+; SI-NEXT: v_writelane_b32 v44, s40, 50
+; SI-NEXT: v_writelane_b32 v44, s25, 51
+; SI-NEXT: v_writelane_b32 v44, s29, 52
+; SI-NEXT: v_writelane_b32 v44, s57, 53
+; SI-NEXT: v_writelane_b32 v44, s62, 54
; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_readfirstlane_b32 s4, v52
-; SI-NEXT: v_writelane_b32 v44, s4, 30
-; SI-NEXT: v_readfirstlane_b32 s4, v53
-; SI-NEXT: v_writelane_b32 v44, s4, 31
-; SI-NEXT: v_readfirstlane_b32 s4, v54
-; SI-NEXT: v_writelane_b32 v44, s4, 32
-; SI-NEXT: v_readfirstlane_b32 s4, v55
-; SI-NEXT: v_writelane_b32 v44, s4, 33
-; SI-NEXT: v_writelane_b32 v44, s22, 34
-; SI-NEXT: v_writelane_b32 v44, s23, 35
-; SI-NEXT: v_writelane_b32 v44, s73, 36
-; SI-NEXT: v_writelane_b32 v44, s20, 37
-; SI-NEXT: v_writelane_b32 v44, s47, 38
-; SI-NEXT: v_writelane_b32 v44, s76, 39
-; SI-NEXT: v_writelane_b32 v44, s25, 40
-; SI-NEXT: v_writelane_b32 v44, s57, 41
-; SI-NEXT: v_writelane_b32 v44, s74, 42
-; SI-NEXT: v_writelane_b32 v44, s78, 43
-; SI-NEXT: v_writelane_b32 v44, s24, 44
-; SI-NEXT: v_writelane_b32 v44, s16, 45
-; SI-NEXT: v_writelane_b32 v44, s17, 46
-; SI-NEXT: v_writelane_b32 v44, s18, 47
-; SI-NEXT: v_writelane_b32 v44, s19, 48
-; SI-NEXT: v_writelane_b32 v44, s77, 49
-; SI-NEXT: v_writelane_b32 v44, s89, 50
-; SI-NEXT: v_writelane_b32 v44, s90, 51
-; SI-NEXT: v_writelane_b32 v44, s91, 52
-; SI-NEXT: v_writelane_b32 v44, s92, 53
-; SI-NEXT: v_writelane_b32 v44, s93, 54
-; SI-NEXT: v_writelane_b32 v44, s94, 55
-; SI-...
[truncated]
|
https://reviews.llvm.org/D52052 prevents register splitting in an MBB that has prolog instructions defining the exec register (or the mask register that activates the threads of a warp on a GPU). The constraint seems too strict, because 1) if the split is allowed, the value may fit into the free live range of a physical register, and no spill will happen; 2) the register class of the register being split may differ from that of the register defined in the prolog, so there is no interference with the register defined in the prolog. The current code has another small issue: MBB->getFirstNonDebugInstr() skips only debug instructions, but SA->getFirstSplitPoint(Number) skips label and PHI instructions. This causes some MBBs that contain a label instruction to be treated as having a prolog. This patch relaxes the split constraint on MBBs with a prolog by checking whether a register defined in the prolog has a common register class with the register being split. It allows the split if the register defined in the prolog is a physical register or if there is no common register class.
8864ba0 to
7be6505
Compare
| # Check that physreg candidate is not used since cannot be spilled in a block, | ||
| # e.g. before exec mask preamble | ||
| # CHECK: , cannot spill all interferences. | ||
| # CHECK-NOT: , cannot spill all interferences. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Where does it spill now? I am afraid we would have the same problem the original patch was fixing: spilling/reloading with a wrong exec mask.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It spills at the entry of the function, where there are no prolog instructions. The output remains the same after greedy register allocation.
It seems both SplitKit and InlineSpiller respect the prolog when inserting split or spill instructions, so correctness should be preserved.
https://github.com/llvm/llvm-project/blob/main/llvm/lib/CodeGen/SplitKit.cpp#L846
https://github.com/llvm/llvm-project/blob/main/llvm/lib/CodeGen/InlineSpiller.cpp#L466
bb.0:
successors: %bb.1(0x80000000)
liveins: $sgpr96_sgpr97, $sgpr98_sgpr99, $sgpr100_sgpr101, $sgpr102_sgpr103
dead %28:sreg_64 = COPY $sgpr102_sgpr103
%1:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
SI_SPILL_S128_SAVE %1, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.0, align 4, addrspace 5)
SI_SPILL_S128_SAVE %1, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.1, align 4, addrspace 5)
SI_SPILL_S128_SAVE %1, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.2, align 4, addrspace 5)
SI_SPILL_S128_SAVE %1, %stack.3, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.3, align 4, addrspace 5)
SI_SPILL_S128_SAVE %1, %stack.4, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.4, align 4, addrspace 5)
%7:sgpr_128 = COPY %1
%8:sgpr_128 = COPY %1
%9:sgpr_128 = COPY %1
%10:sgpr_128 = COPY %1
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Then where is the restore? Maybe add a second RUN line and generate full checks after greedy?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This -NOT check is basically worthless and breaks on any debug message change. This needs to check something that is emitted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Revised the test. The spill restore is placed after the `$exec = IMPLICIT_DEF`.
; CHECK-NEXT: bb.4:
; CHECK-NEXT: $exec = IMPLICIT_DEF
; CHECK-NEXT: [[SI_SPILL_S128_RESTORE:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.0, align 4, addrspace 5)
; CHECK-NEXT: S_CMP_EQ_U64 [[COPY1]].sub0_sub1, [[SI_SPILL_S128_RESTORE]].sub2_sub3, implicit-def $scc
; CHECK-NEXT: [[SI_SPILL_S128_RESTORE1:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.1, align 4, addrspace 5)
; CHECK-NEXT: [[SI_SPILL_S128_RESTORE2:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.2, align 4, addrspace 5)
; CHECK-NEXT: S_CMP_EQ_U64 [[SI_SPILL_S128_RESTORE1]].sub0_sub1, [[SI_SPILL_S128_RESTORE2]].sub2_sub3, implicit-def $scc
; CHECK-NEXT: [[SI_SPILL_S128_RESTORE3:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.3, align 4, addrspace 5)
; CHECK-NEXT: [[SI_SPILL_S128_RESTORE4:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.4, align 4, addrspace 5)
; CHECK-NEXT: S_CMP_EQ_U64 [[SI_SPILL_S128_RESTORE3]].sub0_sub1, [[SI_SPILL_S128_RESTORE4]].sub2_sub3, implicit-def $scc
🐧 Linux x64 Test Results
|
https://reviews.llvm.org/D52052 prevents register splitting in an MBB
that has prolog instructions defining the exec register (or the mask
register that activates the threads of a warp on a GPU). The constraint
seems too strict, because 1) if the split is allowed, the value may fit
into the free live range of a physical register, and no spill will happen;
2) the register class of the register being split may differ from that of
the register defined in the prolog, so there is no interference with the
register defined in the prolog.
The current code has another small issue: MBB->getFirstNonDebugInstr()
skips only debug instructions, but SA->getFirstSplitPoint(Number) skips
label and PHI instructions. This causes some MBBs that contain a label
instruction to be treated as having a prolog.
This patch relaxes the split constraint on MBBs with a prolog by checking
whether a register defined in the prolog has a common register class with
the register being split. It allows the split if the register defined in
the prolog is a physical register or if there is no common register class.