Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,10 @@ class LLVM_ABI TargetSubtargetInfo : public MCSubtargetInfo {
/// can be overridden.
virtual bool enableJoinGlobalCopies() const;

/// Temporary opt-in hook for the register coalescer's terminal rule.
///
/// Returns false by default; a subtarget overrides this to return true to
/// enable the rule. An explicit -terminal-rule command line option takes
/// precedence over the value returned here.
///
/// This is a hack that exists only to bring up the option. The rule should be
/// unconditionally enabled: once all targets have enabled it, delete this
/// hook.
virtual bool enableTerminalRule() const { return false; }

/// True if the subtarget should run a scheduler after register allocation.
///
/// By default this queries the PostRAScheduling bit in the scheduling model
Expand Down
12 changes: 9 additions & 3 deletions llvm/lib/CodeGen/RegisterCoalescer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,9 @@ static cl::opt<bool> EnableJoining("join-liveintervals",
cl::desc("Coalesce copies (default=true)"),
cl::init(true), cl::Hidden);

static cl::opt<bool> UseTerminalRule("terminal-rule",
cl::desc("Apply the terminal rule"),
cl::init(false), cl::Hidden);
/// Tri-state override for the terminal rule. When the option is left unset,
/// the subtarget's enableTerminalRule() hook decides whether the rule is
/// applied; otherwise the explicit true/false value is used.
static cl::opt<cl::boolOrDefault>
EnableTerminalRule("terminal-rule", cl::desc("Apply the terminal rule"),
cl::init(cl::BOU_UNSET), cl::Hidden);

/// Temporary flag to test critical edge unsplitting.
static cl::opt<bool> EnableJoinSplits(
Expand Down Expand Up @@ -134,6 +134,7 @@ class RegisterCoalescer : private LiveRangeEdit::Delegate {
SlotIndexes *SI = nullptr;
const MachineLoopInfo *Loops = nullptr;
RegisterClassInfo RegClassInfo;
/// Whether the terminal rule is applied. Assigned in run() from the
/// -terminal-rule option, falling back to the subtarget's
/// enableTerminalRule() hook when the option is unset.
bool UseTerminalRule = false;

/// Position and VReg of a PHI instruction during coalescing.
struct PHIValPos {
Expand Down Expand Up @@ -4312,6 +4313,11 @@ bool RegisterCoalescer::run(MachineFunction &fn) {
else
JoinGlobalCopies = (EnableGlobalCopies == cl::BOU_TRUE);

if (EnableTerminalRule == cl::BOU_UNSET)
UseTerminalRule = STI.enableTerminalRule();
else
UseTerminalRule = EnableTerminalRule == cl::BOU_TRUE;

// If there are PHIs tracked by debug-info, they will need updating during
// coalescing. Build an index of those PHIs to ease updating.
SlotIndexes *Slots = LIS->getSlotIndexes();
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -1040,6 +1040,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return true;
}

/// Opt in to the register coalescer's terminal rule; the
/// TargetSubtargetInfo default is false.
bool enableTerminalRule() const override { return true; }

bool useAA() const override;

bool enableSubRegLiveness() const override {
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/R600Subtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,8 @@ class R600Subtarget final : public R600GenSubtargetInfo,
return true;
}

/// Opt in to the register coalescer's terminal rule; the
/// TargetSubtargetInfo default is false.
bool enableTerminalRule() const override { return true; }

bool enableSubRegLiveness() const override {
return true;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,14 @@ define void @divergent_i1_phi_used_outside_loop(float %val, float %pre.cond.val,
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s6
; GFX10-NEXT: s_mov_b32 s8, exec_lo
; GFX10-NEXT: s_mov_b32 s9, s5
; GFX10-NEXT: s_add_i32 s6, s6, 1
; GFX10-NEXT: s_xor_b32 s8, s5, s8
; GFX10-NEXT: s_xor_b32 s5, s5, s8
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v0
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo
; GFX10-NEXT: s_and_b32 s9, exec_lo, s5
; GFX10-NEXT: s_mov_b32 s5, s8
; GFX10-NEXT: s_or_b32 s7, s7, s9
; GFX10-NEXT: s_and_b32 s8, exec_lo, s9
; GFX10-NEXT: s_or_b32 s7, s7, s8
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB0_1
; GFX10-NEXT: ; %bb.2: ; %exit
Expand Down Expand Up @@ -240,11 +240,11 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_mov_b32 s8, 0
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_and_saveexec_b32 s7, vcc_lo
; GFX10-NEXT: s_cbranch_execz .LBB4_6
; GFX10-NEXT: ; %bb.1: ; %loop.start.preheader
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_mov_b32 s8, 0
; GFX10-NEXT: ; implicit-def: $sgpr10
; GFX10-NEXT: ; implicit-def: $sgpr11
; GFX10-NEXT: ; implicit-def: $sgpr9
Expand Down Expand Up @@ -345,8 +345,8 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
; GFX10-LABEL: divergent_i1_icmp_used_outside_loop:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_mov_b32 s6, 0
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_mov_b32 s6, 0
; GFX10-NEXT: ; implicit-def: $sgpr7
; GFX10-NEXT: s_branch .LBB5_2
; GFX10-NEXT: .LBB5_1: ; %Flow
Expand Down Expand Up @@ -457,8 +457,8 @@ define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa
; GFX10-LABEL: divergent_i1_freeze_used_outside_loop:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_mov_b32 s1, exec_lo
; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: s_mov_b32 s0, 0
; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: ; implicit-def: $sgpr4
; GFX10-NEXT: ; implicit-def: $sgpr3
; GFX10-NEXT: s_branch .LBB6_2
Expand Down Expand Up @@ -534,8 +534,8 @@ exit:
define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %a.break) {
; GFX10-LABEL: loop_with_1break:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_mov_b32 s0, 0
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: ; implicit-def: $sgpr6
; GFX10-NEXT: ; implicit-def: $sgpr7
; GFX10-NEXT: ; implicit-def: $sgpr5
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,8 @@ exit:
define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a) {
; GFX10-LABEL: loop_with_1break:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_mov_b32 s0, 0
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: ; implicit-def: $sgpr5
; GFX10-NEXT: s_branch .LBB2_2
; GFX10-NEXT: .LBB2_1: ; %Flow
Expand Down Expand Up @@ -180,8 +180,8 @@ exit:
define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b) {
; GFX10-LABEL: loop_with_2breaks:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_mov_b32 s0, 0
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: ; implicit-def: $sgpr5
; GFX10-NEXT: s_branch .LBB3_3
; GFX10-NEXT: .LBB3_1: ; %Flow3
Expand Down Expand Up @@ -278,8 +278,8 @@ exit:
define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c) {
; GFX10-LABEL: loop_with_3breaks:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_mov_b32 s0, 0
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: ; implicit-def: $sgpr5
; GFX10-NEXT: s_branch .LBB4_4
; GFX10-NEXT: .LBB4_1: ; %Flow5
Expand Down Expand Up @@ -404,8 +404,8 @@ exit:
define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %a.break) {
; GFX10-LABEL: loop_with_div_break_with_body:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_mov_b32 s0, 0
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: ; implicit-def: $sgpr6
; GFX10-NEXT: ; implicit-def: $sgpr7
; GFX10-NEXT: ; implicit-def: $sgpr5
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,8 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, i32 %x.size, ptr ad
; GFX10-LABEL: loop_with_1break:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_mov_b32_e32 v3, 0
; GFX10-NEXT: s_mov_b32 s8, 0
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_mov_b32 s8, 0
; GFX10-NEXT: ; implicit-def: $sgpr10
; GFX10-NEXT: ; implicit-def: $sgpr9
; GFX10-NEXT: s_branch .LBB2_3
Expand Down Expand Up @@ -197,14 +197,14 @@ define void @nested_loops_temporal_divergence_inner(float %pre.cond.val, i32 %n.
; GFX10-LABEL: nested_loops_temporal_divergence_inner:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_lt_f32_e64 s8, 1.0, v0
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v0
; GFX10-NEXT: s_mov_b32 s6, 0
; GFX10-NEXT: s_mov_b32 s8, 0
; GFX10-NEXT: .LBB3_1: ; %OuterHeader
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB3_2 Depth 2
; GFX10-NEXT: s_ashr_i32 s7, s6, 31
; GFX10-NEXT: s_mov_b32 s4, s8
; GFX10-NEXT: s_mov_b32 s4, s5
; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], 2
; GFX10-NEXT: ; implicit-def: $sgpr9
; GFX10-NEXT: v_mov_b32_e32 v6, s10
Expand Down Expand Up @@ -239,13 +239,13 @@ define void @nested_loops_temporal_divergence_inner(float %pre.cond.val, i32 %n.
; GFX10-NEXT: s_add_i32 s6, s6, 1
; GFX10-NEXT: v_add_co_u32 v6, s4, v4, v6
; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v5, v7, s4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
; GFX10-NEXT: flat_store_byte v[6:7], v0
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_cbranch_execnz .LBB3_1
; GFX10-NEXT: ; %bb.4: ; %exit
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
entry:
Expand Down Expand Up @@ -288,14 +288,14 @@ define void @nested_loops_temporal_divergence_outer(float %pre.cond.val, i32 %n.
; GFX10-LABEL: nested_loops_temporal_divergence_outer:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_lt_f32_e64 s8, 1.0, v0
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v0
; GFX10-NEXT: s_mov_b32 s6, 0
; GFX10-NEXT: s_mov_b32 s8, 0
; GFX10-NEXT: .LBB4_1: ; %OuterHeader
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB4_2 Depth 2
; GFX10-NEXT: s_ashr_i32 s7, s6, 31
; GFX10-NEXT: s_mov_b32 s4, s8
; GFX10-NEXT: s_mov_b32 s4, s5
; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], 2
; GFX10-NEXT: ; implicit-def: $sgpr9
; GFX10-NEXT: v_mov_b32_e32 v6, s10
Expand Down Expand Up @@ -330,13 +330,13 @@ define void @nested_loops_temporal_divergence_outer(float %pre.cond.val, i32 %n.
; GFX10-NEXT: s_add_i32 s6, s6, 1
; GFX10-NEXT: v_add_co_u32 v6, s4, v4, v6
; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v5, v7, s4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
; GFX10-NEXT: flat_store_byte v[6:7], v0
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_cbranch_execnz .LBB4_1
; GFX10-NEXT: ; %bb.4: ; %exit
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
entry:
Expand Down Expand Up @@ -379,15 +379,15 @@ define void @nested_loops_temporal_divergence_both(float %pre.cond.val, i32 %n.i
; GFX10-LABEL: nested_loops_temporal_divergence_both:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_lt_f32_e64 s8, 1.0, v0
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v0
; GFX10-NEXT: s_mov_b32 s6, 0
; GFX10-NEXT: s_mov_b32 s8, 0
; GFX10-NEXT: ; implicit-def: $sgpr9
; GFX10-NEXT: .LBB5_1: ; %OuterHeader
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB5_2 Depth 2
; GFX10-NEXT: s_ashr_i32 s7, s6, 31
; GFX10-NEXT: s_mov_b32 s4, s8
; GFX10-NEXT: s_mov_b32 s4, s5
; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], 2
; GFX10-NEXT: v_mov_b32_e32 v8, s10
; GFX10-NEXT: v_mov_b32_e32 v9, s11
Expand Down Expand Up @@ -421,13 +421,13 @@ define void @nested_loops_temporal_divergence_both(float %pre.cond.val, i32 %n.i
; GFX10-NEXT: s_add_i32 s6, s6, 1
; GFX10-NEXT: v_add_co_u32 v8, s4, v4, v8
; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v5, v9, s4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
; GFX10-NEXT: flat_store_byte v[8:9], v0
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_cbranch_execnz .LBB5_1
; GFX10-NEXT: ; %bb.4: ; %exit
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: flat_store_byte v[6:7], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll
Original file line number Diff line number Diff line change
Expand Up @@ -547,8 +547,8 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %
;
; NEW_RBS-LABEL: loop_with_2breaks:
; NEW_RBS: ; %bb.0: ; %entry
; NEW_RBS-NEXT: s_mov_b32 s4, 0
; NEW_RBS-NEXT: s_mov_b32 s0, 0
; NEW_RBS-NEXT: s_mov_b32 s4, 0
; NEW_RBS-NEXT: ; implicit-def: $sgpr5
; NEW_RBS-NEXT: s_branch .LBB16_3
; NEW_RBS-NEXT: .LBB16_1: ; %Flow3
Expand Down
Loading