Skip to content

Commit e9d7dcc

Browse files
committed
[Uniformity] Fixed control-div early stop
Control-divergence finds joins by propagating labels from the divergent control branch. The code that checks the early stop for propagation is not correct in some cases. This change fixes this issue by stopping at the post-dominator of the successors of the divergent branch. #137277
1 parent 4b09eed commit e9d7dcc

File tree

4 files changed

+193
-35
lines changed

4 files changed

+193
-35
lines changed

llvm/include/llvm/ADT/GenericUniformityImpl.h

Lines changed: 29 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -610,9 +610,6 @@ template <typename ContextT> class DivergencePropagator {
610610
LLVM_DEBUG(dbgs() << "SDA:computeJoinPoints: "
611611
<< Context.print(&DivTermBlock) << "\n");
612612

613-
// Early stopping criterion
614-
int FloorIdx = CyclePOT.size() - 1;
615-
const BlockT *FloorLabel = nullptr;
616613
int DivTermIdx = CyclePOT.getIndex(&DivTermBlock);
617614

618615
// Bootstrap with branch targets
@@ -626,14 +623,36 @@ template <typename ContextT> class DivergencePropagator {
626623
LLVM_DEBUG(dbgs() << "\tImmediate divergent cycle exit: "
627624
<< Context.print(SuccBlock) << "\n");
628625
}
629-
auto SuccIdx = CyclePOT.getIndex(SuccBlock);
630626
visitEdge(*SuccBlock, *SuccBlock);
631-
FloorIdx = std::min<int>(FloorIdx, SuccIdx);
632627
}
633628

629+
// Return true if B is inside an irreducible cycle
630+
auto IsInIrreducibleCycle = [this](const BlockT *B) {
631+
for (const auto *Cycle = CI.getCycle(B); Cycle;
632+
Cycle = Cycle->getParentCycle()) {
633+
if (!Cycle->isReducible())
634+
return true;
635+
}
636+
return false;
637+
};
638+
639+
// Technically propagation can continue until it reaches the last node.
640+
//
641+
// For efficiency, propagation can just stop at the IPD (immediate
642+
// post-dominator) of successors(DivTermBlock) for any reducible graph.
643+
// If FreshLabels.count()=1, the block in FreshLabels should be the IPD.
644+
//
645+
// For an irreducible cycle, propagation first continues until it has left
646+
// all irreducible cycles, and then stops when FreshLabels.count()=1.
634647
while (true) {
635648
auto BlockIdx = FreshLabels.find_last();
636-
if (BlockIdx == -1 || BlockIdx < FloorIdx)
649+
if (BlockIdx == -1)
650+
break;
651+
652+
const auto *Block = CyclePOT[BlockIdx];
653+
// If no irreducible cycle, stop if freshLable.count() = 1 and Block
654+
// is the IPD. If it is in any irreducible cycle, continue propagation.
655+
if (FreshLabels.count() == 1 && !IsInIrreducibleCycle(Block))
637656
break;
638657

639658
LLVM_DEBUG(dbgs() << "Current labels:\n"; printDefs(dbgs()));
@@ -644,16 +663,12 @@ template <typename ContextT> class DivergencePropagator {
644663
continue;
645664
}
646665

647-
const auto *Block = CyclePOT[BlockIdx];
648666
LLVM_DEBUG(dbgs() << "visiting " << Context.print(Block) << " at index "
649667
<< BlockIdx << "\n");
650668

651669
const auto *Label = BlockLabels[Block];
652670
assert(Label);
653671

654-
bool CausedJoin = false;
655-
int LoweredFloorIdx = FloorIdx;
656-
657672
// If the current block is the header of a reducible cycle that
658673
// contains the divergent branch, then the label should be
659674
// propagated to the cycle exits. Such a header is the "last
@@ -681,28 +696,11 @@ template <typename ContextT> class DivergencePropagator {
681696
if (const auto *BlockCycle = getReducibleParent(Block)) {
682697
SmallVector<BlockT *, 4> BlockCycleExits;
683698
BlockCycle->getExitBlocks(BlockCycleExits);
684-
for (auto *BlockCycleExit : BlockCycleExits) {
685-
CausedJoin |= visitCycleExitEdge(*BlockCycleExit, *Label);
686-
LoweredFloorIdx =
687-
std::min<int>(LoweredFloorIdx, CyclePOT.getIndex(BlockCycleExit));
688-
}
699+
for (auto *BlockCycleExit : BlockCycleExits)
700+
visitCycleExitEdge(*BlockCycleExit, *Label);
689701
} else {
690-
for (const auto *SuccBlock : successors(Block)) {
691-
CausedJoin |= visitEdge(*SuccBlock, *Label);
692-
LoweredFloorIdx =
693-
std::min<int>(LoweredFloorIdx, CyclePOT.getIndex(SuccBlock));
694-
}
695-
}
696-
697-
// Floor update
698-
if (CausedJoin) {
699-
// 1. Different labels pushed to successors
700-
FloorIdx = LoweredFloorIdx;
701-
} else if (FloorLabel != Label) {
702-
// 2. No join caused BUT we pushed a label that is different than the
703-
// last pushed label
704-
FloorIdx = LoweredFloorIdx;
705-
FloorLabel = Label;
702+
for (const auto *SuccBlock : successors(Block))
703+
visitEdge(*SuccBlock, *Label);
706704
}
707705
}
708706

llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -126,10 +126,10 @@ exit:
126126
;; only the inner cycle is reported as diverged.
127127
;;
128128
;; CHECK-LABEL: UniformityInfo for function 'headers_b_t':
129-
;; CHECK: CYCLES ASSSUMED DIVERGENT:
130-
;; CHECK: depth=2: entries(T P) S Q R
131-
;; CHECK: CYCLES WITH DIVERGENT EXIT:
132-
;; CHECK: depth=1: entries(B A) D T S Q P R C
129+
;; NOCHECK: CYCLES ASSSUMED DIVERGENT:
130+
;; NOCHECK: depth=2: entries(T P) S Q R
131+
;; NOCHECK: CYCLES WITH DIVERGENT EXIT:
132+
;; NOCHECK: depth=1: entries(B A) D T S Q P R C
133133

134134
define amdgpu_kernel void @headers_b_t(i32 %a, i32 %b, i32 %c) {
135135
entry:
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
;
2+
; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
3+
;
4+
; This is to test an if-then-else case with some unmerged basic blocks
5+
; (https://github.com/llvm/llvm-project/issues/137277)
6+
;
7+
; Entry (div.cond)
8+
; / \
9+
; B0 B3
10+
; | |
11+
; B1 B4
12+
; | |
13+
; B2 B5
14+
; \ /
15+
; B6 (phi: divergent)
16+
;
17+
18+
19+
; CHECK-LABEL: 'test_ctrl_divergence':
20+
; CHECK-LABEL: BLOCK Entry
21+
; CHECK: DIVERGENT: %div.cond = icmp eq i32 %tid, 0
22+
; CHECK: DIVERGENT: br i1 %div.cond, label %B3, label %B0
23+
;
24+
; CHECK-LABEL: BLOCK B6
25+
; CHECK: DIVERGENT: %div_a = phi i32 [ %a0, %B2 ], [ %a1, %B5 ]
26+
; CHECK: DIVERGENT: %div_b = phi i32 [ %b0, %B2 ], [ %b1, %B5 ]
27+
; CHECK: DIVERGENT: %div_c = phi i32 [ %c0, %B2 ], [ %c1, %B5 ]
28+
29+
30+
define amdgpu_kernel void @test_ctrl_divergence(i32 %a, i32 %b, i32 %c, i32 %d) {
31+
Entry:
32+
%tid = call i32 @llvm.amdgcn.workitem.id.x()
33+
%div.cond = icmp eq i32 %tid, 0
34+
br i1 %div.cond, label %B3, label %B0 ; divergent branch
35+
36+
B0:
37+
%a0 = add i32 %a, 1
38+
br label %B1
39+
40+
B1:
41+
%b0 = add i32 %b, 2
42+
br label %B2
43+
44+
B2:
45+
%c0 = add i32 %c, 3
46+
br label %B6
47+
48+
B3:
49+
%a1 = add i32 %a, 10
50+
br label %B4
51+
52+
B4:
53+
%b1 = add i32 %b, 20
54+
br label %B5
55+
56+
B5:
57+
%c1 = add i32 %c, 30
58+
br label %B6
59+
60+
B6:
61+
%div_a = phi i32 [%a0, %B2], [%a1, %B5]
62+
%div_b = phi i32 [%b0, %B2], [%b1, %B5]
63+
%div_c = phi i32 [%c0, %B2], [%c1, %B5]
64+
br i1 %div.cond, label %B8, label %B7 ; divergent branch
65+
66+
B7:
67+
%d1 = add i32 %d, 1
68+
br label %B8
69+
70+
B8:
71+
%div_d = phi i32 [%d1, %B7], [%d, %B6]
72+
ret void
73+
}
74+
75+
76+
declare i32 @llvm.amdgcn.workitem.id.x() #0
77+
78+
attributes #0 = {nounwind readnone }
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
;
2+
; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
3+
;
4+
; This is to test a divergent phi involving loops
5+
; (https://github.com/llvm/llvm-project/issues/137277).
6+
;
7+
; B0 (div.cond)
8+
; / \
9+
; (L)B1 B4
10+
; | |
11+
; B2 B5 (L)
12+
; | |
13+
; B3 /
14+
; \ /
15+
; B6 (phi: divergent)
16+
;
17+
18+
;
19+
; CHECK-LABEL: UniformityInfo for function 'test_loop_ctrl_divergence':
20+
; CHECK-LABEL: BLOCK Entry
21+
; CHECK: DIVERGENT: %tid = call i32 @llvm.amdgcn.workitem.id.x()
22+
; CHECK-LABEL: BLOCK B0
23+
; CHECK: DIVERGENT: %div.cond = icmp eq i32 %tid, 0
24+
; CHECK-LABEL: BLOCK B3
25+
; CHECK: %uni_a = phi i32 [ %a1, %B2 ], [ %a, %Entry ]
26+
; CHECK-LABEL: BLOCK B5
27+
; CHECK: %uni.a3 = phi i32 [ %a2, %B4 ], [ %uni_a3, %B5 ]
28+
; CHECK-LABEL: BLOCK B6
29+
; CHECK: DIVERGENT: %div_a = phi i32 [ %uni_a, %B3 ], [ %uni_a3, %B5 ]
30+
;
31+
32+
define amdgpu_kernel void @test_loop_ctrl_divergence(i32 %a, i32 %b, i32 %c, i32 %d) {
33+
Entry:
34+
%tid = call i32 @llvm.amdgcn.workitem.id.x()
35+
%uni.cond0 = icmp eq i32 %d, 0
36+
br i1 %uni.cond0, label %B3, label %B0 ; uniform branch
37+
38+
B0:
39+
%div.cond = icmp eq i32 %tid, 0
40+
br i1 %div.cond, label %B4, label %B1 ; divergent branch
41+
42+
B1:
43+
%uni.a0 = phi i32 [%a, %B0], [%a0, %B1]
44+
%a0 = add i32 %uni.a0, 1
45+
%uni.cond1 = icmp slt i32 %a0, %b
46+
br i1 %uni.cond1, label %B1, label %B2
47+
48+
B2:
49+
%a1 = add i32 %a0, 10
50+
br label %B3
51+
52+
B3:
53+
%uni_a = phi i32 [%a1, %B2], [%a, %Entry]
54+
br label %B6
55+
56+
B4:
57+
%a2 = add i32 %a, 20
58+
br label %B5
59+
60+
B5:
61+
%uni.a3= phi i32 [%a2, %B4], [%uni_a3, %B5]
62+
%uni_a3 = add i32 %uni.a3, 1
63+
%uni.cond2 = icmp slt i32 %uni_a3, %c
64+
br i1 %uni.cond2, label %B5, label %B6
65+
66+
B6:
67+
%div_a = phi i32 [%uni_a, %B3], [%uni_a3, %B5] ; divergent
68+
%div.cond2 = icmp eq i32 %tid, 2
69+
br i1 %div.cond2, label %B7, label %B8 ; divergent branch
70+
71+
B7:
72+
%c0 = add i32 %div_a, 2 ; divergent
73+
br label %B8
74+
75+
B8:
76+
%ret = phi i32 [%c0, %B7], [0, %B6] ; divergent
77+
ret void
78+
}
79+
80+
declare i32 @llvm.amdgcn.workitem.id.x() #0
81+
82+
attributes #0 = {nounwind readnone }

0 commit comments

Comments
 (0)