Skip to content

Commit 7d0af28

Browse files
Match and replace icmp ballot,0 with XOR
1 parent 94a4787 commit 7d0af28

File tree

2 files changed

+47
-60
lines changed

2 files changed

+47
-60
lines changed

llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp

Lines changed: 15 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -95,19 +95,10 @@ AMDGPUUniformIntrinsicCombinePass::run(Function &F,
9595

9696
bool AMDGPUUniformIntrinsicCombineImpl::run(Function &F) {
9797
bool IsChanged{false};
98-
Module *M = F.getParent();
99-
100-
// If none of the relevant intrinsics are declared, return early.
101-
// if (!Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_permlane64, {}) &&
102-
// !Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_readfirstlane, {}) &&
103-
// !Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_readlane, {}) &&
104-
// !Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_ballot, {})) {
105-
// return false;
106-
// }
10798

10899
// Iterate over each instruction in the function to get the desired intrinsic
109100
// inst to check for optimization.
110-
for (Instruction &I : instructions(F)) {
101+
for (Instruction &I : make_early_inc_range(instructions(F))) {
111102
if (auto *Intrinsic = dyn_cast<IntrinsicInst>(&I)) {
112103
IsChanged |= optimizeUniformIntrinsicInst(*Intrinsic);
113104
}
@@ -135,22 +126,24 @@ bool AMDGPUUniformIntrinsicCombineImpl::optimizeUniformIntrinsicInst(
135126
Value *Src = II.getArgOperand(0);
136127
if (UI->isDivergentUse(II.getOperandUse(0)))
137128
return false;
138-
139129
LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << "\n");
140130

141-
// Look for a direct `icmp eq` use of the ballot result.
142131
bool Changed = false;
143132
for (User *U : make_early_inc_range(II.users())) {
144-
if (match(U, m_ICmp(m_Specific(&II), m_Zero()))) {
145-
ICmpInst *ICmp = dyn_cast<ICmpInst>(U);
146-
IRBuilder<> Builder(ICmp);
147-
Value *ConvertedSrc = Builder.CreateZExtOrTrunc(Src, II.getType());
148-
149-
LLVM_DEBUG(dbgs() << "Replacing ballot result in icmp: " << *ICmp
150-
<< " with " << *ConvertedSrc << "\n");
151-
152-
ICmp->setOperand(0, ConvertedSrc);
153-
Changed = true;
133+
if (auto *ICmp = dyn_cast<ICmpInst>(U)) {
134+
Value *Op0 = ICmp->getOperand(0);
135+
Value *Op1 = ICmp->getOperand(1);
136+
137+
if (ICmp->getPredicate() == ICmpInst::ICMP_EQ &&
138+
((Op0 == &II && match(Op1, m_Zero())) ||
139+
(Op1 == &II && match(Op0, m_Zero())))) {
140+
141+
IRBuilder<> Builder(ICmp);
142+
Value *Xor = Builder.CreateXor(Src, Builder.getTrue());
143+
LLVM_DEBUG(dbgs() << "Replacing with XOR: " << *Xor << "\n");
144+
ICmp->replaceAllUsesWith(Xor);
145+
Changed = true;
146+
}
154147
}
155148
}
156149
return Changed;

llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll

Lines changed: 32 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -1,19 +1,19 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
22
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine -S < %s | FileCheck %s -check-prefix=PASS-CHECK
3-
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine,instcombine,early-cse,simplifycfg -S < %s | FileCheck %s -check-prefix=DCE-CHECK
3+
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-uniform-intrinsic-combine,early-cse,instcombine,simplifycfg -S < %s | FileCheck %s -check-prefix=DCE-CHECK
44

55
define protected amdgpu_kernel void @trivial_waterfall(ptr addrspace(1) %out) {
66
; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall(
77
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
88
; PASS-CHECK-NEXT: [[ENTRY:.*]]:
99
; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
1010
; PASS-CHECK: [[WHILE]]:
11-
; PASS-CHECK-NEXT: [[DONE1:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
12-
; PASS-CHECK-NEXT: [[DONE:%.*]] = xor i1 [[DONE1]], true
13-
; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[DONE]])
14-
; PASS-CHECK-NEXT: [[TMP0:%.*]] = zext i1 [[DONE]] to i64
15-
; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[TMP0]], 0
16-
; PASS-CHECK-NEXT: br i1 [[IS_DONE]], label %[[EXIT:.*]], label %[[IF]]
11+
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
12+
; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
13+
; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]])
14+
; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
15+
; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0
16+
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]]
1717
; PASS-CHECK: [[IF]]:
1818
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
1919
; PASS-CHECK-NEXT: br label %[[WHILE]]
@@ -49,18 +49,18 @@ define protected amdgpu_kernel void @waterfall(ptr addrspace(1) %out) {
4949
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
5050
; PASS-CHECK-NEXT: [[ENTRY:.*]]:
5151
; PASS-CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
52-
; PASS-CHECK-NEXT: [[TMP1:%.*]] = tail call noundef i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP0]])
52+
; PASS-CHECK-NEXT: [[TID:%.*]] = tail call noundef i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP0]])
5353
; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
5454
; PASS-CHECK: [[WHILE]]:
55-
; PASS-CHECK-NEXT: [[TMP3:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[TAIL:.*]] ]
56-
; PASS-CHECK-NEXT: [[TMP4:%.*]] = xor i1 [[TMP3]], true
57-
; PASS-CHECK-NEXT: [[TMP8:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP4]])
58-
; PASS-CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 0
59-
; PASS-CHECK-NEXT: br i1 [[TMP9]], label %[[EXIT:.*]], label %[[IF:.*]]
55+
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[TAIL:.*]] ]
56+
; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
57+
; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]])
58+
; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0
59+
; PASS-CHECK-NEXT: br i1 [[IS_DONE]], label %[[EXIT:.*]], label %[[IF:.*]]
6060
; PASS-CHECK: [[IF]]:
61-
; PASS-CHECK-NEXT: [[TMP12:%.*]] = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP1]])
62-
; PASS-CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP1]], [[TMP12]]
63-
; PASS-CHECK-NEXT: br i1 [[TMP13]], label %[[WORK:.*]], label %[[TAIL]]
61+
; PASS-CHECK-NEXT: [[FIRST_ACTIVE_ID:%.*]] = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TID]])
62+
; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 [[TID]], [[FIRST_ACTIVE_ID]]
63+
; PASS-CHECK-NEXT: br i1 [[IS_FIRST_ACTIVE_ID]], label %[[WORK:.*]], label %[[TAIL]]
6464
; PASS-CHECK: [[WORK]]:
6565
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
6666
; PASS-CHECK-NEXT: br label %[[TAIL]]
@@ -76,15 +76,15 @@ define protected amdgpu_kernel void @waterfall(ptr addrspace(1) %out) {
7676
; DCE-CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
7777
; DCE-CHECK-NEXT: br label %[[WHILE:.*]]
7878
; DCE-CHECK: [[WHILE]]:
79-
; DCE-CHECK-NEXT: [[TMP2:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[TMP12:%.*]], %[[TAIL:.*]] ]
80-
; DCE-CHECK-NEXT: [[TMP3:%.*]] = xor i1 [[TMP2]], true
81-
; DCE-CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP3]])
82-
; DCE-CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 0
83-
; DCE-CHECK-NEXT: br i1 [[TMP8]], label %[[EXIT:.*]], label %[[IF:.*]]
79+
; DCE-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[IS_FIRST_ACTIVE_ID:%.*]], %[[TAIL:.*]] ]
80+
; DCE-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
81+
; DCE-CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[NOT_DONE]])
82+
; DCE-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i32 [[TMP1]], 0
83+
; DCE-CHECK-NEXT: br i1 [[IS_DONE]], label %[[EXIT:.*]], label %[[IF:.*]]
8484
; DCE-CHECK: [[IF]]:
85-
; DCE-CHECK-NEXT: [[TMP11:%.*]] = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP0]])
86-
; DCE-CHECK-NEXT: [[TMP12]] = icmp eq i32 [[TMP0]], [[TMP11]]
87-
; DCE-CHECK-NEXT: br i1 [[TMP12]], label %[[WORK:.*]], label %[[TAIL]]
85+
; DCE-CHECK-NEXT: [[FIRST_ACTIVE_ID:%.*]] = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP0]])
86+
; DCE-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID]] = icmp eq i32 [[TMP0]], [[FIRST_ACTIVE_ID]]
87+
; DCE-CHECK-NEXT: br i1 [[IS_FIRST_ACTIVE_ID]], label %[[WORK:.*]], label %[[TAIL]]
8888
; DCE-CHECK: [[WORK]]:
8989
; DCE-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
9090
; DCE-CHECK-NEXT: br label %[[TAIL]]
@@ -122,29 +122,25 @@ exit:
122122
ret void
123123
}
124124

125-
define protected amdgpu_kernel void @trivial_waterfall_multiple_icmp(ptr addrspace(1) %out) {
126-
; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_multiple_icmp(
125+
define protected amdgpu_kernel void @trivial_waterfall_swap_op(ptr addrspace(1) %out) {
126+
; PASS-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_swap_op(
127127
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
128128
; PASS-CHECK-NEXT: [[ENTRY:.*]]:
129129
; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
130130
; PASS-CHECK: [[WHILE]]:
131131
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
132132
; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
133133
; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]])
134-
; PASS-CHECK-NEXT: [[TMP1:%.*]] = zext i1 [[NOT_DONE]] to i64
135-
; PASS-CHECK-NEXT: [[IS_DONE_1:%.*]] = icmp eq i64 [[TMP1]], 0
136-
; PASS-CHECK-NEXT: [[TMP0:%.*]] = zext i1 [[NOT_DONE]] to i64
137-
; PASS-CHECK-NEXT: [[IS_DONE_3:%.*]] = icmp eq i64 [[TMP0]], 0
138-
; PASS-CHECK-NEXT: br i1 [[IS_DONE_1]], label %[[EXIT:.*]], label %[[IF]]
134+
; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
135+
; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 0, [[BALLOT]]
136+
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]]
139137
; PASS-CHECK: [[IF]]:
140138
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
141-
; PASS-CHECK-NEXT: [[TMP2:%.*]] = zext i1 [[NOT_DONE]] to i64
142-
; PASS-CHECK-NEXT: [[IS_DONE_4:%.*]] = icmp eq i64 [[TMP2]], 0
143139
; PASS-CHECK-NEXT: br label %[[WHILE]]
144140
; PASS-CHECK: [[EXIT]]:
145141
; PASS-CHECK-NEXT: ret void
146142
;
147-
; DCE-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_multiple_icmp(
143+
; DCE-CHECK-LABEL: define protected amdgpu_kernel void @trivial_waterfall_swap_op(
148144
; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
149145
; DCE-CHECK-NEXT: [[ENTRY:.*:]]
150146
; DCE-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
@@ -157,13 +153,11 @@ while:
157153
%done = phi i1 [ 0, %entry ], [ 1, %if ]
158154
%not_done = xor i1 %done, true
159155
%ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %not_done)
160-
%is_done_1 = icmp eq i64 %ballot, 0
161-
%is_done_2 = icmp eq i64 %ballot, 0
162-
br i1 %is_done_1, label %exit, label %if
156+
%is_done = icmp eq i64 0, %ballot
157+
br i1 %is_done, label %exit, label %if
163158

164159
if:
165160
store i32 5, ptr addrspace(1) %out
166-
%is_done_3 = icmp eq i64 %ballot, 0
167161
br label %while
168162

169163
exit:

0 commit comments

Comments
 (0)