Skip to content

Commit 20532c0

Browse files
[AMDGPU] make AMDGPUUniformIntrinsicCombine a function pass (#165265)
There has been an issue (using a function analysis inside a module pass under the OPM, the legacy pass manager) with integrating this pass into the llc pipeline, which currently lacks NPM support. I tried to find a way to get the per-function analysis from within the module pass, but the OPM does not appear to offer that option. So the best approach is to make it a function pass. Ref: #116953
1 parent 0926265 commit 20532c0

File tree

6 files changed

+85
-28
lines changed

6 files changed

+85
-28
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -562,9 +562,13 @@ class AMDGPURewriteAGPRCopyMFMAPass
562562
void initializeAMDGPURewriteAGPRCopyMFMALegacyPass(PassRegistry &);
563563
extern char &AMDGPURewriteAGPRCopyMFMALegacyID;
564564

565+
void initializeAMDGPUUniformIntrinsicCombineLegacyPass(PassRegistry &);
566+
extern char &AMDGPUUniformIntrinsicCombineLegacyPassID;
567+
FunctionPass *createAMDGPUUniformIntrinsicCombineLegacyPass();
568+
565569
struct AMDGPUUniformIntrinsicCombinePass
566570
: public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> {
567-
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
571+
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
568572
};
569573

570574
namespace AMDGPU {

llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(
3030
MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
3131
MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
3232
MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
33-
MODULE_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass())
3433
#undef MODULE_PASS
3534

3635
#ifndef MODULE_PASS_WITH_PARAMS
@@ -69,6 +68,7 @@ FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes",
6968
AMDGPUUnifyDivergentExitNodesPass())
7069
FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass())
7170
FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this)))
71+
FUNCTION_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass())
7272
#undef FUNCTION_PASS
7373

7474
#ifndef FUNCTION_ANALYSIS

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -619,6 +619,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
619619
initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR);
620620
initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR);
621621
initializeAMDGPUPreloadKernelArgumentsLegacyPass(*PR);
622+
initializeAMDGPUUniformIntrinsicCombineLegacyPass(*PR);
622623
}
623624

624625
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -887,9 +888,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
887888

888889
if (EarlyInlineAll && !EnableFunctionCalls)
889890
PM.addPass(AMDGPUAlwaysInlinePass());
890-
891-
if (EnableUniformIntrinsicCombine)
892-
PM.addPass(AMDGPUUniformIntrinsicCombinePass());
893891
});
894892

895893
PB.registerPeepholeEPCallback(
@@ -900,6 +898,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
900898
FPM.addPass(AMDGPUUseNativeCallsPass());
901899
if (EnableLibCallSimplify)
902900
FPM.addPass(AMDGPUSimplifyLibCallsPass());
901+
902+
if (EnableUniformIntrinsicCombine)
903+
FPM.addPass(AMDGPUUniformIntrinsicCombinePass());
903904
});
904905

905906
PB.registerCGSCCOptimizerLateEPCallback(

llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp

Lines changed: 55 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,6 @@
1616
/// uniformity. And every instruction that's downstream and cares about dynamic
1717
/// uniformity must be convergent (and isel will introduce v_readfirstlane for
1818
/// them if their operands can't be proven statically uniform).
19-
///
20-
/// This pass is implemented as a ModulePass because intrinsic declarations
21-
/// exist at the module scope, allowing us to skip processing entirely if no
22-
/// declarations are present and to traverse their user lists directly when
23-
/// they are. A FunctionPass would instead require scanning every instruction
24-
/// in every function to find relevant intrinsics, which is far less efficient.
2519
//===----------------------------------------------------------------------===//
2620

2721
#include "AMDGPU.h"
@@ -97,14 +91,12 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
9791
Tracker[NotOp] = true; // NOT preserves uniformity
9892
LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n');
9993
ICmp->replaceAllUsesWith(NotOp);
100-
ICmp->eraseFromParent();
10194
Changed = true;
10295
} else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) {
10396
// Case: (icmp ne %ballot, 0) -> %ballot_arg
10497
LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: "
10598
<< *Src << '\n');
10699
ICmp->replaceAllUsesWith(Src);
107-
ICmp->eraseFromParent();
108100
Changed = true;
109101
}
110102
}
@@ -120,15 +112,17 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
120112
return false;
121113
}
122114

123-
/// Iterates over intrinsic declarations in the module to optimize their uses.
124-
static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) {
115+
/// Iterates over intrinsic calls in the Function to optimize.
116+
static bool runUniformIntrinsicCombine(Function &F, const UniformityInfo &UI) {
125117
bool IsChanged = false;
126118
ValueMap<const Value *, bool> Tracker;
127119

128-
FunctionAnalysisManager &FAM =
129-
AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
130-
for (Function &F : M) {
131-
switch (F.getIntrinsicID()) {
120+
for (Instruction &I : make_early_inc_range(instructions(F))) {
121+
auto *II = dyn_cast<IntrinsicInst>(&I);
122+
if (!II)
123+
continue;
124+
125+
switch (II->getIntrinsicID()) {
132126
case Intrinsic::amdgcn_permlane64:
133127
case Intrinsic::amdgcn_readfirstlane:
134128
case Intrinsic::amdgcn_readlane:
@@ -137,23 +131,61 @@ static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) {
137131
default:
138132
continue;
139133
}
140-
141-
for (User *U : make_early_inc_range(F.users())) {
142-
auto *II = cast<IntrinsicInst>(U);
143-
Function *ParentF = II->getFunction();
144-
const auto &UI = FAM.getResult<UniformityInfoAnalysis>(*ParentF);
145-
IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker);
146-
}
134+
IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker);
147135
}
148136
return IsChanged;
149137
}
150138

151139
PreservedAnalyses
152-
AMDGPUUniformIntrinsicCombinePass::run(Module &M, ModuleAnalysisManager &AM) {
153-
if (!runUniformIntrinsicCombine(M, AM))
140+
AMDGPUUniformIntrinsicCombinePass::run(Function &F,
141+
FunctionAnalysisManager &AM) {
142+
const auto &UI = AM.getResult<UniformityInfoAnalysis>(F);
143+
if (!runUniformIntrinsicCombine(F, UI))
154144
return PreservedAnalyses::all();
155145

156146
PreservedAnalyses PA;
157147
PA.preserve<UniformityInfoAnalysis>();
158148
return PA;
159149
}
150+
151+
namespace {
152+
class AMDGPUUniformIntrinsicCombineLegacy : public FunctionPass {
153+
public:
154+
static char ID;
155+
AMDGPUUniformIntrinsicCombineLegacy() : FunctionPass(ID) {
156+
initializeAMDGPUUniformIntrinsicCombineLegacyPass(
157+
*PassRegistry::getPassRegistry());
158+
}
159+
160+
private:
161+
bool runOnFunction(Function &F) override;
162+
void getAnalysisUsage(AnalysisUsage &AU) const override {
163+
AU.setPreservesCFG();
164+
AU.addRequired<UniformityInfoWrapperPass>();
165+
AU.addRequired<TargetPassConfig>();
166+
}
167+
};
168+
} // namespace
169+
170+
char AMDGPUUniformIntrinsicCombineLegacy::ID = 0;
171+
char &llvm::AMDGPUUniformIntrinsicCombineLegacyPassID =
172+
AMDGPUUniformIntrinsicCombineLegacy::ID;
173+
174+
bool AMDGPUUniformIntrinsicCombineLegacy::runOnFunction(Function &F) {
175+
if (skipFunction(F))
176+
return false;
177+
const UniformityInfo &UI =
178+
getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
179+
return runUniformIntrinsicCombine(F, UI);
180+
}
181+
182+
INITIALIZE_PASS_BEGIN(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE,
183+
"AMDGPU Uniform Intrinsic Combine", false, false)
184+
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
185+
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
186+
INITIALIZE_PASS_END(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE,
187+
"AMDGPU Uniform Intrinsic Combine", false, false)
188+
189+
FunctionPass *llvm::createAMDGPUUniformIntrinsicCombineLegacyPass() {
190+
return new AMDGPUUniformIntrinsicCombineLegacy();
191+
}

llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,9 @@ define protected amdgpu_kernel void @trivial_waterfall_eq_zero(ptr addrspace(1)
2323
; PASS-CHECK: [[WHILE]]:
2424
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
2525
; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
26+
; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]])
2627
; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
28+
; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0
2729
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]]
2830
; PASS-CHECK: [[IF]]:
2931
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
@@ -75,7 +77,9 @@ define protected amdgpu_kernel void @trivial_waterfall_eq_zero_swap_op(ptr addrs
7577
; PASS-CHECK: [[WHILE]]:
7678
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
7779
; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
80+
; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]])
7881
; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
82+
; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 0, [[BALLOT]]
7983
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]]
8084
; PASS-CHECK: [[IF]]:
8185
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
@@ -126,6 +130,8 @@ define protected amdgpu_kernel void @trivial_waterfall_ne_zero(ptr addrspace(1)
126130
; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
127131
; PASS-CHECK: [[WHILE]]:
128132
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
133+
; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[DONE]])
134+
; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp ne i64 0, [[BALLOT]]
129135
; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]]
130136
; PASS-CHECK: [[IF]]:
131137
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
@@ -175,6 +181,8 @@ define protected amdgpu_kernel void @trivial_waterfall_ne_zero_swap(ptr addrspac
175181
; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
176182
; PASS-CHECK: [[WHILE]]:
177183
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
184+
; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[DONE]])
185+
; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp ne i64 [[BALLOT]], 0
178186
; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]]
179187
; PASS-CHECK: [[IF]]:
180188
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
@@ -225,7 +233,9 @@ define protected amdgpu_kernel void @trivial_uniform_waterfall(ptr addrspace(1)
225233
; PASS-CHECK: [[WHILE]]:
226234
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[TAIL:.*]] ]
227235
; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
236+
; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]])
228237
; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
238+
; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0
229239
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]]
230240
; PASS-CHECK: [[IF]]:
231241
; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 0, 0
@@ -292,7 +302,9 @@ define protected amdgpu_kernel void @uniform_waterfall(ptr addrspace(1) %out, i3
292302
; PASS-CHECK: [[WHILE]]:
293303
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[TAIL:.*]] ]
294304
; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
305+
; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]])
295306
; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
307+
; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0
296308
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]]
297309
; PASS-CHECK: [[IF]]:
298310
; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 [[MYMASK]], [[MYMASK]]
@@ -359,7 +371,9 @@ define protected amdgpu_kernel void @trivial_waterfall_eq_zero_i32(ptr addrspace
359371
; PASS-CHECK: [[WHILE]]:
360372
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
361373
; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
374+
; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[NOT_DONE]])
362375
; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
376+
; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i32 [[BALLOT]], 0
363377
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]]
364378
; PASS-CHECK: [[IF]]:
365379
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
@@ -410,6 +424,8 @@ define protected amdgpu_kernel void @trivial_waterfall_ne_zero_i32(ptr addrspace
410424
; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
411425
; PASS-CHECK: [[WHILE]]:
412426
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
427+
; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[DONE]])
428+
; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp ne i32 0, [[BALLOT]]
413429
; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]]
414430
; PASS-CHECK: [[IF]]:
415431
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4

llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -595,6 +595,8 @@ define amdgpu_kernel void @ballot_i32(i32 %v, ptr addrspace(1) %out) {
595595
; PASS-CHECK-LABEL: define amdgpu_kernel void @ballot_i32(
596596
; PASS-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
597597
; PASS-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1
598+
; PASS-CHECK-NEXT: [[BALLOT:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[C]])
599+
; PASS-CHECK-NEXT: [[BALLOT_NE_ZERO:%.*]] = icmp ne i32 [[BALLOT]], 0
598600
; PASS-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1
599601
; PASS-CHECK-NEXT: ret void
600602
;
@@ -623,6 +625,8 @@ define amdgpu_kernel void @ballot_i64(i32 %v, ptr addrspace(1) %out) {
623625
; PASS-CHECK-LABEL: define amdgpu_kernel void @ballot_i64(
624626
; PASS-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
625627
; PASS-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1
628+
; PASS-CHECK-NEXT: [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[C]])
629+
; PASS-CHECK-NEXT: [[BALLOT_NE_ZERO:%.*]] = icmp ne i64 [[BALLOT]], 0
626630
; PASS-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1
627631
; PASS-CHECK-NEXT: ret void
628632
;

0 commit comments

Comments
 (0)