Skip to content

Commit 535b891

Browse files
Automerge: [AMDGPU] Introduce "amdgpu-uniform-intrinsic-combine" pass to combine uniform AMDGPU lane Intrinsics. (#116953)
This pass introduces optimizations for AMDGPU intrinsics by leveraging the uniformity of their arguments. When an intrinsic's arguments are detected as uniform, redundant computations are eliminated, and the intrinsic calls are simplified accordingly. By utilizing the UniformityInfo analysis, this pass identifies cases where intrinsic calls are uniform across all lanes, allowing transformations that reduce unnecessary operations and improve the IR's efficiency. These changes enhance performance by streamlining intrinsic usage in uniform scenarios without altering the program's semantics. For background, see PR #99878
2 parents a69c87d + 53aad35 commit 535b891

8 files changed

+1473
-0
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -562,6 +562,11 @@ class AMDGPURewriteAGPRCopyMFMAPass
562562
void initializeAMDGPURewriteAGPRCopyMFMALegacyPass(PassRegistry &);
563563
extern char &AMDGPURewriteAGPRCopyMFMALegacyID;
564564

565+
/// Module pass that simplifies AMDGPU lane intrinsic calls whose arguments are
/// uniform. It is registered under the name "amdgpu-uniform-intrinsic-combine"
/// in AMDGPUPassRegistry.def; the implementation lives in
/// AMDGPUUniformIntrinsicCombine.cpp.
struct AMDGPUUniformIntrinsicCombinePass
566+
: public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> {
567+
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
568+
};
569+
565570
namespace AMDGPU {
566571
enum TargetIndex {
567572
TI_CONSTDATA_START,

llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(
3030
MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
3131
MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
3232
MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
33+
MODULE_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass())
3334
#undef MODULE_PASS
3435

3536
#ifndef MODULE_PASS_WITH_PARAMS

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -526,6 +526,11 @@ static cl::opt<bool> HasClosedWorldAssumption(
526526
cl::desc("Whether has closed-world assumption at link time"),
527527
cl::init(false), cl::Hidden);
528528

529+
// Command-line switch for AMDGPUUniformIntrinsicCombinePass. It defaults to
// true and is marked cl::Hidden. The flag gates the
// PM.addPass(AMDGPUUniformIntrinsicCombinePass()) call made from
// AMDGPUTargetMachine::registerPassBuilderCallbacks.
static cl::opt<bool> EnableUniformIntrinsicCombine(
530+
"amdgpu-enable-uniform-intrinsic-combine",
531+
cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"),
532+
cl::init(true), cl::Hidden);
533+
529534
extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
530535
// Register the target
531536
RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
@@ -879,6 +884,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
879884

880885
if (EarlyInlineAll && !EnableFunctionCalls)
881886
PM.addPass(AMDGPUAlwaysInlinePass());
887+
888+
if (EnableUniformIntrinsicCombine)
889+
PM.addPass(AMDGPUUniformIntrinsicCombinePass());
882890
});
883891

884892
PB.registerPeepholeEPCallback(
Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
//===-- AMDGPUUniformIntrinsicCombine.cpp ---------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
/// \file
10+
/// This pass simplifies certain intrinsic calls when the arguments are uniform.
11+
/// It's true that this pass has transforms that can lead to a situation where
12+
/// some instruction whose operand was previously recognized as statically
13+
/// uniform is later on no longer recognized as statically uniform. However, the
14+
/// semantics of how programs execute don't (and must not, for this precise
15+
/// reason) care about static uniformity, they only ever care about dynamic
16+
/// uniformity. And every instruction that's downstream and cares about dynamic
17+
/// uniformity must be convergent (and isel will introduce v_readfirstlane for
18+
/// them if their operands can't be proven statically uniform).
19+
///
20+
/// This pass is implemented as a ModulePass because intrinsic declarations
21+
/// exist at the module scope, allowing us to skip processing entirely if no
22+
/// declarations are present and to traverse their user lists directly when
23+
/// they are. A FunctionPass would instead require scanning every instruction
24+
/// in every function to find relevant intrinsics, which is far less efficient.
25+
//===----------------------------------------------------------------------===//
26+
27+
#include "AMDGPU.h"
28+
#include "GCNSubtarget.h"
29+
#include "llvm/Analysis/DomTreeUpdater.h"
30+
#include "llvm/Analysis/LoopInfo.h"
31+
#include "llvm/Analysis/ScalarEvolution.h"
32+
#include "llvm/Analysis/TargetLibraryInfo.h"
33+
#include "llvm/Analysis/UniformityAnalysis.h"
34+
#include "llvm/CodeGen/TargetPassConfig.h"
35+
#include "llvm/IR/IRBuilder.h"
36+
#include "llvm/IR/InstIterator.h"
37+
#include "llvm/IR/InstVisitor.h"
38+
#include "llvm/IR/IntrinsicsAMDGPU.h"
39+
#include "llvm/IR/PatternMatch.h"
40+
#include "llvm/InitializePasses.h"
41+
#include "llvm/Target/TargetMachine.h"
42+
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
43+
44+
#define DEBUG_TYPE "amdgpu-uniform-intrinsic-combine"
45+
46+
using namespace llvm;
47+
using namespace llvm::AMDGPU;
48+
using namespace llvm::PatternMatch;
49+
50+
/// Wrapper for querying uniformity info that first checks locally tracked
51+
/// instructions.
52+
static bool
53+
isDivergentUseWithNew(const Use &U, const UniformityInfo &UI,
54+
const ValueMap<const Value *, bool> &Tracker) {
55+
Value *V = U.get();
56+
if (auto It = Tracker.find(V); It != Tracker.end())
57+
return !It->second; // divergent if marked false
58+
return UI.isDivergentUse(U);
59+
}
60+
61+
/// Optimizes uniform intrinsics calls if their operand can be proven uniform.
62+
static bool optimizeUniformIntrinsic(IntrinsicInst &II,
63+
const UniformityInfo &UI,
64+
ValueMap<const Value *, bool> &Tracker) {
65+
llvm::Intrinsic::ID IID = II.getIntrinsicID();
66+
67+
switch (IID) {
68+
case Intrinsic::amdgcn_permlane64:
69+
case Intrinsic::amdgcn_readfirstlane:
70+
case Intrinsic::amdgcn_readlane: {
71+
Value *Src = II.getArgOperand(0);
72+
if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker))
73+
return false;
74+
LLVM_DEBUG(dbgs() << "Replacing " << II << " with " << *Src << '\n');
75+
II.replaceAllUsesWith(Src);
76+
II.eraseFromParent();
77+
return true;
78+
}
79+
case Intrinsic::amdgcn_ballot: {
80+
Value *Src = II.getArgOperand(0);
81+
if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker))
82+
return false;
83+
LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << '\n');
84+
85+
bool Changed = false;
86+
for (User *U : make_early_inc_range(II.users())) {
87+
if (auto *ICmp = dyn_cast<ICmpInst>(U)) {
88+
Value *Op0 = ICmp->getOperand(0);
89+
Value *Op1 = ICmp->getOperand(1);
90+
ICmpInst::Predicate Pred = ICmp->getPredicate();
91+
Value *OtherOp = Op0 == &II ? Op1 : Op0;
92+
93+
if (Pred == ICmpInst::ICMP_EQ && match(OtherOp, m_Zero())) {
94+
// Case: (icmp eq %ballot, 0) -> xor %ballot_arg, 1
95+
Instruction *NotOp =
96+
BinaryOperator::CreateNot(Src, "", ICmp->getIterator());
97+
Tracker[NotOp] = true; // NOT preserves uniformity
98+
LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n');
99+
ICmp->replaceAllUsesWith(NotOp);
100+
ICmp->eraseFromParent();
101+
Changed = true;
102+
} else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) {
103+
// Case: (icmp ne %ballot, 0) -> %ballot_arg
104+
LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: "
105+
<< *Src << '\n');
106+
ICmp->replaceAllUsesWith(Src);
107+
ICmp->eraseFromParent();
108+
Changed = true;
109+
}
110+
}
111+
}
112+
// Erase the intrinsic if it has no remaining uses.
113+
if (II.use_empty())
114+
II.eraseFromParent();
115+
return Changed;
116+
}
117+
default:
118+
llvm_unreachable("Unexpected intrinsic ID in optimizeUniformIntrinsic");
119+
}
120+
return false;
121+
}
122+
123+
/// Iterates over intrinsic declarations in the module to optimize their uses.
124+
static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) {
125+
bool IsChanged = false;
126+
ValueMap<const Value *, bool> Tracker;
127+
128+
FunctionAnalysisManager &FAM =
129+
AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
130+
for (Function &F : M) {
131+
switch (F.getIntrinsicID()) {
132+
case Intrinsic::amdgcn_permlane64:
133+
case Intrinsic::amdgcn_readfirstlane:
134+
case Intrinsic::amdgcn_readlane:
135+
case Intrinsic::amdgcn_ballot:
136+
break;
137+
default:
138+
continue;
139+
}
140+
141+
for (User *U : make_early_inc_range(F.users())) {
142+
auto *II = cast<IntrinsicInst>(U);
143+
Function *ParentF = II->getFunction();
144+
const auto &UI = FAM.getResult<UniformityInfoAnalysis>(*ParentF);
145+
IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker);
146+
}
147+
}
148+
return IsChanged;
149+
}
150+
151+
/// Pass entry point. Runs the combine over \p M. If nothing changed, every
/// analysis is reported as preserved; otherwise only UniformityInfoAnalysis is
/// marked as preserved.
PreservedAnalyses
AMDGPUUniformIntrinsicCombinePass::run(Module &M, ModuleAnalysisManager &AM) {
  const bool Modified = runUniformIntrinsicCombine(M, AM);
  if (Modified) {
    PreservedAnalyses Preserved;
    Preserved.preserve<UniformityInfoAnalysis>();
    return Preserved;
  }
  return PreservedAnalyses::all();
}

llvm/lib/Target/AMDGPU/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ add_llvm_target(AMDGPUCodeGen
6464
AMDGPUHSAMetadataStreamer.cpp
6565
AMDGPUInsertDelayAlu.cpp
6666
AMDGPUInstCombineIntrinsic.cpp
67+
AMDGPUUniformIntrinsicCombine.cpp
6768
AMDGPUInstrInfo.cpp
6869
AMDGPUInstructionSelector.cpp
6970
AMDGPUISelDAGToDAG.cpp

0 commit comments

Comments
 (0)