Skip to content

Commit e51500b

Browse files
[AMDGPU] combine uniform AMDGPU lane Intrinsics
1 parent 1a07e67 commit e51500b

8 files changed

+1527
-0
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -562,6 +562,11 @@ class AMDGPURewriteAGPRCopyMFMAPass
562562
void initializeAMDGPURewriteAGPRCopyMFMALegacyPass(PassRegistry &);
563563
extern char &AMDGPURewriteAGPRCopyMFMALegacyID;
564564

565+
/// Module pass for the new pass manager, registered under the name
/// "amdgpu-uniform-intrinsic-combine". Only the entry point is declared here;
/// run() is defined out of line in AMDGPUUniformIntrinsicCombine.cpp.
struct AMDGPUUniformIntrinsicCombinePass
566+
: public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> {
567+
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
568+
};
569+
565570
namespace AMDGPU {
566571
enum TargetIndex {
567572
TI_CONSTDATA_START,

llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(
3030
MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
3131
MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
3232
MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
33+
MODULE_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass())
3334
#undef MODULE_PASS
3435

3536
#ifndef MODULE_PASS_WITH_PARAMS

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -526,6 +526,11 @@ static cl::opt<bool> HasClosedWorldAssumption(
526526
cl::desc("Whether has closed-world assumption at link time"),
527527
cl::init(false), cl::Hidden);
528528

529+
// Hidden command-line switch, on by default. When it is false,
// registerPassBuilderCallbacks does not add AMDGPUUniformIntrinsicCombinePass
// to the module pass manager.
static cl::opt<bool> EnableUniformIntrinsicCombine(
530+
"amdgpu-enable-uniform-intrinsic-combine",
531+
cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"),
532+
cl::init(true), cl::Hidden);
533+
529534
extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
530535
// Register the target
531536
RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
@@ -879,6 +884,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
879884

880885
if (EarlyInlineAll && !EnableFunctionCalls)
881886
PM.addPass(AMDGPUAlwaysInlinePass());
887+
888+
if (EnableUniformIntrinsicCombine)
889+
PM.addPass(AMDGPUUniformIntrinsicCombinePass());
882890
});
883891

884892
PB.registerPeepholeEPCallback(
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
//===-- AMDGPUUniformIntrinsicCombine.cpp ---------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
/// \file
10+
/// This pass simplifies certain intrinsic calls when the arguments are uniform.
11+
/// Also, this pass relies on the fact that uniformity analysis remains safe
12+
/// across valid transformations in LLVM. A transformation does not alter
13+
/// program behavior across threads: each instruction in the original IR
14+
/// continues to have a well-defined counterpart in the transformed IR, both
15+
/// statically and dynamically.
16+
///
17+
/// Valid transformations respect three invariants:
18+
/// 1. Use-def relationships are preserved. If one instruction produces a value
19+
/// and another consumes it, that dependency must remain intact.
20+
/// 2. Uniformity classification is preserved. Certain values are always uniform
21+
/// (constants, kernel arguments, convergent operations), while others are
22+
/// always divergent (atomics, most function calls). Transformations may turn
23+
/// divergent computations into uniform ones, but never the reverse.
24+
/// 3. Uniformity must hold not only at the point of value computation but also
25+
/// at all later uses of that value, consistently across the same set of
26+
/// threads.
27+
///
28+
/// Together, these invariants ensure that transformations in this pass are
29+
/// correctness-preserving and remain safe for uniformity analysis.
30+
//===----------------------------------------------------------------------===//
31+
32+
#include "AMDGPU.h"
33+
#include "GCNSubtarget.h"
34+
#include "llvm/Analysis/DomTreeUpdater.h"
35+
#include "llvm/Analysis/LoopInfo.h"
36+
#include "llvm/Analysis/ScalarEvolution.h"
37+
#include "llvm/Analysis/TargetLibraryInfo.h"
38+
#include "llvm/Analysis/UniformityAnalysis.h"
39+
#include "llvm/CodeGen/TargetPassConfig.h"
40+
#include "llvm/IR/IRBuilder.h"
41+
#include "llvm/IR/InstIterator.h"
42+
#include "llvm/IR/InstVisitor.h"
43+
#include "llvm/IR/IntrinsicsAMDGPU.h"
44+
#include "llvm/IR/PatternMatch.h"
45+
#include "llvm/InitializePasses.h"
46+
#include "llvm/Target/TargetMachine.h"
47+
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
48+
49+
#define DEBUG_TYPE "amdgpu-uniform-intrinsic-combine"
50+
51+
using namespace llvm;
52+
using namespace llvm::AMDGPU;
53+
using namespace llvm::PatternMatch;
54+
55+
/// Optimizes uniform intrinsics.
56+
static bool optimizeUniformIntrinsic(IntrinsicInst &II,
57+
const UniformityInfo &UI) {
58+
llvm::Intrinsic::ID IID = II.getIntrinsicID();
59+
60+
switch (IID) {
61+
case Intrinsic::amdgcn_permlane64:
62+
case Intrinsic::amdgcn_readfirstlane:
63+
case Intrinsic::amdgcn_readlane: {
64+
Value *Src = II.getArgOperand(0);
65+
// Check if the argument use is divergent
66+
if (UI.isDivergentUse(II.getOperandUse(0)))
67+
return false;
68+
LLVM_DEBUG(dbgs() << "Replacing " << II << " with " << *Src << '\n');
69+
II.replaceAllUsesWith(Src);
70+
II.eraseFromParent();
71+
return true;
72+
}
73+
case Intrinsic::amdgcn_ballot: {
74+
Value *Src = II.getArgOperand(0);
75+
if (UI.isDivergentUse(II.getOperandUse(0)))
76+
return false;
77+
LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << '\n');
78+
79+
// If there are no ICmp users, return early.
80+
if (none_of(II.users(), [](User *U) { return isa<ICmpInst>(U); }))
81+
return false;
82+
83+
bool Changed = false;
84+
for (User *U : make_early_inc_range(II.users())) {
85+
if (auto *ICmp = dyn_cast<ICmpInst>(U)) {
86+
Value *Op0 = ICmp->getOperand(0);
87+
Value *Op1 = ICmp->getOperand(1);
88+
ICmpInst::Predicate Pred = ICmp->getPredicate();
89+
Value *OtherOp = Op0 == &II ? Op1 : Op0;
90+
91+
if (Pred == ICmpInst::ICMP_EQ && match(OtherOp, m_Zero())) {
92+
// Case (icmp eq %ballot, 0) --> xor %ballot_arg, 1
93+
Instruction *NotOp =
94+
BinaryOperator::CreateNot(Src, "", ICmp->getIterator());
95+
LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n');
96+
ICmp->replaceAllUsesWith(NotOp);
97+
ICmp->eraseFromParent();
98+
Changed = true;
99+
} else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) {
100+
// (icmp ne %ballot, 0) --> %ballot_arg
101+
LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: "
102+
<< *Src << '\n');
103+
ICmp->replaceAllUsesWith(Src);
104+
ICmp->eraseFromParent();
105+
Changed = true;
106+
}
107+
}
108+
}
109+
// Erase the intrinsic if it has no remaining uses.
110+
if (II.use_empty())
111+
II.eraseFromParent();
112+
return Changed;
113+
}
114+
default:
115+
llvm_unreachable("Unexpected intrinsic ID in optimizeUniformIntrinsic");
116+
}
117+
return false;
118+
}
119+
120+
/// Iterate over the Intrinsics use in the Module to optimise.
121+
static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) {
122+
bool IsChanged = false;
123+
FunctionAnalysisManager &FAM =
124+
AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
125+
for (Function &F : M) {
126+
switch (F.getIntrinsicID()) {
127+
case Intrinsic::amdgcn_permlane64:
128+
case Intrinsic::amdgcn_readfirstlane:
129+
case Intrinsic::amdgcn_readlane:
130+
case Intrinsic::amdgcn_ballot:
131+
break;
132+
default:
133+
continue;
134+
}
135+
136+
for (User *U : F.users()) {
137+
auto *II = cast<IntrinsicInst>(U);
138+
Function *ParentF = II->getFunction();
139+
if (ParentF->isDeclaration())
140+
continue;
141+
142+
const auto &UI = FAM.getResult<UniformityInfoAnalysis>(*ParentF);
143+
IsChanged |= optimizeUniformIntrinsic(*II, UI);
144+
}
145+
}
146+
return IsChanged;
147+
}
148+
149+
/// New-pass-manager entry point: run the combine over \p M.
///
/// \returns PreservedAnalyses::all() when nothing was changed; otherwise a set
///          in which only UniformityInfoAnalysis is marked as preserved.
PreservedAnalyses
AMDGPUUniformIntrinsicCombinePass::run(Module &M, ModuleAnalysisManager &AM) {
  const bool Modified = runUniformIntrinsicCombine(M, AM);
  if (Modified) {
    PreservedAnalyses Preserved;
    Preserved.preserve<UniformityInfoAnalysis>();
    return Preserved;
  }
  return PreservedAnalyses::all();
}

llvm/lib/Target/AMDGPU/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ add_llvm_target(AMDGPUCodeGen
6464
AMDGPUHSAMetadataStreamer.cpp
6565
AMDGPUInsertDelayAlu.cpp
6666
AMDGPUInstCombineIntrinsic.cpp
67+
AMDGPUUniformIntrinsicCombine.cpp
6768
AMDGPUInstrInfo.cpp
6869
AMDGPUInstructionSelector.cpp
6970
AMDGPUISelDAGToDAG.cpp

0 commit comments

Comments
 (0)