Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -562,6 +562,11 @@ class AMDGPURewriteAGPRCopyMFMAPass
void initializeAMDGPURewriteAGPRCopyMFMALegacyPass(PassRegistry &);
extern char &AMDGPURewriteAGPRCopyMFMALegacyID;

struct AMDGPUUniformIntrinsicCombinePass
: public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> {
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
};

namespace AMDGPU {
enum TargetIndex {
TI_CONSTDATA_START,
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(
MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
MODULE_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass())
#undef MODULE_PASS

#ifndef MODULE_PASS_WITH_PARAMS
Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -526,6 +526,11 @@ static cl::opt<bool> HasClosedWorldAssumption(
cl::desc("Whether has closed-world assumption at link time"),
cl::init(false), cl::Hidden);

static cl::opt<bool> EnableUniformIntrinsicCombine(
"amdgpu-enable-uniform-intrinsic-combine",
cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"),
cl::init(true), cl::Hidden);

extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
Expand Down Expand Up @@ -879,6 +884,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {

if (EarlyInlineAll && !EnableFunctionCalls)
PM.addPass(AMDGPUAlwaysInlinePass());

if (EnableUniformIntrinsicCombine)
PM.addPass(AMDGPUUniformIntrinsicCombinePass());
});

PB.registerPeepholeEPCallback(
Expand Down
159 changes: 159 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
//===-- AMDGPUUniformIntrinsicCombine.cpp ---------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass simplifies certain intrinsic calls when the arguments are uniform.
/// It's true that this pass has transforms that can lead to a situation where
/// some instruction whose operand was previously recognized as statically
/// uniform is later on no longer recognized as statically uniform. However, the
/// semantics of how programs execute don't (and must not, for this precise
/// reason) care about static uniformity, they only ever care about dynamic
/// uniformity. And every instruction that's downstream and cares about dynamic
/// uniformity must be convergent (and isel will introduce v_readfirstlane for
/// them if their operands can't be proven statically uniform).
///
/// This pass is implemented as a ModulePass because intrinsic declarations
/// exist at the module scope, allowing us to skip processing entirely if no
/// declarations are present and to traverse their user lists directly when
/// they are. A FunctionPass would instead require scanning every instruction
/// in every function to find relevant intrinsics, which is far less efficient.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-uniform-intrinsic-combine"

using namespace llvm;
using namespace llvm::AMDGPU;
using namespace llvm::PatternMatch;

/// Wrapper for querying uniformity info that first checks locally tracked
/// instructions.
static bool
isDivergentUseWithNew(const Use &U, const UniformityInfo &UI,
const ValueMap<const Value *, bool> &Tracker) {
Value *V = U.get();
if (auto It = Tracker.find(V); It != Tracker.end())
return !It->second; // divergent if marked false
return UI.isDivergentUse(U);
}

/// Optimizes uniform intrinsics calls if their operand can be proven uniform.
static bool optimizeUniformIntrinsic(IntrinsicInst &II,
const UniformityInfo &UI,
ValueMap<const Value *, bool> &Tracker) {
llvm::Intrinsic::ID IID = II.getIntrinsicID();

switch (IID) {
case Intrinsic::amdgcn_permlane64:
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane: {
Value *Src = II.getArgOperand(0);
if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker))
return false;
LLVM_DEBUG(dbgs() << "Replacing " << II << " with " << *Src << '\n');
II.replaceAllUsesWith(Src);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is replacing a contextually / path dependent query with a value that is not. I think we need to attach some kind of convergent use call to capture the point here. What if later code motion moves it such that an assumed uniform value is no longer use-point uniform?

You can maybe get away with replace only dominated uses by this instruction, but I'd need to think if there are still potential hazards if later transforms introduce divergence

Copy link
Contributor Author

@PankajDwivedi-25 PankajDwivedi-25 Apr 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Haven't we already discussed something similar earlier? #116953 (review)

Copy link
Collaborator

@ssahasra ssahasra Apr 15, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are replacing an "always uniform" value X (the intrinsic call) with its uniform operand (Y), which can potentially become uniform divergent after other transformations. But that is no different from a later transformation treating X (the intrinsic call) as divergent by simply ignoring the uniformity analysis at that point of the optimization pipeline. By replacing X with Y, we lose the ability to preserve uniformity but this is not a correctness issue. This is still a performance issue, and then the question is whether the current optimization is useful in general or not.

II.eraseFromParent();
return true;
}
case Intrinsic::amdgcn_ballot: {
Value *Src = II.getArgOperand(0);
if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker))
return false;
LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << '\n');

bool Changed = false;
for (User *U : make_early_inc_range(II.users())) {
if (auto *ICmp = dyn_cast<ICmpInst>(U)) {
Value *Op0 = ICmp->getOperand(0);
Value *Op1 = ICmp->getOperand(1);
ICmpInst::Predicate Pred = ICmp->getPredicate();
Value *OtherOp = Op0 == &II ? Op1 : Op0;

if (Pred == ICmpInst::ICMP_EQ && match(OtherOp, m_Zero())) {
// Case: (icmp eq %ballot, 0) -> xor %ballot_arg, 1
Instruction *NotOp =
BinaryOperator::CreateNot(Src, "", ICmp->getIterator());
Tracker[NotOp] = true; // NOT preserves uniformity
LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n');
ICmp->replaceAllUsesWith(NotOp);
ICmp->eraseFromParent();
Changed = true;
} else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) {
// Case: (icmp ne %ballot, 0) -> %ballot_arg
LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: "
<< *Src << '\n');
ICmp->replaceAllUsesWith(Src);
ICmp->eraseFromParent();
Changed = true;
}
}
}
// Erase the intrinsic if it has no remaining uses.
if (II.use_empty())
II.eraseFromParent();
return Changed;
}
default:
llvm_unreachable("Unexpected intrinsic ID in optimizeUniformIntrinsic");
}
return false;
}

/// Iterates over intrinsic declarations in the module to optimize their uses.
static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should probably be a function pass

Copy link
Contributor Author

@PankajDwivedi-25 PankajDwivedi-25 Sep 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

as discussed in here :#116953 (comment)
#116953 (comment)
We will be inspecting intrinsic declarations outside of the function.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking at the intrinsic declarations is fine, but you can also just not use the function use list. You can just directly pattern match inside the function

Copy link
Contributor Author

@PankajDwivedi-25 PankajDwivedi-25 Sep 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can also just not use the function use list

Inside a module pass it should be ok right?

You can just directly pattern match inside the function

right, that is going to be costlier. we have to iterate over all the instructions in the function.

Copy link
Contributor Author

@PankajDwivedi-25 PankajDwivedi-25 Sep 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We had a series of discussions on this and ended up making it a module pass.

bool IsChanged = false;
ValueMap<const Value *, bool> Tracker;

FunctionAnalysisManager &FAM =
AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
for (Function &F : M) {
switch (F.getIntrinsicID()) {
case Intrinsic::amdgcn_permlane64:
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_ballot:
break;
default:
continue;
}

for (User *U : make_early_inc_range(F.users())) {
auto *II = cast<IntrinsicInst>(U);
Function *ParentF = II->getFunction();
const auto &UI = FAM.getResult<UniformityInfoAnalysis>(*ParentF);
IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker);
}
}
return IsChanged;
}

PreservedAnalyses
AMDGPUUniformIntrinsicCombinePass::run(Module &M, ModuleAnalysisManager &AM) {
if (!runUniformIntrinsicCombine(M, AM))
return PreservedAnalyses::all();

PreservedAnalyses PA;
PA.preserve<UniformityInfoAnalysis>();
return PA;
}
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUHSAMetadataStreamer.cpp
AMDGPUInsertDelayAlu.cpp
AMDGPUInstCombineIntrinsic.cpp
AMDGPUUniformIntrinsicCombine.cpp
AMDGPUInstrInfo.cpp
AMDGPUInstructionSelector.cpp
AMDGPUISelDAGToDAG.cpp
Expand Down
Loading