Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ FunctionPass *createAMDGPURewriteOutArgumentsPass();
ModulePass *
createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM = nullptr);
ModulePass *createAMDGPULowerBufferFatPointersPass();
ModulePass *createAMDGPULowerIntrinsicsLegacyPass();
FunctionPass *createSIModeRegisterPass();
FunctionPass *createGCNPreRAOptimizationsLegacyPass();
FunctionPass *createAMDGPUPreloadKernArgPrologLegacyPass();
Expand Down Expand Up @@ -153,6 +154,16 @@ struct AMDGPULowerBufferFatPointersPass
const TargetMachine &TM;
};

void initializeAMDGPULowerIntrinsicsLegacyPass(PassRegistry &);

struct AMDGPULowerIntrinsicsPass : PassInfoMixin<AMDGPULowerIntrinsicsPass> {
AMDGPULowerIntrinsicsPass(const AMDGPUTargetMachine &TM) : TM(TM) {}
PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM);

private:
const AMDGPUTargetMachine &TM;
};

void initializeAMDGPUPrepareAGPRAllocLegacyPass(PassRegistry &);
extern char &AMDGPUPrepareAGPRAllocLegacyID;

Expand Down
37 changes: 0 additions & 37 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1989,39 +1989,6 @@ bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
return selectImpl(MI, *CoverageInfo);
}

bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
if (TM.getOptLevel() > CodeGenOptLevel::None) {
unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
if (WGSize <= STI.getWavefrontSize()) {
// If the workgroup fits in a wave, remove s_barrier_signal and lower
// s_barrier/s_barrier_wait to wave_barrier.
if (IntrinsicID == Intrinsic::amdgcn_s_barrier ||
IntrinsicID == Intrinsic::amdgcn_s_barrier_wait) {
MachineBasicBlock *MBB = MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
}
MI.eraseFromParent();
return true;
}
}

if (STI.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
// On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
MachineBasicBlock *MBB = MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
.addImm(AMDGPU::Barrier::WORKGROUP);
BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
.addImm(AMDGPU::Barrier::WORKGROUP);
MI.eraseFromParent();
Comment on lines -2011 to -2018
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't so much code that duplicating it between GlobalISel and SelectionDAG is a problem. Is a whole pass really needed for this one tiny case? Can we get this into one of the existing IR lowering passes (e.g. PreISelIntrinsicLowering)?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PreISelIntrinsicLowering doesn't handle target-specific intrinsics. So is it better to teach that pass how to call into target hooks to handle target intrinsics, or to have a target-specific pass like in this patch?

Copy link
Collaborator Author

@nhaehnle nhaehnle Aug 21, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've looked at this. We could perhaps add a hook to TargetTransformInfo, but it feels a bit out of place. PreISelIntrinsicLowering iterates over the functions of the module and looks at calls from there, which means that hooking anything into that pass would basically only re-use this loop over the functions of the module.

I can sort of see the point about breaking the module pass manager (although -- isn't it really the other way around? Breaking up a function pass manager hurts, but a function pass inside a module pass manager ought to be fairly unproblematic), and perhaps a larger point that iterating over all code just for this is a bit excessive.

So I'm going to change this into a module pass that iterates like PreISelIntrinsicLowering.

As for how much code this is: I ran into this precisely because it's about to become a whole lot more code in the near future. This change is simply trying to prepare upstream for that.

return true;
}

return selectImpl(MI, *CoverageInfo);
}

static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
bool &IsTexFail) {
if (TexFailCtrl)
Expand Down Expand Up @@ -2338,10 +2305,6 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
return selectDSAppendConsume(I, false);
case Intrinsic::amdgcn_init_whole_wave:
return selectInitWholeWave(I);
case Intrinsic::amdgcn_s_barrier:
case Intrinsic::amdgcn_s_barrier_signal:
case Intrinsic::amdgcn_s_barrier_wait:
return selectSBarrier(I);
case Intrinsic::amdgcn_raw_buffer_load_lds:
case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
case Intrinsic::amdgcn_struct_buffer_load_lds:
Expand Down
1 change: 0 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,6 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
bool selectInitWholeWave(MachineInstr &MI) const;
bool selectSBarrier(MachineInstr &MI) const;
bool selectDSBvhStackIntrinsic(MachineInstr &MI) const;

bool selectImageIntrinsic(MachineInstr &MI,
Expand Down
161 changes: 161 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
//===-- AMDGPULowerIntrinsics.cpp -------------------------------------------=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Lower intrinsics that would otherwise require separate handling in both
// SelectionDAG and GlobalISel.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"

#define DEBUG_TYPE "amdgpu-lower-intrinsics"

using namespace llvm;

namespace {

// Shared implementation behind both the legacy-PM and new-PM passes. Walks
// the module's intrinsic declarations and rewrites selected call sites in
// place.
class AMDGPULowerIntrinsicsImpl {
public:
  Module &M;
  const AMDGPUTargetMachine &TM;

  AMDGPULowerIntrinsicsImpl(Module &M, const AMDGPUTargetMachine &TM)
      : M(M), TM(TM) {}

  // Lower all supported intrinsic calls in the module.
  // Returns true if anything was rewritten or erased.
  bool run();

private:
  // Lower/optimize one s_barrier-family call; returns true on change.
  bool visitBarrier(IntrinsicInst &I);
};

// Legacy pass-manager wrapper; the real work happens in
// AMDGPULowerIntrinsicsImpl.
class AMDGPULowerIntrinsicsLegacy : public ModulePass {
public:
  static char ID;

  AMDGPULowerIntrinsicsLegacy() : ModulePass(ID) {}

  bool runOnModule(Module &M) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // The pass only rewrites straight-line intrinsic calls.
    AU.setPreservesCFG();
    // Needed to reach the AMDGPUTargetMachine in runOnModule.
    AU.addRequired<TargetPassConfig>();
  }
};

// Invoke \p Callback on every IntrinsicInst user of \p Intrin. Iterates an
// early-incremented range so the callback may erase the visited call.
template <class CallbackTy>
static void forEachCall(Function &Intrin, CallbackTy Callback) {
  for (User *Usr : make_early_inc_range(Intrin.users())) {
    auto *Call = dyn_cast<IntrinsicInst>(Usr);
    if (!Call)
      continue;
    Callback(Call);
  }
}

} // anonymous namespace

bool AMDGPULowerIntrinsicsImpl::run() {
  bool Changed = false;

  // Iterate declarations rather than instructions: only a handful of
  // intrinsics are handled, so scanning their user lists is cheaper than
  // visiting every call in the module.
  for (Function &F : M) {
    Intrinsic::ID IID = F.getIntrinsicID();
    if (IID != Intrinsic::amdgcn_s_barrier &&
        IID != Intrinsic::amdgcn_s_barrier_signal &&
        IID != Intrinsic::amdgcn_s_barrier_signal_isfirst &&
        IID != Intrinsic::amdgcn_s_barrier_wait)
      continue;
    forEachCall(F, [&](IntrinsicInst *II) { Changed |= visitBarrier(*II); });
  }

  return Changed;
}

// Optimize barriers and lower s_barrier to a sequence of split barrier
// intrinsics.
bool AMDGPULowerIntrinsicsImpl::visitBarrier(IntrinsicInst &I) {
  const Intrinsic::ID IID = I.getIntrinsicID();
  assert(IID == Intrinsic::amdgcn_s_barrier ||
         IID == Intrinsic::amdgcn_s_barrier_signal ||
         IID == Intrinsic::amdgcn_s_barrier_signal_isfirst ||
         IID == Intrinsic::amdgcn_s_barrier_wait);

  Function *Fn = I.getFunction();
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*Fn);

  // Only exploit the single-wave-workgroup property when optimizing; at -O0
  // barriers are kept as written.
  bool IsSingleWaveWG = false;
  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    unsigned WGMaxSize = ST.getFlatWorkGroupSizes(*Fn).second;
    IsSingleWaveWG = WGMaxSize <= ST.getWavefrontSize();
  }

  IRBuilder<> B(&I);

  if (IsSingleWaveWG) {
    // Down-grade waits, remove split signals.
    switch (IID) {
    case Intrinsic::amdgcn_s_barrier:
    case Intrinsic::amdgcn_s_barrier_wait:
      B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_wave_barrier, {});
      break;
    case Intrinsic::amdgcn_s_barrier_signal_isfirst:
      // If we're the only wave of the workgroup, we're always first.
      I.replaceAllUsesWith(B.getInt1(true));
      break;
    default:
      break;
    }
    I.eraseFromParent();
    return true;
  }

  if (IID == Intrinsic::amdgcn_s_barrier && ST.hasSplitBarriers()) {
    // Lower to split barriers: an explicit signal/wait pair on the workgroup
    // barrier. Note the signal takes an i32 ID while the wait takes an i16.
    B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_signal,
                      {B.getInt32(AMDGPU::Barrier::WORKGROUP)});
    B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_wait,
                      {B.getInt16(AMDGPU::Barrier::WORKGROUP)});
    I.eraseFromParent();
    return true;
  }

  return false;
}

PreservedAnalyses AMDGPULowerIntrinsicsPass::run(Module &M,
                                                 ModuleAnalysisManager &MAM) {
  // Delegate to the shared implementation.
  bool Changed = AMDGPULowerIntrinsicsImpl(M, TM).run();
  if (!Changed)
    return PreservedAnalyses::all();
  // The lowering never touches the CFG, so keep CFG-dependent analyses.
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

bool AMDGPULowerIntrinsicsLegacy::runOnModule(Module &M) {
  // Recover the target machine via the required TargetPassConfig analysis.
  const AMDGPUTargetMachine &TM =
      getAnalysis<TargetPassConfig>().getTM<AMDGPUTargetMachine>();
  return AMDGPULowerIntrinsicsImpl(M, TM).run();
}

#define PASS_DESC "AMDGPU lower intrinsics"
// Legacy-PM registration; TargetPassConfig is declared as a dependency so
// runOnModule can reach the AMDGPUTargetMachine.
INITIALIZE_PASS_BEGIN(AMDGPULowerIntrinsicsLegacy, DEBUG_TYPE, PASS_DESC, false,
                      false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPULowerIntrinsicsLegacy, DEBUG_TYPE, PASS_DESC, false,
                    false)

char AMDGPULowerIntrinsicsLegacy::ID = 0;

// Factory used by the AMDGPU codegen pipeline (see addCodeGenPrepare).
ModulePass *llvm::createAMDGPULowerIntrinsicsLegacyPass() {
  return new AMDGPULowerIntrinsicsLegacy;
}
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ MODULE_PASS("amdgpu-always-inline", AMDGPUAlwaysInlinePass())
MODULE_PASS("amdgpu-export-kernel-runtime-handles", AMDGPUExportKernelRuntimeHandlesPass())
MODULE_PASS("amdgpu-lower-buffer-fat-pointers",
AMDGPULowerBufferFatPointersPass(*this))
MODULE_PASS("amdgpu-lower-intrinsics", AMDGPULowerIntrinsicsPass(*this))
MODULE_PASS("amdgpu-lower-ctor-dtor", AMDGPUCtorDtorLoweringPass())
MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this))
MODULE_PASS("amdgpu-perf-hint",
Expand Down
5 changes: 4 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -577,6 +577,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(*PR);
initializeAMDGPULowerModuleLDSLegacyPass(*PR);
initializeAMDGPULowerBufferFatPointersPass(*PR);
initializeAMDGPULowerIntrinsicsLegacyPass(*PR);
initializeAMDGPUReserveWWMRegsLegacyPass(*PR);
initializeAMDGPURewriteAGPRCopyMFMALegacyPass(*PR);
initializeAMDGPURewriteOutArgumentsPass(*PR);
Expand Down Expand Up @@ -1418,6 +1419,7 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
// nodes out of the graph, which leads to function-level passes not
// being run on them, which causes crashes in the resource usage analysis).
addPass(createAMDGPULowerBufferFatPointersPass());
addPass(createAMDGPULowerIntrinsicsLegacyPass());
// In accordance with the above FIXME, manually force all the
// function-level passes into a CGSCCPassManager.
addPass(new DummyCGSCCPass());
Expand Down Expand Up @@ -2155,9 +2157,10 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
// nodes out of the graph, which leads to function-level passes not
// being run on them, which causes crashes in the resource usage analysis).
addPass(AMDGPULowerBufferFatPointersPass(TM));

addPass.requireCGSCCOrder();

addPass(AMDGPULowerIntrinsicsPass(TM));

Base::addCodeGenPrepare(addPass);

if (isPassEnabled(EnableLoadStoreVectorizer))
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUImageIntrinsicOptimizer.cpp
AMDGPULibFunc.cpp
AMDGPULowerBufferFatPointers.cpp
AMDGPULowerIntrinsics.cpp
AMDGPULowerKernelArguments.cpp
AMDGPULowerKernelAttributes.cpp
AMDGPULowerModuleLDSPass.cpp
Expand Down
35 changes: 0 additions & 35 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10421,41 +10421,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
}
case Intrinsic::amdgcn_s_barrier:
case Intrinsic::amdgcn_s_barrier_signal:
case Intrinsic::amdgcn_s_barrier_wait: {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
if (WGSize <= ST.getWavefrontSize()) {
// If the workgroup fits in a wave, remove s_barrier_signal and lower
// s_barrier/s_barrier_wait to wave_barrier.
if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal)
return Op.getOperand(0);
else
return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL,
MVT::Other, Op.getOperand(0)),
0);
}
}

if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
// On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
SDValue K =
DAG.getSignedTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
SDValue BarSignal =
SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
MVT::Other, K, Op.getOperand(0)),
0);
SDValue BarWait =
SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
BarSignal.getValue(0)),
0);
return BarWait;
}

return SDValue();
};

case Intrinsic::amdgcn_struct_tbuffer_store:
case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
Expand Down
Loading