From de0e91f68141680944e0269fc6142be9c3e0dc48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= Date: Thu, 7 Aug 2025 16:44:33 -0700 Subject: [PATCH 1/2] AMDGPU: Refactor lowering of s_barrier to split barriers Let's do the lowering of non-split into split barriers and the downgrading of barriers based on the workgroup size in a new IR pass, AMDGPULowerIntrinsics. That way, there is no code duplication between SelectionDAG and GlobalISel. This simplifies some upcoming extensions to the code. v2: - turn into a Module pass - also handle the downgrading of barriers for single-wave workgroups in the IR pass - add tests for the new pass (cherry picked from commit e246f42fbdad5667d5a395ce65f4900d67610e72) --- llvm/lib/Target/AMDGPU/AMDGPU.h | 11 ++ .../AMDGPU/AMDGPUInstructionSelector.cpp | 37 ---- .../Target/AMDGPU/AMDGPUInstructionSelector.h | 1 - .../Target/AMDGPU/AMDGPULowerIntrinsics.cpp | 164 ++++++++++++++++++ llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 5 +- llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 35 ---- llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll | 6 +- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 5 + .../AMDGPU/lower-intrinsics-barriers.ll | 84 +++++++++ .../AMDGPU/lower-intrinsics-split-barriers.ll | 80 +++++++++ 12 files changed, 353 insertions(+), 77 deletions(-) create mode 100644 llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp create mode 100644 llvm/test/CodeGen/AMDGPU/lower-intrinsics-barriers.ll create mode 100644 llvm/test/CodeGen/AMDGPU/lower-intrinsics-split-barriers.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 0059a862ba9b2..ebe38de1636be 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -62,6 +62,7 @@ FunctionPass *createAMDGPURewriteOutArgumentsPass(); ModulePass * createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM = nullptr); ModulePass *createAMDGPULowerBufferFatPointersPass(); +ModulePass *createAMDGPULowerIntrinsicsLegacyPass(); FunctionPass *createSIModeRegisterPass(); FunctionPass *createGCNPreRAOptimizationsLegacyPass(); FunctionPass *createAMDGPUPreloadKernArgPrologLegacyPass(); @@ -153,6 +154,16 @@ struct AMDGPULowerBufferFatPointersPass const TargetMachine &TM; }; +void initializeAMDGPULowerIntrinsicsLegacyPass(PassRegistry &); + +struct AMDGPULowerIntrinsicsPass : PassInfoMixin { + AMDGPULowerIntrinsicsPass(const AMDGPUTargetMachine &TM) : TM(TM) {} + PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM); + +private: + const AMDGPUTargetMachine &TM; +}; + void initializeAMDGPUPrepareAGPRAllocLegacyPass(PassRegistry &); extern char &AMDGPUPrepareAGPRAllocLegacyID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 5d31eed8fe7d7..fac365d015d95 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1989,39 +1989,6 @@ bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const { return selectImpl(MI, *CoverageInfo); } -bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const { - Intrinsic::ID IntrinsicID = cast(MI).getIntrinsicID(); - if (TM.getOptLevel() > CodeGenOptLevel::None) { - unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second; - if (WGSize <= STI.getWavefrontSize()) { - // If the workgroup fits in a wave, remove s_barrier_signal and lower - // 
s_barrier/s_barrier_wait to wave_barrier. - if (IntrinsicID == Intrinsic::amdgcn_s_barrier || - IntrinsicID == Intrinsic::amdgcn_s_barrier_wait) { - MachineBasicBlock *MBB = MI.getParent(); - const DebugLoc &DL = MI.getDebugLoc(); - BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER)); - } - MI.eraseFromParent(); - return true; - } - } - - if (STI.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) { - // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait - MachineBasicBlock *MBB = MI.getParent(); - const DebugLoc &DL = MI.getDebugLoc(); - BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM)) - .addImm(AMDGPU::Barrier::WORKGROUP); - BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT)) - .addImm(AMDGPU::Barrier::WORKGROUP); - MI.eraseFromParent(); - return true; - } - - return selectImpl(MI, *CoverageInfo); -} - static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail) { if (TexFailCtrl) @@ -2338,10 +2305,6 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( return selectDSAppendConsume(I, false); case Intrinsic::amdgcn_init_whole_wave: return selectInitWholeWave(I); - case Intrinsic::amdgcn_s_barrier: - case Intrinsic::amdgcn_s_barrier_signal: - case Intrinsic::amdgcn_s_barrier_wait: - return selectSBarrier(I); case Intrinsic::amdgcn_raw_buffer_load_lds: case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: case Intrinsic::amdgcn_struct_buffer_load_lds: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 092439693f399..4db46064999c0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -124,7 +124,6 @@ class AMDGPUInstructionSelector final : public InstructionSelector { bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const; bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const; bool selectInitWholeWave(MachineInstr &MI) const; - bool selectSBarrier(MachineInstr &MI) const; bool selectDSBvhStackIntrinsic(MachineInstr &MI) const; bool selectImageIntrinsic(MachineInstr &MI, diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp new file mode 100644 index 0000000000000..86549f3032ed1 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp @@ -0,0 +1,164 @@ +//===-- AMDGPULowerIntrinsics.cpp -------------------------------------------=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Lower intrinsics that would otherwise require separate handling in both +// SelectionDAG and GlobalISel. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUTargetMachine.h" +#include "GCNSubtarget.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/InitializePasses.h" + +#define DEBUG_TYPE "amdgpu-lower-intrinsics" + +using namespace llvm; + +namespace { + +class AMDGPULowerIntrinsicsImpl { +public: + Module &M; + const AMDGPUTargetMachine &TM; + + AMDGPULowerIntrinsicsImpl(Module &M, const AMDGPUTargetMachine &TM) + : M(M), TM(TM) {} + + bool run(); + +private: + bool visitBarrier(IntrinsicInst &I); +}; + +class AMDGPULowerIntrinsicsLegacy : public ModulePass { +public: + static char ID; + + AMDGPULowerIntrinsicsLegacy() : ModulePass(ID) {} + + bool runOnModule(Module &M) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.setPreservesCFG(); + } +}; + +template static void forEachCall(Function &Intrin, T Callback) { + for (User *U : make_early_inc_range(Intrin.users())) { + if (auto *CI = dyn_cast(U)) + Callback(CI); + } +} + +} // anonymous namespace + +bool AMDGPULowerIntrinsicsImpl::run() { + bool Changed = false; + + for (Function &F : M) { + switch (F.getIntrinsicID()) { + default: + continue; + case Intrinsic::amdgcn_s_barrier: + case Intrinsic::amdgcn_s_barrier_signal: + case Intrinsic::amdgcn_s_barrier_signal_isfirst: + case Intrinsic::amdgcn_s_barrier_wait: + forEachCall(F, [&](IntrinsicInst *II) { + if (visitBarrier(*II)) + Changed = true; + }); + break; + } + } + + return Changed; +} + +// Optimize barriers and lower s_barrier to a sequence of split barrier +// intrinsics. +bool AMDGPULowerIntrinsicsImpl::visitBarrier(IntrinsicInst &I) { + assert(I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier || + I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal || + I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal_isfirst || + I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait); + + const GCNSubtarget &ST = TM.getSubtarget(*I.getFunction()); + bool IsSingleWaveWG = false; + + if (TM.getOptLevel() > CodeGenOptLevel::None) { + unsigned WGMaxSize = ST.getFlatWorkGroupSizes(*I.getFunction()).second; + IsSingleWaveWG = WGMaxSize <= ST.getWavefrontSize(); + } + + IRBuilder<> B(&I); + + if (IsSingleWaveWG) { + // Down-grade waits, remove split signals. + if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier || + I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait) { + B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_wave_barrier, {}); + } else if (I.getIntrinsicID() == + Intrinsic::amdgcn_s_barrier_signal_isfirst) { + // If we're the only wave of the workgroup, we're always first. + I.replaceAllUsesWith(B.getInt1(true)); + } + I.eraseFromParent(); + return true; + } + + if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier && + ST.hasSplitBarriers()) { + // Lower to split barriers. 
+ Value *BarrierID_32 = B.getInt32(AMDGPU::Barrier::WORKGROUP); + Value *BarrierID_16 = B.getInt16(AMDGPU::Barrier::WORKGROUP); + B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_signal, + {BarrierID_32}); + B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_wait, + {BarrierID_16}); + I.eraseFromParent(); + return true; + } + + return false; +} + +PreservedAnalyses AMDGPULowerIntrinsicsPass::run(Module &M, + ModuleAnalysisManager &MAM) { + AMDGPULowerIntrinsicsImpl Impl(M, TM); + if (!Impl.run()) + return PreservedAnalyses::all(); + PreservedAnalyses PA; + PA.preserveSet(); + return PA; +} + +bool AMDGPULowerIntrinsicsLegacy::runOnModule(Module &M) { + auto &TPC = getAnalysis(); + const AMDGPUTargetMachine &TM = TPC.getTM(); + + AMDGPULowerIntrinsicsImpl Impl(M, TM); + return Impl.run(); +} + +#define PASS_DESC "AMDGPU lower intrinsics" +INITIALIZE_PASS_BEGIN(AMDGPULowerIntrinsicsLegacy, DEBUG_TYPE, PASS_DESC, false, + false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(AMDGPULowerIntrinsicsLegacy, DEBUG_TYPE, PASS_DESC, false, + false) + +char AMDGPULowerIntrinsicsLegacy::ID = 0; + +ModulePass *llvm::createAMDGPULowerIntrinsicsLegacyPass() { + return new AMDGPULowerIntrinsicsLegacy; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 6ddfa386e8ac9..48448833721bf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -20,6 +20,7 @@ MODULE_PASS("amdgpu-always-inline", AMDGPUAlwaysInlinePass()) MODULE_PASS("amdgpu-export-kernel-runtime-handles", AMDGPUExportKernelRuntimeHandlesPass()) MODULE_PASS("amdgpu-lower-buffer-fat-pointers", AMDGPULowerBufferFatPointersPass(*this)) +MODULE_PASS("amdgpu-lower-intrinsics", AMDGPULowerIntrinsicsPass(*this)) MODULE_PASS("amdgpu-lower-ctor-dtor", AMDGPUCtorDtorLoweringPass()) MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this)) MODULE_PASS("amdgpu-perf-hint", diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index e969f9ec88899..4a2f0a13b1325 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -577,6 +577,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(*PR); initializeAMDGPULowerModuleLDSLegacyPass(*PR); initializeAMDGPULowerBufferFatPointersPass(*PR); + initializeAMDGPULowerIntrinsicsLegacyPass(*PR); initializeAMDGPUReserveWWMRegsLegacyPass(*PR); initializeAMDGPURewriteAGPRCopyMFMALegacyPass(*PR); initializeAMDGPURewriteOutArgumentsPass(*PR); @@ -1418,6 +1419,7 @@ void AMDGPUPassConfig::addCodeGenPrepare() { // nodes out of the graph, which leads to function-level passes not // being run on them, which causes crashes in the resource usage analysis). addPass(createAMDGPULowerBufferFatPointersPass()); + addPass(createAMDGPULowerIntrinsicsLegacyPass()); // In accordance with the above FIXME, manually force all the // function-level passes into a CGSCCPassManager. addPass(new DummyCGSCCPass()); @@ -2155,9 +2157,10 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const { // nodes out of the graph, which leads to function-level passes not // being run on them, which causes crashes in the resource usage analysis). 
addPass(AMDGPULowerBufferFatPointersPass(TM)); - addPass.requireCGSCCOrder(); + addPass(AMDGPULowerIntrinsicsPass(TM)); + Base::addCodeGenPrepare(addPass); if (isPassEnabled(EnableLoadStoreVectorizer)) diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index dc9dd220130ea..619ff4e5c73c4 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -71,6 +71,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUImageIntrinsicOptimizer.cpp AMDGPULibFunc.cpp AMDGPULowerBufferFatPointers.cpp + AMDGPULowerIntrinsics.cpp AMDGPULowerKernelArguments.cpp AMDGPULowerKernelAttributes.cpp AMDGPULowerModuleLDSPass.cpp diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 66c1dfc71c2f5..e568b2d14d7ef 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -10421,41 +10421,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE; return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0); } - case Intrinsic::amdgcn_s_barrier: - case Intrinsic::amdgcn_s_barrier_signal: - case Intrinsic::amdgcn_s_barrier_wait: { - const GCNSubtarget &ST = MF.getSubtarget(); - if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) { - unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second; - if (WGSize <= ST.getWavefrontSize()) { - // If the workgroup fits in a wave, remove s_barrier_signal and lower - // s_barrier/s_barrier_wait to wave_barrier. - if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal) - return Op.getOperand(0); - else - return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, - MVT::Other, Op.getOperand(0)), - 0); - } - } - - if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) { - // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait - SDValue K = - DAG.getSignedTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32); - SDValue BarSignal = - SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL, - MVT::Other, K, Op.getOperand(0)), - 0); - SDValue BarWait = - SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K, - BarSignal.getValue(0)), - 0); - return BarWait; - } - - return SDValue(); - }; case Intrinsic::amdgcn_struct_tbuffer_store: case Intrinsic::amdgcn_struct_ptr_tbuffer_store: { diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll index ceed41f3ed7c5..6df3d255244d2 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll @@ -8,11 +8,11 @@ ; RUN: | FileCheck -check-prefix=GCN-O3 %s -; GCN-O0: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O0: require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) -; GCN-O2: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O2: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) -; GCN-O3: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O3: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) define void @empty() { ret void diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 3e17be6b34a57..36231abda87db 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -49,6 +49,7 @@ ; GCN-O0-NEXT: Expand reduction intrinsics ; GCN-O0-NEXT: AMDGPU Lower Kernel Arguments ; 
GCN-O0-NEXT: Lower buffer fat pointer operations to buffer resources +; GCN-O0-NEXT: AMDGPU lower intrinsics ; GCN-O0-NEXT: CallGraph Construction ; GCN-O0-NEXT: Call Graph SCC Pass Manager ; GCN-O0-NEXT: DummyCGSCCPass @@ -231,6 +232,7 @@ ; GCN-O1-NEXT: FunctionPass Manager ; GCN-O1-NEXT: AMDGPU Lower Kernel Arguments ; GCN-O1-NEXT: Lower buffer fat pointer operations to buffer resources +; GCN-O1-NEXT: AMDGPU lower intrinsics ; GCN-O1-NEXT: CallGraph Construction ; GCN-O1-NEXT: Call Graph SCC Pass Manager ; GCN-O1-NEXT: DummyCGSCCPass @@ -530,6 +532,7 @@ ; GCN-O1-OPTS-NEXT: FunctionPass Manager ; GCN-O1-OPTS-NEXT: AMDGPU Lower Kernel Arguments ; GCN-O1-OPTS-NEXT: Lower buffer fat pointer operations to buffer resources +; GCN-O1-OPTS-NEXT: AMDGPU lower intrinsics ; GCN-O1-OPTS-NEXT: CallGraph Construction ; GCN-O1-OPTS-NEXT: Call Graph SCC Pass Manager ; GCN-O1-OPTS-NEXT: DummyCGSCCPass @@ -847,6 +850,7 @@ ; GCN-O2-NEXT: FunctionPass Manager ; GCN-O2-NEXT: AMDGPU Lower Kernel Arguments ; GCN-O2-NEXT: Lower buffer fat pointer operations to buffer resources +; GCN-O2-NEXT: AMDGPU lower intrinsics ; GCN-O2-NEXT: CallGraph Construction ; GCN-O2-NEXT: Call Graph SCC Pass Manager ; GCN-O2-NEXT: DummyCGSCCPass @@ -1179,6 +1183,7 @@ ; GCN-O3-NEXT: FunctionPass Manager ; GCN-O3-NEXT: AMDGPU Lower Kernel Arguments ; GCN-O3-NEXT: Lower buffer fat pointer operations to buffer resources +; GCN-O3-NEXT: AMDGPU lower intrinsics ; GCN-O3-NEXT: CallGraph Construction ; GCN-O3-NEXT: Call Graph SCC Pass Manager ; GCN-O3-NEXT: DummyCGSCCPass diff --git a/llvm/test/CodeGen/AMDGPU/lower-intrinsics-barriers.ll b/llvm/test/CodeGen/AMDGPU/lower-intrinsics-barriers.ll new file mode 100644 index 0000000000000..bc70c3b36d45a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-intrinsics-barriers.ll @@ -0,0 +1,84 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -codegen-opt-level=0 | FileCheck --check-prefixes=GFX11,GFX11-NOOPT %s +; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -codegen-opt-level=1 -mattr=+wavefrontsize32 | FileCheck --check-prefixes=GFX11,OPT-WAVE32,GFX11-OPT-WAVE32 %s +; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -codegen-opt-level=1 -mattr=+wavefrontsize64 | FileCheck --check-prefixes=GFX11,OPT-WAVE64,GFX11-OPT-WAVE64 %s +; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -codegen-opt-level=0 | FileCheck --check-prefixes=GFX12,GFX12-NOOPT %s +; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -codegen-opt-level=1 -mattr=+wavefrontsize32 | FileCheck --check-prefixes=GFX12,OPT-WAVE32,GFX12-OPT-WAVE32 %s +; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -codegen-opt-level=1 -mattr=+wavefrontsize64 | FileCheck --check-prefixes=GFX12,OPT-WAVE64,GFX12-OPT-WAVE64 %s + +define amdgpu_kernel void @barrier() { +; GFX11-LABEL: define amdgpu_kernel void @barrier( +; GFX11-SAME: ) #[[ATTR0:[0-9]+]] { +; GFX11-NEXT: call void @llvm.amdgcn.s.barrier() +; GFX11-NEXT: ret void +; +; GFX12-LABEL: define amdgpu_kernel void @barrier( +; GFX12-SAME: ) #[[ATTR0:[0-9]+]] { +; GFX12-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1) +; GFX12-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1) +; GFX12-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier() + ret 
void +} + +define amdgpu_kernel void @barrier_32threads() "amdgpu-flat-work-group-size"="32,32" { +; GFX11-NOOPT-LABEL: define amdgpu_kernel void @barrier_32threads( +; GFX11-NOOPT-SAME: ) #[[ATTR1:[0-9]+]] { +; GFX11-NOOPT-NEXT: call void @llvm.amdgcn.s.barrier() +; GFX11-NOOPT-NEXT: ret void +; +; OPT-WAVE32-LABEL: define amdgpu_kernel void @barrier_32threads( +; OPT-WAVE32-SAME: ) #[[ATTR1:[0-9]+]] { +; OPT-WAVE32-NEXT: call void @llvm.amdgcn.wave.barrier() +; OPT-WAVE32-NEXT: ret void +; +; OPT-WAVE64-LABEL: define amdgpu_kernel void @barrier_32threads( +; OPT-WAVE64-SAME: ) #[[ATTR1:[0-9]+]] { +; OPT-WAVE64-NEXT: call void @llvm.amdgcn.wave.barrier() +; OPT-WAVE64-NEXT: ret void +; +; GFX12-NOOPT-LABEL: define amdgpu_kernel void @barrier_32threads( +; GFX12-NOOPT-SAME: ) #[[ATTR1:[0-9]+]] { +; GFX12-NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1) +; GFX12-NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1) +; GFX12-NOOPT-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier() + ret void +} + +define amdgpu_kernel void @barrier_64threads() "amdgpu-flat-work-group-size"="64,64" { +; GFX11-NOOPT-LABEL: define amdgpu_kernel void @barrier_64threads( +; GFX11-NOOPT-SAME: ) #[[ATTR2:[0-9]+]] { +; GFX11-NOOPT-NEXT: call void @llvm.amdgcn.s.barrier() +; GFX11-NOOPT-NEXT: ret void +; +; GFX11-OPT-WAVE32-LABEL: define amdgpu_kernel void @barrier_64threads( +; GFX11-OPT-WAVE32-SAME: ) #[[ATTR2:[0-9]+]] { +; GFX11-OPT-WAVE32-NEXT: call void @llvm.amdgcn.s.barrier() +; GFX11-OPT-WAVE32-NEXT: ret void +; +; OPT-WAVE64-LABEL: define amdgpu_kernel void @barrier_64threads( +; OPT-WAVE64-SAME: ) #[[ATTR2:[0-9]+]] { +; OPT-WAVE64-NEXT: call void @llvm.amdgcn.wave.barrier() +; OPT-WAVE64-NEXT: ret void +; +; GFX12-NOOPT-LABEL: define amdgpu_kernel void @barrier_64threads( +; GFX12-NOOPT-SAME: ) #[[ATTR2:[0-9]+]] { +; GFX12-NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1) +; GFX12-NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1) +; GFX12-NOOPT-NEXT: ret void +; +; GFX12-OPT-WAVE32-LABEL: define amdgpu_kernel void @barrier_64threads( +; GFX12-OPT-WAVE32-SAME: ) #[[ATTR2:[0-9]+]] { +; GFX12-OPT-WAVE32-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1) +; GFX12-OPT-WAVE32-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1) +; GFX12-OPT-WAVE32-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier() + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX11-OPT-WAVE64: {{.*}} +; GFX12-OPT-WAVE64: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/lower-intrinsics-split-barriers.ll b/llvm/test/CodeGen/AMDGPU/lower-intrinsics-split-barriers.ll new file mode 100644 index 0000000000000..69ad4b6793c1d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-intrinsics-split-barriers.ll @@ -0,0 +1,80 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -codegen-opt-level=0 | FileCheck --check-prefixes=CHECK,NOOPT %s +; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -codegen-opt-level=1 -mattr=+wavefrontsize32 | FileCheck --check-prefixes=CHECK,OPT-WAVE32 %s +; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -codegen-opt-level=1 -mattr=+wavefrontsize64 | FileCheck --check-prefixes=CHECK,OPT-WAVE64 %s + +declare void @foo(i1) + +define amdgpu_kernel void @barrier() { +; CHECK-LABEL: define amdgpu_kernel void @barrier( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1) +; CHECK-NEXT: [[ISFIRST:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1) +; CHECK-NEXT: call void @foo(i1 [[ISFIRST]]) +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier.signal(i32 -1) + call void @llvm.amdgcn.s.barrier.wait(i16 -1) + %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1) + call void @foo(i1 %isfirst) + ret void +} + +define amdgpu_kernel void @barrier_32threads() "amdgpu-flat-work-group-size"="32,32" { +; NOOPT-LABEL: define amdgpu_kernel void @barrier_32threads( +; NOOPT-SAME: ) #[[ATTR1:[0-9]+]] { +; NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1) +; NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1) +; NOOPT-NEXT: [[ISFIRST:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1) +; NOOPT-NEXT: call void @foo(i1 [[ISFIRST]]) +; NOOPT-NEXT: ret void +; +; OPT-WAVE32-LABEL: define amdgpu_kernel void @barrier_32threads( +; OPT-WAVE32-SAME: ) #[[ATTR1:[0-9]+]] { +; OPT-WAVE32-NEXT: call void @llvm.amdgcn.wave.barrier() +; OPT-WAVE32-NEXT: call void @foo(i1 true) +; OPT-WAVE32-NEXT: ret void +; +; OPT-WAVE64-LABEL: define amdgpu_kernel void @barrier_32threads( +; OPT-WAVE64-SAME: ) #[[ATTR1:[0-9]+]] { +; OPT-WAVE64-NEXT: call void @llvm.amdgcn.wave.barrier() +; OPT-WAVE64-NEXT: call void @foo(i1 true) +; OPT-WAVE64-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier.signal(i32 -1) + call void @llvm.amdgcn.s.barrier.wait(i16 -1) + %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1) + call void @foo(i1 %isfirst) + ret void +} + +define amdgpu_kernel void @barrier_64threads() "amdgpu-flat-work-group-size"="64,64" { +; NOOPT-LABEL: define amdgpu_kernel void @barrier_64threads( +; NOOPT-SAME: ) #[[ATTR2:[0-9]+]] { +; NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1) +; NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1) +; NOOPT-NEXT: [[ISFIRST:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1) +; NOOPT-NEXT: call void @foo(i1 [[ISFIRST]]) +; NOOPT-NEXT: ret void +; +; OPT-WAVE32-LABEL: define amdgpu_kernel void @barrier_64threads( +; OPT-WAVE32-SAME: ) #[[ATTR2:[0-9]+]] { +; OPT-WAVE32-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1) +; OPT-WAVE32-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1) +; 
OPT-WAVE32-NEXT: [[ISFIRST:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1) +; OPT-WAVE32-NEXT: call void @foo(i1 [[ISFIRST]]) +; OPT-WAVE32-NEXT: ret void +; +; OPT-WAVE64-LABEL: define amdgpu_kernel void @barrier_64threads( +; OPT-WAVE64-SAME: ) #[[ATTR2:[0-9]+]] { +; OPT-WAVE64-NEXT: call void @llvm.amdgcn.wave.barrier() +; OPT-WAVE64-NEXT: call void @foo(i1 true) +; OPT-WAVE64-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier.signal(i32 -1) + call void @llvm.amdgcn.s.barrier.wait(i16 -1) + %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1) + call void @foo(i1 %isfirst) + ret void +} From 56f898c9d589c9cc2d07499e469e762344cc19cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= Date: Wed, 27 Aug 2025 15:40:33 -0700 Subject: [PATCH 2/2] Stylistic change --- llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp index 86549f3032ed1..a30d9cb0412a4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp @@ -73,10 +73,7 @@ bool AMDGPULowerIntrinsicsImpl::run() { case Intrinsic::amdgcn_s_barrier_signal: case Intrinsic::amdgcn_s_barrier_signal_isfirst: case Intrinsic::amdgcn_s_barrier_wait: - forEachCall(F, [&](IntrinsicInst *II) { - if (visitBarrier(*II)) - Changed = true; - }); + forEachCall(F, [&](IntrinsicInst *II) { Changed |= visitBarrier(*II); }); break; } }
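For illustration, a minimal sketch of the effect of the new pass, mirroring the RUN and CHECK lines of the lower-intrinsics-barriers.ll test added above (the kernel names @example and @example_single_wave and the input file name example.ll are placeholders):

  opt -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa \
      -mcpu=gfx1200 -codegen-opt-level=1 example.ll

  define amdgpu_kernel void @example() {
    call void @llvm.amdgcn.s.barrier()
    ret void
  }

becomes the split-barrier sequence on a gfx12 target:

  define amdgpu_kernel void @example() {
    call void @llvm.amdgcn.s.barrier.signal(i32 -1)  ; -1 == AMDGPU::Barrier::WORKGROUP
    call void @llvm.amdgcn.s.barrier.wait(i16 -1)
    ret void
  }

while a kernel restricted to "amdgpu-flat-work-group-size"="32,32" on a wave32 subtarget (the whole workgroup fits in one wave) is downgraded, at -codegen-opt-level above 0, to a wave barrier:

  define amdgpu_kernel void @example_single_wave() "amdgpu-flat-work-group-size"="32,32" {
    call void @llvm.amdgcn.wave.barrier()
    ret void
  }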