-
Notifications
You must be signed in to change notification settings - Fork 14.9k
AMDGPU: Refactor lowering of s_barrier to split barriers #154648
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Nicolai Hähnle (nhaehnle) Changes: Let's do the lowering of non-split into split barriers in a new IR pass, AMDGPULowerIntrinsics. That way, there is no code duplication between SelectionDAG and GlobalISel. This simplifies some upcoming extensions to the code. (cherry picked from commit e246f42fbdad5667d5a395ce65f4900d67610e72) Patch is 29.84 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154648.diff 8 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 0059a862ba9b2..e3d4670af9e14 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -62,6 +62,7 @@ FunctionPass *createAMDGPURewriteOutArgumentsPass();
ModulePass *
createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM = nullptr);
ModulePass *createAMDGPULowerBufferFatPointersPass();
+FunctionPass *createAMDGPULowerIntrinsicsLegacyPass();
FunctionPass *createSIModeRegisterPass();
FunctionPass *createGCNPreRAOptimizationsLegacyPass();
FunctionPass *createAMDGPUPreloadKernArgPrologLegacyPass();
@@ -153,6 +154,16 @@ struct AMDGPULowerBufferFatPointersPass
const TargetMachine &TM;
};
+void initializeAMDGPULowerIntrinsicsLegacyPass(PassRegistry &);
+
+struct AMDGPULowerIntrinsicsPass : PassInfoMixin<AMDGPULowerIntrinsicsPass> {
+ AMDGPULowerIntrinsicsPass(const TargetMachine &TM) : TM(TM) {}
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+private:
+ const TargetMachine &TM;
+};
+
void initializeAMDGPUPrepareAGPRAllocLegacyPass(PassRegistry &);
extern char &AMDGPUPrepareAGPRAllocLegacyID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 5d31eed8fe7d7..eabf729d4853a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2007,18 +2007,6 @@ bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
}
}
- if (STI.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
- // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
- MachineBasicBlock *MBB = MI.getParent();
- const DebugLoc &DL = MI.getDebugLoc();
- BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
- .addImm(AMDGPU::Barrier::WORKGROUP);
- BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
- .addImm(AMDGPU::Barrier::WORKGROUP);
- MI.eraseFromParent();
- return true;
- }
-
return selectImpl(MI, *CoverageInfo);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
new file mode 100644
index 0000000000000..fe70edb9d18f4
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -0,0 +1,174 @@
+//===-- AMDGPULowerIntrinsics.cpp -------------------------------------------=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Lower intrinsics that would otherwise require separate handling in both
+// SelectionDAG and GlobalISel.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "amdgpu-lower-intrinsics"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPULowerIntrinsicsImpl {
+public:
+ Function &F;
+ const AMDGPUTargetMachine &TM;
+ const GCNSubtarget &ST;
+ DominatorTree *DT = nullptr;
+ PostDominatorTree *PDT = nullptr;
+ LoopInfo *LI = nullptr;
+
+ AMDGPULowerIntrinsicsImpl(Function &F, const AMDGPUTargetMachine &TM)
+ : F(F), TM(TM), ST(TM.getSubtarget<GCNSubtarget>(F)) {}
+
+ bool run();
+
+private:
+ bool visit(Instruction *I);
+ void splitBarrier(IntrinsicInst &I);
+};
+
+class AMDGPULowerIntrinsicsLegacy : public FunctionPass {
+public:
+ static char ID;
+
+ AMDGPULowerIntrinsicsLegacy() : FunctionPass(ID) {}
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<PostDominatorTreeWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ }
+};
+
+} // anonymous namespace
+
+bool AMDGPULowerIntrinsicsImpl::run() {
+ if (!ST.hasSplitBarriers())
+ return false;
+
+ bool Changed = false;
+
+ for (auto BBIt = F.begin(); BBIt != F.end(); ++BBIt) {
+ for (auto IIt = BBIt->begin(); IIt != BBIt->end();) {
+ Instruction *I = &*IIt;
+ ++IIt; // early increment in case the instruction is erased
+ if (I->isTerminator())
+ continue;
+
+ Changed |= visit(I);
+
+ // Fixup the basic block iterator in case the CFG was changed.
+ BBIt = IIt->getParent()->getIterator();
+ }
+ }
+
+ return Changed;
+}
+
+bool AMDGPULowerIntrinsicsImpl::visit(Instruction *I) {
+ if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::amdgcn_s_barrier:
+ if (ST.hasSplitBarriers()) {
+ splitBarrier(*II);
+ return true;
+ }
+ return false;
+ default:
+ return false;
+ }
+ }
+
+ return false;
+}
+
+// Lower s_{cluster_}barrier to a sequence of split barrier intrinsics.
+void AMDGPULowerIntrinsicsImpl::splitBarrier(IntrinsicInst &I) {
+ assert(I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier);
+
+ IRBuilder<> B(&I);
+ unsigned WGMaxSize = ST.getFlatWorkGroupSizes(*I.getFunction()).second;
+ bool IsSingleWaveWG = WGMaxSize <= ST.getWavefrontSize();
+
+ if (IsSingleWaveWG) {
+ // The workgroup fits in a single wave. Downgrade to a wave barrier.
+ B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_wave_barrier, {});
+ } else {
+ // Lower to split barriers.
+ Value *BarrierID_32 = B.getInt32(AMDGPU::Barrier::WORKGROUP);
+ Value *BarrierID_16 = B.getInt16(AMDGPU::Barrier::WORKGROUP);
+ B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_signal,
+ {BarrierID_32});
+ B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_wait,
+ {BarrierID_16});
+ }
+
+ I.eraseFromParent();
+}
+
+PreservedAnalyses AMDGPULowerIntrinsicsPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ AMDGPULowerIntrinsicsImpl Impl(F,
+ static_cast<const AMDGPUTargetMachine &>(TM));
+ Impl.DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
+ Impl.PDT = AM.getCachedResult<PostDominatorTreeAnalysis>(F);
+ Impl.LI = AM.getCachedResult<LoopAnalysis>(F);
+ if (!Impl.run())
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<PostDominatorTreeAnalysis>();
+ PA.preserve<LoopAnalysis>();
+ return PA;
+}
+
+bool AMDGPULowerIntrinsicsLegacy::runOnFunction(Function &F) {
+ auto &TPC = getAnalysis<TargetPassConfig>();
+ const AMDGPUTargetMachine &TM = TPC.getTM<AMDGPUTargetMachine>();
+
+ AMDGPULowerIntrinsicsImpl Impl(F, TM);
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>();
+ auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
+ Impl.DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ Impl.PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
+ Impl.LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
+ return Impl.run();
+}
+
+#define PASS_DESC "AMDGPU lower intrinsics"
+INITIALIZE_PASS_BEGIN(AMDGPULowerIntrinsicsLegacy, DEBUG_TYPE, PASS_DESC, false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(AMDGPULowerIntrinsicsLegacy, DEBUG_TYPE, PASS_DESC, false,
+ false)
+
+char AMDGPULowerIntrinsicsLegacy::ID = 0;
+
+FunctionPass *llvm::createAMDGPULowerIntrinsicsLegacyPass() {
+ return new AMDGPULowerIntrinsicsLegacy;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index e393aa1987744..d9d06d6814386 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -538,6 +538,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(*PR);
initializeAMDGPULowerModuleLDSLegacyPass(*PR);
initializeAMDGPULowerBufferFatPointersPass(*PR);
+ initializeAMDGPULowerIntrinsicsLegacyPass(*PR);
initializeAMDGPUReserveWWMRegsLegacyPass(*PR);
initializeAMDGPURewriteAGPRCopyMFMALegacyPass(*PR);
initializeAMDGPURewriteOutArgumentsPass(*PR);
@@ -1379,6 +1380,7 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
// nodes out of the graph, which leads to function-level passes not
// being run on them, which causes crashes in the resource usage analysis).
addPass(createAMDGPULowerBufferFatPointersPass());
+ addPass(createAMDGPULowerIntrinsicsLegacyPass());
// In accordance with the above FIXME, manually force all the
// function-level passes into a CGSCCPassManager.
addPass(new DummyCGSCCPass());
@@ -2116,9 +2118,10 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
// nodes out of the graph, which leads to function-level passes not
// being run on them, which causes crashes in the resource usage analysis).
addPass(AMDGPULowerBufferFatPointersPass(TM));
-
addPass.requireCGSCCOrder();
+ addPass(AMDGPULowerIntrinsicsPass(TM));
+
Base::addCodeGenPrepare(addPass);
if (isPassEnabled(EnableLoadStoreVectorizer))
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index dc9dd220130ea..619ff4e5c73c4 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -71,6 +71,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUImageIntrinsicOptimizer.cpp
AMDGPULibFunc.cpp
AMDGPULowerBufferFatPointers.cpp
+ AMDGPULowerIntrinsics.cpp
AMDGPULowerKernelArguments.cpp
AMDGPULowerKernelAttributes.cpp
AMDGPULowerModuleLDSPass.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 561019bb65549..19e93742633ca 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10439,21 +10439,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
}
}
- if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
- // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
- SDValue K =
- DAG.getSignedTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
- SDValue BarSignal =
- SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
- MVT::Other, K, Op.getOperand(0)),
- 0);
- SDValue BarWait =
- SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
- BarSignal.getValue(0)),
- 0);
- return BarWait;
- }
-
return SDValue();
};
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
index ceed41f3ed7c5..460e69622a652 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
@@ -8,11 +8,11 @@
; RUN: | FileCheck -check-prefix=GCN-O3 %s
-; GCN-O0: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
+; GCN-O0: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(AMDGPULowerIntrinsicsPass,lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
-; GCN-O2: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
+; GCN-O2: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(AMDGPULowerIntrinsicsPass,codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-m...
[truncated]
|
// On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
MachineBasicBlock *MBB = MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
.addImm(AMDGPU::Barrier::WORKGROUP);
BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
.addImm(AMDGPU::Barrier::WORKGROUP);
MI.eraseFromParent();
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This isn't so much code that duplicating between globalisel and selection dag is a problem. Is a whole pass really needed for this one tiny case? Can we get this into one of the existing IR lowering passes (e.g. PreISelIntrinsicLowering)?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
PreISelIntrinsicLowering doesn't handle target-specific intrinsics. So is it better to teach that pass how to call into target hooks to handle target intrinsics, or to have a target-specific pass like in this patch?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've looked at this. We could perhaps add a hook to TargetTransformInfo, but it feels a bit out of place. PreISelIntrinsicLowering iterates over the functions of the module and looks at calls from there, which means that hooking anything into that pass would basically only re-use this loop over the functions of the module.
I can sort of see the point about breaking the module pass manager (although -- isn't it really the other way around? Breaking up a function pass manager hurts, but a function pass inside a module pass manager ought to be fairly unproblematic), and perhaps a larger point that iterating over all code just for this is a bit excessive.
So I'm going to change this into a module pass that iterates like PreISelIntrinsicLowering.
As for how much code this is: I ran into this precisely because it's about to become a whole lot more code in the near future. This change is simply trying to prepare upstream for that.
fb93c7e
to
773769f
Compare
✅ With the latest revision this PR passed the C/C++ code formatter. |
49f1318
to
6727443
Compare
Addressed the review comments and rebased on main to fix a build issue. |
Gentle ping |
Let's do the lowering of non-split into split barriers and the downgrading of barriers based on the workgroup size in a new IR pass, AMDGPULowerIntrinsics. That way, there is no code duplication between SelectionDAG and GlobalISel. This simplifies some upcoming extensions to the code. v2: - turn into a Module pass - also handle the downgrading of barriers for single-wave workgroups in the IR pass - add tests for the new pass (cherry picked from commit e246f42fbdad5667d5a395ce65f4900d67610e72)
6727443
to
de0e91f
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
I don't think this justifies a new pass and I'd rather just duplicate the code between the selectors. If you really wanted the code savings it would make more sense as an isel pseudo |
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/186/builds/11973 Here is the relevant piece of the build log for the reference
|
I'm not a fan of doing ever more stuff in MachineIR. The tooling around IR is better, the compiler code working on it tends to be cleaner, and it's generally better understood by more people. In this particular case for example, we have downstream changes that introduce CFG manipulation into this code. And yeah, we have plenty of code that does that in MachineIR, but I still think it's better to do that kind of thing in IR. |
Let's do the lowering of non-split into split barriers in a new IR pass, AMDGPULowerIntrinsics. That way, there is no code duplication between SelectionDAG and GlobalISel. This simplifies some upcoming extensions to the code.
(cherry picked from commit e246f42fbdad5667d5a395ce65f4900d67610e72)