From d6f729761c77b7fb380d4c23296f8f5c9cb6d984 Mon Sep 17 00:00:00 2001
From: James Y Knight
Date: Mon, 6 Oct 2025 20:43:27 -0400
Subject: [PATCH 1/4] AMDGPU: don't call getSubtarget on R600 targets.

Unfortunately, `TargetMachine::getSubtarget` does an unchecked
static_cast to `ST&`, which makes it easy to get wrong.

The modifications here were created by running check-llvm with an
assert added to getSubtarget. However, that assert requires RTTI, which
LLVM doesn't use, so I've reverted the assert before sending this fix
upstream.
---
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp      | 26 ++++++++++++-------
 .../AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp     |  2 +-
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index cb49936871e74..f6186f1fae777 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -162,12 +162,16 @@ class AMDGPUInformationCache : public InformationCache {

   /// Check if the subtarget has aperture regs.
   bool hasApertureRegs(Function &F) {
+    if (!TM.getTargetTriple().isAMDGCN())
+      return false;
     const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
     return ST.hasApertureRegs();
   }

   /// Check if the subtarget supports GetDoorbellID.
   bool supportsGetDoorbellID(Function &F) {
+    if (!TM.getTargetTriple().isAMDGCN())
+      return false;
     const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
     return ST.supportsGetDoorbellID();
   }
@@ -182,18 +186,18 @@ class AMDGPUInformationCache : public InformationCache {

   std::pair<unsigned, unsigned>
   getDefaultFlatWorkGroupSize(const Function &F) const {
-    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+    const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
     return ST.getDefaultFlatWorkGroupSize(F.getCallingConv());
   }

   std::pair<unsigned, unsigned>
   getMaximumFlatWorkGroupRange(const Function &F) {
-    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+    const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
     return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
   }

   SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) {
-    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+    const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
     return ST.getMaxNumWorkGroups(F);
   }

@@ -206,7 +210,7 @@ class AMDGPUInformationCache : public InformationCache {
   std::pair<unsigned, unsigned>
   getWavesPerEU(const Function &F,
                 std::pair<unsigned, unsigned> FlatWorkGroupSize) {
-    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+    const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
     return ST.getWavesPerEU(FlatWorkGroupSize, getLDSSize(F), F);
   }

@@ -217,7 +221,7 @@ class AMDGPUInformationCache : public InformationCache {
     if (!Val)
       return std::nullopt;
     if (!Val->second) {
-      const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+      const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
       Val->second = ST.getMaxWavesPerEU();
     }
     return std::make_pair(Val->first, *(Val->second));
@@ -227,13 +231,13 @@ class AMDGPUInformationCache : public InformationCache {
   getEffectiveWavesPerEU(const Function &F,
                          std::pair<unsigned, unsigned> WavesPerEU,
                          std::pair<unsigned, unsigned> FlatWorkGroupSize) {
-    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+    const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
     return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize,
                                      getLDSSize(F));
   }

   unsigned getMaxWavesPerEU(const Function &F) {
-    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+    const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
     return ST.getMaxWavesPerEU();
   }

@@ -1511,9 +1515,11 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
       A.getOrCreateAAFor(IRPosition::function(*F));
     }

-    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*F);
-    if (!F->isDeclaration() && ST.hasClusters())
-      A.getOrCreateAAFor(IRPosition::function(*F));
+    if (TM.getTargetTriple().isAMDGCN()) {
+      const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*F);
+      if (!F->isDeclaration() && ST.hasClusters())
+        A.getOrCreateAAFor(IRPosition::function(*F));
+    }

     for (auto &I : instructions(F)) {
       Value *Ptr = nullptr;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
index 639089c75a33e..155c7dad904c5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
@@ -281,7 +281,7 @@ bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *, 4>> MergeableInsts) {
 }

 static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM) {
-  if (!TM)
+  if (!TM || !TM->getTargetTriple().isAMDGCN())
     return false;

   // This optimization only applies to GFX11 and beyond.

From 4becdd9fab562fb01305c1ebc7ac5a9c2ef6fe5a Mon Sep 17 00:00:00 2001
From: James Y Knight
Date: Mon, 6 Oct 2025 21:53:26 -0400
Subject: [PATCH 2/4] Review feedback: don't add AMDGPUImageIntrinsicOptimizer
 pass for R600.

---
 llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp | 2 +-
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp           | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
index 155c7dad904c5..639089c75a33e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
@@ -281,7 +281,7 @@ bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *, 4>> MergeableInsts) {
 }

 static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM) {
-  if (!TM || !TM->getTargetTriple().isAMDGCN())
+  if (!TM)
     return false;

   // This optimization only applies to GFX11 and beyond.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 280fbe20667c6..d56e743d753a2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1296,7 +1296,8 @@ void AMDGPUPassConfig::addIRPasses() {
   if (LowerCtorDtor)
     addPass(createAMDGPUCtorDtorLoweringLegacyPass());

-  if (isPassEnabled(EnableImageIntrinsicOptimizer))
+  if (TM.getTargetTriple().isAMDGCN() &&
+      isPassEnabled(EnableImageIntrinsicOptimizer))
     addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));

   // This can be disabled by passing ::Disable here or on the command line

From 9102ec66ca374f505c30ca10968953d03a4701b0 Mon Sep 17 00:00:00 2001
From: James Y Knight
Date: Tue, 7 Oct 2025 09:56:28 -0400
Subject: [PATCH 3/4] Review feedback 2: Don't run AMDGPUAttributor for R600.

---
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp   | 26 +++++++------------
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 20 +++++++-------
 2 files changed, 21 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index f6186f1fae777..cb49936871e74 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -162,16 +162,12 @@ class AMDGPUInformationCache : public InformationCache {

   /// Check if the subtarget has aperture regs.
   bool hasApertureRegs(Function &F) {
-    if (!TM.getTargetTriple().isAMDGCN())
-      return false;
     const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
     return ST.hasApertureRegs();
   }

   /// Check if the subtarget supports GetDoorbellID.
   bool supportsGetDoorbellID(Function &F) {
-    if (!TM.getTargetTriple().isAMDGCN())
-      return false;
     const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
     return ST.supportsGetDoorbellID();
   }
@@ -186,18 +182,18 @@ class AMDGPUInformationCache : public InformationCache {

   std::pair<unsigned, unsigned>
   getDefaultFlatWorkGroupSize(const Function &F) const {
-    const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
+    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
     return ST.getDefaultFlatWorkGroupSize(F.getCallingConv());
   }

   std::pair<unsigned, unsigned>
   getMaximumFlatWorkGroupRange(const Function &F) {
-    const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
+    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
     return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
   }

   SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) {
-    const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
+    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
     return ST.getMaxNumWorkGroups(F);
   }

@@ -210,7 +206,7 @@ class AMDGPUInformationCache : public InformationCache {
   std::pair<unsigned, unsigned>
   getWavesPerEU(const Function &F,
                 std::pair<unsigned, unsigned> FlatWorkGroupSize) {
-    const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
+    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
     return ST.getWavesPerEU(FlatWorkGroupSize, getLDSSize(F), F);
   }

@@ -221,7 +217,7 @@ class AMDGPUInformationCache : public InformationCache {
     if (!Val)
       return std::nullopt;
     if (!Val->second) {
-      const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
+      const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
       Val->second = ST.getMaxWavesPerEU();
     }
     return std::make_pair(Val->first, *(Val->second));
@@ -231,13 +227,13 @@ class AMDGPUInformationCache : public InformationCache {
   getEffectiveWavesPerEU(const Function &F,
                          std::pair<unsigned, unsigned> WavesPerEU,
                          std::pair<unsigned, unsigned> FlatWorkGroupSize) {
-    const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
+    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
     return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize,
                                      getLDSSize(F));
   }

   unsigned getMaxWavesPerEU(const Function &F) {
-    const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
+    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
     return ST.getMaxWavesPerEU();
   }

@@ -1515,11 +1511,9 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
       A.getOrCreateAAFor(IRPosition::function(*F));
     }

-    if (TM.getTargetTriple().isAMDGCN()) {
-      const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*F);
-      if (!F->isDeclaration() && ST.hasClusters())
-        A.getOrCreateAAFor(IRPosition::function(*F));
-    }
+    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*F);
+    if (!F->isDeclaration() && ST.hasClusters())
+      A.getOrCreateAAFor(IRPosition::function(*F));

     for (auto &I : instructions(F)) {
       Value *Ptr = nullptr;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index d56e743d753a2..0eeee02ce1b50 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -924,16 +924,18 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
   });

   // FIXME: Why is AMDGPUAttributor not in CGSCC?
-  PB.registerOptimizerLastEPCallback([this](ModulePassManager &MPM,
-                                            OptimizationLevel Level,
-                                            ThinOrFullLTOPhase Phase) {
-    if (Level != OptimizationLevel::O0) {
-      if (!isLTOPreLink(Phase)) {
-        AMDGPUAttributorOptions Opts;
-        MPM.addPass(AMDGPUAttributorPass(*this, Opts, Phase));
+  if (getTargetTriple().isAMDGCN()) {
+    PB.registerOptimizerLastEPCallback([this](ModulePassManager &MPM,
+                                              OptimizationLevel Level,
+                                              ThinOrFullLTOPhase Phase) {
+      if (Level != OptimizationLevel::O0) {
+        if (!isLTOPreLink(Phase)) {
+          AMDGPUAttributorOptions Opts;
+          MPM.addPass(AMDGPUAttributorPass(*this, Opts, Phase));
+        }
       }
-    }
-  });
+    });
+  }

   PB.registerFullLinkTimeOptimizationLastEPCallback(
       [this](ModulePassManager &PM, OptimizationLevel Level) {

From 95348a6d6a5b57fc2d0c947fce7f8246cb55dd4e Mon Sep 17 00:00:00 2001
From: James Y Knight
Date: Tue, 7 Oct 2025 11:35:46 -0400
Subject: [PATCH 4/4] Address review comment.

---
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 0eeee02ce1b50..723d07e8ea079 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -924,18 +924,18 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
   });

   // FIXME: Why is AMDGPUAttributor not in CGSCC?
-  if (getTargetTriple().isAMDGCN()) {
-    PB.registerOptimizerLastEPCallback([this](ModulePassManager &MPM,
-                                              OptimizationLevel Level,
-                                              ThinOrFullLTOPhase Phase) {
-      if (Level != OptimizationLevel::O0) {
-        if (!isLTOPreLink(Phase)) {
+  PB.registerOptimizerLastEPCallback([this](ModulePassManager &MPM,
+                                            OptimizationLevel Level,
+                                            ThinOrFullLTOPhase Phase) {
+    if (Level != OptimizationLevel::O0) {
+      if (!isLTOPreLink(Phase)) {
+        if (getTargetTriple().isAMDGCN()) {
           AMDGPUAttributorOptions Opts;
           MPM.addPass(AMDGPUAttributorPass(*this, Opts, Phase));
         }
       }
-    });
-  }
+    }
+  });

   PB.registerFullLinkTimeOptimizationLastEPCallback(
       [this](ModulePassManager &PM, OptimizationLevel Level) {
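
Editor's note, not part of the patches above: the first commit message describes
finding these bad calls by temporarily adding an assert to
`TargetMachine::getSubtarget` (defined in llvm/include/llvm/Target/TargetMachine.h)
and running check-llvm. A minimal sketch of what such a local-only check might
look like, assuming an RTTI-enabled build; the exact assert the author used is not
shown in the series, and the upstream definition remains a plain unchecked
static_cast:

    // Hypothetical, RTTI-dependent variant of TargetMachine::getSubtarget for
    // local debugging only; it is not included in any of the patches above.
    template <typename STC>
    const STC &getSubtarget(const Function &F) const {
      const TargetSubtargetInfo *STI = getSubtargetImpl(F);
      // With RTTI enabled, dynamic_cast returns null when the concrete subtarget
      // is not (derived from) STC, e.g. an R600Subtarget requested as GCNSubtarget.
      assert(dynamic_cast<const STC *>(STI) &&
             "getSubtarget: subtarget is not of the requested type");
      return *static_cast<const STC *>(STI);
    }

Running check-llvm with a check along these lines is how the wrong-subtarget call
sites fixed in this series were located.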