Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions clang/lib/CodeGen/BackendUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1115,6 +1115,10 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
if (CodeGenOpts.LinkBitcodePostopt)
MPM.addPass(LinkInModulesPass(BC));

if (LangOpts.HIPStdPar && !LangOpts.CUDAIsDevice &&
LangOpts.HIPStdParInterposeAlloc)
MPM.addPass(HipStdParAllocationInterpositionPass());

// Add a verifier pass if requested. We don't have to do this if the action
// requires code generation because there will already be a verifier pass in
// the code-generation pipeline.
Expand Down Expand Up @@ -1178,10 +1182,6 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
return;
}

if (LangOpts.HIPStdPar && !LangOpts.CUDAIsDevice &&
LangOpts.HIPStdParInterposeAlloc)
MPM.addPass(HipStdParAllocationInterpositionPass());

// Now that we have all of the passes ready, run them.
{
PrettyStackTraceString CrashInfo("Optimizer");
Expand Down
7 changes: 4 additions & 3 deletions clang/lib/Driver/ToolChains/HIPAMD.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -231,10 +231,11 @@ void HIPAMDToolChain::addClangTargetOptions(
CC1Args.append({"-fcuda-is-device", "-fno-threadsafe-statics"});

if (!DriverArgs.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
false))
false)) {
CC1Args.append({"-mllvm", "-amdgpu-internalize-symbols"});
if (DriverArgs.hasArgNoClaim(options::OPT_hipstdpar))
CC1Args.append({"-mllvm", "-amdgpu-enable-hipstdpar"});
if (DriverArgs.hasArgNoClaim(options::OPT_hipstdpar))
CC1Args.append({"-mllvm", "-amdgpu-enable-hipstdpar"});
}

StringRef MaxThreadsPerBlock =
DriverArgs.getLastArgValue(options::OPT_gpu_max_threads_per_block_EQ);
Expand Down
20 changes: 13 additions & 7 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -802,17 +802,17 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
#define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
#include "llvm/Passes/TargetPassRegistry.inc"

PB.registerPipelineStartEPCallback(
[](ModulePassManager &PM, OptimizationLevel Level) {
if (EnableHipStdPar)
PM.addPass(HipStdParAcceleratorCodeSelectionPass());
});

PB.registerPipelineEarlySimplificationEPCallback(
[](ModulePassManager &PM, OptimizationLevel Level,
ThinOrFullLTOPhase Phase) {
if (!isLTOPreLink(Phase))
if (!isLTOPreLink(Phase)) {
// When we are not using -fgpu-rdc, we can run accelerator code
// selection relatively early, but still after linking to prevent
// eager removal of potentially reachable symbols.
if (EnableHipStdPar)
PM.addPass(HipStdParAcceleratorCodeSelectionPass());
PM.addPass(AMDGPUPrintfRuntimeBindingPass());
}

if (Level == OptimizationLevel::O0)
return;
Expand Down Expand Up @@ -883,6 +883,12 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {

PB.registerFullLinkTimeOptimizationLastEPCallback(
[this](ModulePassManager &PM, OptimizationLevel Level) {
// When we are using -fgpu-rdc, we can only run accelerator code
// selection after linking; otherwise we end up removing
// potentially reachable symbols that were exported as external in other
// modules.
if (EnableHipStdPar)
PM.addPass(HipStdParAcceleratorCodeSelectionPass());
// We want to support the -lto-partitions=N option as "best effort".
// For that, we need to lower LDS earlier in the pipeline before the
// module is partitioned for codegen.
Expand Down