Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions clang/lib/CodeGen/BackendUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1115,6 +1115,10 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
if (CodeGenOpts.LinkBitcodePostopt)
MPM.addPass(LinkInModulesPass(BC));

if (LangOpts.HIPStdPar && !LangOpts.CUDAIsDevice &&
LangOpts.HIPStdParInterposeAlloc)
MPM.addPass(HipStdParAllocationInterpositionPass());

// Add a verifier pass if requested. We don't have to do this if the action
// requires code generation because there will already be a verifier pass in
// the code-generation pipeline.
Expand Down Expand Up @@ -1178,10 +1182,6 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
return;
}

if (LangOpts.HIPStdPar && !LangOpts.CUDAIsDevice &&
LangOpts.HIPStdParInterposeAlloc)
MPM.addPass(HipStdParAllocationInterpositionPass());

// Now that we have all of the passes ready, run them.
{
PrettyStackTraceString CrashInfo("Optimizer");
Expand Down
7 changes: 4 additions & 3 deletions clang/lib/Driver/ToolChains/HIPAMD.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -231,10 +231,11 @@ void HIPAMDToolChain::addClangTargetOptions(
CC1Args.append({"-fcuda-is-device", "-fno-threadsafe-statics"});

if (!DriverArgs.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
false))
false)) {
CC1Args.append({"-mllvm", "-amdgpu-internalize-symbols"});
if (DriverArgs.hasArgNoClaim(options::OPT_hipstdpar))
CC1Args.append({"-mllvm", "-amdgpu-enable-hipstdpar"});
if (DriverArgs.hasArgNoClaim(options::OPT_hipstdpar))
CC1Args.append({"-mllvm", "-amdgpu-enable-hipstdpar"});
}

StringRef MaxThreadsPerBlock =
DriverArgs.getLastArgValue(options::OPT_gpu_max_threads_per_block_EQ);
Expand Down
11 changes: 11 additions & 0 deletions clang/test/CodeGenHipStdPar/rdc-does-not-enable-hipstdpar.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// Check that if we are compiling with fgpu-rdc amdgpu-enable-hipstdpar is not
// passed to CC1, to avoid eager, per TU, removal of potentially accessible
// functions.

// RUN: %clang -### --hipstdpar --offload-arch=gfx906 %s -nogpulib -nogpuinc \
// RUN: 2>&1 | FileCheck -check-prefix=NORDC %s
// NORDC: {{.*}}"-mllvm" "-amdgpu-enable-hipstdpar"

// RUN: %clang -### --hipstdpar --offload-arch=gfx906 %s -nogpulib -nogpuinc -fgpu-rdc \
// RUN: 2>&1 | FileCheck -check-prefix=RDC %s
// RDC-NOT: {{.*}}"-mllvm" "-amdgpu-enable-hipstdpar"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// RDC-NOT: {{.*}}"-mllvm" "-amdgpu-enable-hipstdpar"
// RDC-NOT: -amdgpu-enable-hipstdpar

-NOT checks are hazardous and should be as permissive as possible

Suggested change
// RDC-NOT: {{.*}}"-mllvm" "-amdgpu-enable-hipstdpar"
// RDC-NOT: {{.*}}"-mllvm" "-amdgpu-enable-hipstdpar"

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this case it actually has to be "-mllvm" because we only care about it not being passed to the initial from source, per TU compilation; forming the check as you suggest would (erroneously) match the (intentional) passing of the argument via -plugin-opt, when we do the final lowering from bitcode. This merely tests/validates the change we did in HIPAMDToolChain::addClangTargetOptions.

Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// Test that the accelerator code selection pass only gets invoked after linking

// Ensure Pass HipStdParAcceleratorCodeSelectionPass is not invoked in PreLink.
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -mllvm -amdgpu-enable-hipstdpar -flto -emit-llvm-bc -fcuda-is-device -fdebug-pass-manager \
// RUN: %s -o - 2>&1 | FileCheck --check-prefix=HIPSTDPAR-PRE %s
// HIPSTDPAR-PRE-NOT: Running pass: HipStdParAcceleratorCodeSelectionPass
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better to use -NEXT checks with the passes before and after it

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This still should use -next checks around where it should run


// Ensure Pass HipStdParAcceleratorCodeSelectionPass is invoked in PostLink.
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -mllvm -amdgpu-enable-hipstdpar -fcuda-is-device -fdebug-pass-manager -emit-llvm \
// RUN: %s -o - 2>&1 | FileCheck --check-prefix=HIPSTDPAR-POST %s
// HIPSTDPAR-POST: Running pass: HipStdParAcceleratorCodeSelection

#define __device__ __attribute__((device))

void foo(float *a, float b) {
*a = b;
}

__device__ void bar(float *a, float b) {
*a = b;
}
20 changes: 13 additions & 7 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -805,17 +805,17 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
#define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
#include "llvm/Passes/TargetPassRegistry.inc"

PB.registerPipelineStartEPCallback(
[](ModulePassManager &PM, OptimizationLevel Level) {
if (EnableHipStdPar)
PM.addPass(HipStdParAcceleratorCodeSelectionPass());
});

PB.registerPipelineEarlySimplificationEPCallback(
[](ModulePassManager &PM, OptimizationLevel Level,
ThinOrFullLTOPhase Phase) {
if (!isLTOPreLink(Phase))
if (!isLTOPreLink(Phase)) {
// When we are not using -fgpu-rdc, we can run accelerator code
// selection relatively early, but still after linking to prevent
// eager removal of potentially reachable symbols.
if (EnableHipStdPar)
PM.addPass(HipStdParAcceleratorCodeSelectionPass());
PM.addPass(AMDGPUPrintfRuntimeBindingPass());
}

if (Level == OptimizationLevel::O0)
return;
Expand Down Expand Up @@ -886,6 +886,12 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {

PB.registerFullLinkTimeOptimizationLastEPCallback(
[this](ModulePassManager &PM, OptimizationLevel Level) {
// When we are using -fgpu-rdc, we can only run accelerator code
// selection after linking to prevent, otherwise we end up removing
// potentially reachable symbols that were exported as external in other
// modules.
if (EnableHipStdPar)
PM.addPass(HipStdParAcceleratorCodeSelectionPass());
// We want to support the -lto-partitions=N option as "best effort".
// For that, we need to lower LDS earlier in the pipeline before the
// module is partitioned for codegen.
Expand Down
Loading