Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions clang/lib/CodeGen/BackendUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1115,6 +1115,10 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
if (CodeGenOpts.LinkBitcodePostopt)
MPM.addPass(LinkInModulesPass(BC));

if (LangOpts.HIPStdPar && !LangOpts.CUDAIsDevice &&
LangOpts.HIPStdParInterposeAlloc)
MPM.addPass(HipStdParAllocationInterpositionPass());

// Add a verifier pass if requested. We don't have to do this if the action
// requires code generation because there will already be a verifier pass in
// the code-generation pipeline.
Expand Down Expand Up @@ -1178,10 +1182,6 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
return;
}

if (LangOpts.HIPStdPar && !LangOpts.CUDAIsDevice &&
LangOpts.HIPStdParInterposeAlloc)
MPM.addPass(HipStdParAllocationInterpositionPass());

// Now that we have all of the passes ready, run them.
{
PrettyStackTraceString CrashInfo("Optimizer");
Expand Down
7 changes: 4 additions & 3 deletions clang/lib/Driver/ToolChains/HIPAMD.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -231,10 +231,11 @@ void HIPAMDToolChain::addClangTargetOptions(
CC1Args.append({"-fcuda-is-device", "-fno-threadsafe-statics"});

if (!DriverArgs.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
false))
false)) {
CC1Args.append({"-mllvm", "-amdgpu-internalize-symbols"});
if (DriverArgs.hasArgNoClaim(options::OPT_hipstdpar))
CC1Args.append({"-mllvm", "-amdgpu-enable-hipstdpar"});
if (DriverArgs.hasArgNoClaim(options::OPT_hipstdpar))
CC1Args.append({"-mllvm", "-amdgpu-enable-hipstdpar"});
}

StringRef MaxThreadsPerBlock =
DriverArgs.getLastArgValue(options::OPT_gpu_max_threads_per_block_EQ);
Expand Down
17 changes: 17 additions & 0 deletions clang/test/CodeGenHipStdPar/rdc-does-not-enable-hipstdpar.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// Check that if we are compiling with fgpu-rdc amdgpu-enable-hipstdpar is not
// passed to CC1, to avoid eager, per TU, removal of potentially accessible
// functions.

// RUN: %clang -### --hipstdpar --offload-arch=gfx906 -nogpulib -nogpuinc %s \
// RUN: --hipstdpar-path=%S/../Driver/Inputs/hipstdpar \
// RUN: --hipstdpar-thrust-path=%S/../Driver/Inputs/hipstdpar/thrust \
// RUN: --hipstdpar-prim-path=%S/../Driver/Inputs/hipstdpar/rocprim 2>&1 \
// RUN: | FileCheck %s -check-prefix=NORDC
// NORDC: {{.*}}"-mllvm" "-amdgpu-enable-hipstdpar"

// RUN: %clang -### --hipstdpar --offload-arch=gfx906 -nogpulib -nogpuinc %s \
// RUN: -fgpu-rdc --hipstdpar-path=%S/../Driver/Inputs/hipstdpar \
// RUN: --hipstdpar-thrust-path=%S/../Driver/Inputs/hipstdpar/thrust \
// RUN: --hipstdpar-prim-path=%S/../Driver/Inputs/hipstdpar/rocprim 2>&1 \
// RUN: | FileCheck %s -check-prefix=RDC
// RDC-NOT: {{.*}}"-mllvm" "-amdgpu-enable-hipstdpar"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// RDC-NOT: {{.*}}"-mllvm" "-amdgpu-enable-hipstdpar"
// RDC-NOT: -amdgpu-enable-hipstdpar

-NOT checks are hazardous and should be as permissive as possible

Suggested change
// RDC-NOT: {{.*}}"-mllvm" "-amdgpu-enable-hipstdpar"
// RDC-NOT: {{.*}}"-mllvm" "-amdgpu-enable-hipstdpar"

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this case it actually has to be "-mllvm" because we only care about it not being passed to the initial from source, per TU compilation; forming the check as you suggest would (erroneously) match the (intentional) passing of the argument via -plugin-opt, when we do the final lowering from bitcode. This merely tests/validates the change we did in HIPAMDToolChain::addClangTargetOptions.

Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Test that the accelerator code selection pass only gets invoked after linking

// Ensure Pass HipStdParAcceleratorCodeSelectionPass is not invoked in PreLink.
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -mllvm -amdgpu-enable-hipstdpar -flto -emit-llvm-bc -fcuda-is-device -fdebug-pass-manager \
// RUN: %s -o /dev/null 2>&1 | FileCheck --check-prefix=HIPSTDPAR-PRE %s
// HIPSTDPAR-PRE: Running pass: EntryExitInstrumenterPass
// HIPSTDPAR-PRE-NEXT: Running pass: EntryExitInstrumenterPass
// HIPSTDPAR-PRE-NOT: Running pass: HipStdParAcceleratorCodeSelectionPass
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better to use -NEXT checks with the passes before and after it

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This still should use -next checks around where it should run

// HIPSTDPAR-PRE-NEXT: Running pass: AlwaysInlinerPass

// Ensure Pass HipStdParAcceleratorCodeSelectionPass is invoked in PostLink.
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -mllvm -amdgpu-enable-hipstdpar -fcuda-is-device -fdebug-pass-manager -emit-llvm \
// RUN: %s -o /dev/null 2>&1 | FileCheck --check-prefix=HIPSTDPAR-POST %s
// HIPSTDPAR-POST: Running pass: HipStdParAcceleratorCodeSelection

#define __device__ __attribute__((device))

void foo(float *a, float b) {
*a = b;
}

__device__ void bar(float *a, float b) {
*a = b;
}
20 changes: 13 additions & 7 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -809,17 +809,17 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
#define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
#include "llvm/Passes/TargetPassRegistry.inc"

PB.registerPipelineStartEPCallback(
[](ModulePassManager &PM, OptimizationLevel Level) {
if (EnableHipStdPar)
PM.addPass(HipStdParAcceleratorCodeSelectionPass());
});

PB.registerPipelineEarlySimplificationEPCallback(
[](ModulePassManager &PM, OptimizationLevel Level,
ThinOrFullLTOPhase Phase) {
if (!isLTOPreLink(Phase))
if (!isLTOPreLink(Phase)) {
// When we are not using -fgpu-rdc, we can run accelerator code
// selection relatively early, but still after linking to prevent
// eager removal of potentially reachable symbols.
if (EnableHipStdPar)
PM.addPass(HipStdParAcceleratorCodeSelectionPass());
PM.addPass(AMDGPUPrintfRuntimeBindingPass());
}

if (Level == OptimizationLevel::O0)
return;
Expand Down Expand Up @@ -890,6 +890,12 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {

PB.registerFullLinkTimeOptimizationLastEPCallback(
[this](ModulePassManager &PM, OptimizationLevel Level) {
// When we are using -fgpu-rdc, we can only run accelerator code
// selection after linking to prevent, otherwise we end up removing
// potentially reachable symbols that were exported as external in other
// modules.
if (EnableHipStdPar)
PM.addPass(HipStdParAcceleratorCodeSelectionPass());
// We want to support the -lto-partitions=N option as "best effort".
// For that, we need to lower LDS earlier in the pipeline before the
// module is partitioned for codegen.
Expand Down
Loading