Skip to content

Commit 1bcec03

Browse files
authored
[HIP][HIPSTDPAR][NFC] Re-order & adapt hipstdpar specific passes (#134753)
The `hipstdpar` specific passes were not ordered ideally, especially for `fgpu-rdc` compilations, which meant that we'd eagerly run accelerator code selection and remove symbols that might end up used. This change corrects that aspect by ensuring that accelerator code selection is only done after linking (this will have to be revisited in the future once the closed-world assumption no longer holds). Furthermore, we take the opportunity to move allocation interposition so that it properly gets printed when print-pipeline-passes is requested. NFC.
1 parent cd7d2c3 commit 1bcec03

File tree

5 files changed

+62
-14
lines changed

5 files changed

+62
-14
lines changed

clang/lib/CodeGen/BackendUtil.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1115,6 +1115,10 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
11151115
if (CodeGenOpts.LinkBitcodePostopt)
11161116
MPM.addPass(LinkInModulesPass(BC));
11171117

1118+
if (LangOpts.HIPStdPar && !LangOpts.CUDAIsDevice &&
1119+
LangOpts.HIPStdParInterposeAlloc)
1120+
MPM.addPass(HipStdParAllocationInterpositionPass());
1121+
11181122
// Add a verifier pass if requested. We don't have to do this if the action
11191123
// requires code generation because there will already be a verifier pass in
11201124
// the code-generation pipeline.
@@ -1178,10 +1182,6 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
11781182
return;
11791183
}
11801184

1181-
if (LangOpts.HIPStdPar && !LangOpts.CUDAIsDevice &&
1182-
LangOpts.HIPStdParInterposeAlloc)
1183-
MPM.addPass(HipStdParAllocationInterpositionPass());
1184-
11851185
// Now that we have all of the passes ready, run them.
11861186
{
11871187
PrettyStackTraceString CrashInfo("Optimizer");

clang/lib/Driver/ToolChains/HIPAMD.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -233,10 +233,11 @@ void HIPAMDToolChain::addClangTargetOptions(
233233
CC1Args.append({"-fcuda-is-device", "-fno-threadsafe-statics"});
234234

235235
if (!DriverArgs.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
236-
false))
236+
false)) {
237237
CC1Args.append({"-mllvm", "-amdgpu-internalize-symbols"});
238-
if (DriverArgs.hasArgNoClaim(options::OPT_hipstdpar))
239-
CC1Args.append({"-mllvm", "-amdgpu-enable-hipstdpar"});
238+
if (DriverArgs.hasArgNoClaim(options::OPT_hipstdpar))
239+
CC1Args.append({"-mllvm", "-amdgpu-enable-hipstdpar"});
240+
}
240241

241242
StringRef MaxThreadsPerBlock =
242243
DriverArgs.getLastArgValue(options::OPT_gpu_max_threads_per_block_EQ);
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
// Check that if we are compiling with fgpu-rdc amdgpu-enable-hipstdpar is not
2+
// passed to CC1, to avoid eager, per TU, removal of potentially accessible
3+
// functions.
4+
5+
// RUN: %clang -### --hipstdpar --offload-arch=gfx906 -nogpulib -nogpuinc %s \
6+
// RUN: --hipstdpar-path=%S/../Driver/Inputs/hipstdpar \
7+
// RUN: --hipstdpar-thrust-path=%S/../Driver/Inputs/hipstdpar/thrust \
8+
// RUN: --hipstdpar-prim-path=%S/../Driver/Inputs/hipstdpar/rocprim 2>&1 \
9+
// RUN: | FileCheck %s -check-prefix=NORDC
10+
// NORDC: {{.*}}"-mllvm" "-amdgpu-enable-hipstdpar"
11+
12+
// RUN: %clang -### --hipstdpar --offload-arch=gfx906 -nogpulib -nogpuinc %s \
13+
// RUN: -fgpu-rdc --hipstdpar-path=%S/../Driver/Inputs/hipstdpar \
14+
// RUN: --hipstdpar-thrust-path=%S/../Driver/Inputs/hipstdpar/thrust \
15+
// RUN: --hipstdpar-prim-path=%S/../Driver/Inputs/hipstdpar/rocprim 2>&1 \
16+
// RUN: | FileCheck %s -check-prefix=RDC
17+
// RDC-NOT: {{.*}}"-mllvm" "-amdgpu-enable-hipstdpar"
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
// Test that the accelerator code selection pass only gets invoked after linking
2+
3+
// Ensure Pass HipStdParAcceleratorCodeSelectionPass is not invoked in PreLink.
4+
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -mllvm -amdgpu-enable-hipstdpar -flto -emit-llvm-bc -fcuda-is-device -fdebug-pass-manager \
5+
// RUN: %s -o /dev/null 2>&1 | FileCheck --check-prefix=HIPSTDPAR-PRE %s
6+
// HIPSTDPAR-PRE: Running pass: EntryExitInstrumenterPass
7+
// HIPSTDPAR-PRE-NEXT: Running pass: EntryExitInstrumenterPass
8+
// HIPSTDPAR-PRE-NOT: Running pass: HipStdParAcceleratorCodeSelectionPass
9+
// HIPSTDPAR-PRE-NEXT: Running pass: AlwaysInlinerPass
10+
11+
// Ensure Pass HipStdParAcceleratorCodeSelectionPass is invoked in PostLink.
12+
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -mllvm -amdgpu-enable-hipstdpar -fcuda-is-device -fdebug-pass-manager -emit-llvm \
13+
// RUN: %s -o /dev/null 2>&1 | FileCheck --check-prefix=HIPSTDPAR-POST %s
14+
// HIPSTDPAR-POST: Running pass: HipStdParAcceleratorCodeSelection
15+
16+
#define __device__ __attribute__((device))
17+
18+
void foo(float *a, float b) {
19+
*a = b;
20+
}
21+
22+
__device__ void bar(float *a, float b) {
23+
*a = b;
24+
}

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -810,17 +810,17 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
810810
#define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
811811
#include "llvm/Passes/TargetPassRegistry.inc"
812812

813-
PB.registerPipelineStartEPCallback(
814-
[](ModulePassManager &PM, OptimizationLevel Level) {
815-
if (EnableHipStdPar)
816-
PM.addPass(HipStdParAcceleratorCodeSelectionPass());
817-
});
818-
819813
PB.registerPipelineEarlySimplificationEPCallback(
820814
[](ModulePassManager &PM, OptimizationLevel Level,
821815
ThinOrFullLTOPhase Phase) {
822-
if (!isLTOPreLink(Phase))
816+
if (!isLTOPreLink(Phase)) {
817+
// When we are not using -fgpu-rdc, we can run accelerator code
818+
// selection relatively early, but still after linking to prevent
819+
// eager removal of potentially reachable symbols.
820+
if (EnableHipStdPar)
821+
PM.addPass(HipStdParAcceleratorCodeSelectionPass());
823822
PM.addPass(AMDGPUPrintfRuntimeBindingPass());
823+
}
824824

825825
if (Level == OptimizationLevel::O0)
826826
return;
@@ -891,6 +891,12 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
891891

892892
PB.registerFullLinkTimeOptimizationLastEPCallback(
893893
[this](ModulePassManager &PM, OptimizationLevel Level) {
894+
// When we are using -fgpu-rdc, we can only run accelerator code
895+
// selection after linking to prevent, otherwise we end up removing
896+
// potentially reachable symbols that were exported as external in other
897+
// modules.
898+
if (EnableHipStdPar)
899+
PM.addPass(HipStdParAcceleratorCodeSelectionPass());
894900
// We want to support the -lto-partitions=N option as "best effort".
895901
// For that, we need to lower LDS earlier in the pipeline before the
896902
// module is partitioned for codegen.

0 commit comments

Comments
 (0)