llvm · AlexVlx · Apr 14, 2025 · Apr 7, 2025 · Apr 7, 2025 · Apr 8, 2025
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
@@ -1115,6 +1115,10 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
   if (CodeGenOpts.LinkBitcodePostopt)
     MPM.addPass(LinkInModulesPass(BC));
 
+  if (LangOpts.HIPStdPar && !LangOpts.CUDAIsDevice &&
+      LangOpts.HIPStdParInterposeAlloc)
+    MPM.addPass(HipStdParAllocationInterpositionPass());
+
   // Add a verifier pass if requested. We don't have to do this if the action
   // requires code generation because there will already be a verifier pass in
   // the code-generation pipeline.
@@ -1178,10 +1182,6 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
     return;
   }
 
-  if (LangOpts.HIPStdPar && !LangOpts.CUDAIsDevice &&
-      LangOpts.HIPStdParInterposeAlloc)
-    MPM.addPass(HipStdParAllocationInterpositionPass());
-
   // Now that we have all of the passes ready, run them.
   {
     PrettyStackTraceString CrashInfo("Optimizer");

diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp
@@ -231,10 +231,11 @@ void HIPAMDToolChain::addClangTargetOptions(
   CC1Args.append({"-fcuda-is-device", "-fno-threadsafe-statics"});
 
   if (!DriverArgs.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
-                          false))
+                          false)) {
     CC1Args.append({"-mllvm", "-amdgpu-internalize-symbols"});
-  if (DriverArgs.hasArgNoClaim(options::OPT_hipstdpar))
-    CC1Args.append({"-mllvm", "-amdgpu-enable-hipstdpar"});
+    if (DriverArgs.hasArgNoClaim(options::OPT_hipstdpar))
+      CC1Args.append({"-mllvm", "-amdgpu-enable-hipstdpar"});
+  }
 
   StringRef MaxThreadsPerBlock =
       DriverArgs.getLastArgValue(options::OPT_gpu_max_threads_per_block_EQ);

diff --git a/clang/test/CodeGenHipStdPar/rdc-does-not-enable-hipstdpar.cpp b/clang/test/CodeGenHipStdPar/rdc-does-not-enable-hipstdpar.cpp
@@ -0,0 +1,11 @@
+// Check that if we are compiling with fgpu-rdc amdgpu-enable-hipstdpar is not
+// passed to CC1, to avoid eager, per TU, removal of potentially accessible
+// functions.
+
+// RUN: %clang -### --hipstdpar --offload-arch=gfx906 %s -nogpulib -nogpuinc \
+// RUN:   2>&1 | FileCheck -check-prefix=NORDC %s
+// NORDC: {{.*}}"-mllvm" "-amdgpu-enable-hipstdpar"
+
+// RUN: %clang -### --hipstdpar --offload-arch=gfx906 %s -nogpulib -nogpuinc -fgpu-rdc \
+// RUN:   2>&1 | FileCheck -check-prefix=RDC %s
+// RDC-NOT: {{.*}}"-mllvm" "-amdgpu-enable-hipstdpar"
-// RDC-NOT: {{.*}}"-mllvm" "-amdgpu-enable-hipstdpar"
+// RDC-NOT: -amdgpu-enable-hipstdpar
-// RDC-NOT: {{.*}}"-mllvm" "-amdgpu-enable-hipstdpar"
+// RDC-NOT: {{.*}}"-mllvm" "-amdgpu-enable-hipstdpar"
-// RDC-NOT: {{.*}}"-mllvm" "-amdgpu-enable-hipstdpar"
+// RDC-NOT: -amdgpu-enable-hipstdpar
-// RDC-NOT: {{.*}}"-mllvm" "-amdgpu-enable-hipstdpar"
+// RDC-NOT: {{.*}}"-mllvm" "-amdgpu-enable-hipstdpar"
diff --git a/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp b/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp
@@ -0,0 +1,21 @@
+// Test that the accelerator code selection pass only gets invoked after linking
+
+// Ensure Pass HipStdParAcceleratorCodeSelectionPass is not invoked in PreLink.
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -mllvm -amdgpu-enable-hipstdpar -flto -emit-llvm-bc -fcuda-is-device -fdebug-pass-manager \
+// RUN:  %s -o - 2>&1 | FileCheck --check-prefix=HIPSTDPAR-PRE %s
+// HIPSTDPAR-PRE-NOT: Running pass: HipStdParAcceleratorCodeSelectionPass
+
+// Ensure Pass HipStdParAcceleratorCodeSelectionPass is invoked in PostLink.
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -mllvm -amdgpu-enable-hipstdpar -fcuda-is-device -fdebug-pass-manager -emit-llvm \
+// RUN:  %s -o - 2>&1 | FileCheck --check-prefix=HIPSTDPAR-POST %s
+// HIPSTDPAR-POST: Running pass: HipStdParAcceleratorCodeSelection
+
+#define __device__ __attribute__((device))
+
+void foo(float *a, float b) {
+  *a = b;
+}
+
+__device__ void bar(float *a, float b) {
+  *a = b;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -805,17 +805,17 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
 #define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
 #include "llvm/Passes/TargetPassRegistry.inc"
 
-  PB.registerPipelineStartEPCallback(
-      [](ModulePassManager &PM, OptimizationLevel Level) {
-        if (EnableHipStdPar)
-          PM.addPass(HipStdParAcceleratorCodeSelectionPass());
-      });
-
   PB.registerPipelineEarlySimplificationEPCallback(
       [](ModulePassManager &PM, OptimizationLevel Level,
          ThinOrFullLTOPhase Phase) {
-        if (!isLTOPreLink(Phase))
+        if (!isLTOPreLink(Phase)) {
+          // When we are not using -fgpu-rdc, we can run accelerator code
+          // selection relatively early, but still after linking to prevent
+          // eager removal of potentially reachable symbols.
+          if (EnableHipStdPar)
+            PM.addPass(HipStdParAcceleratorCodeSelectionPass());
           PM.addPass(AMDGPUPrintfRuntimeBindingPass());
+        }
 
         if (Level == OptimizationLevel::O0)
           return;
@@ -886,6 +886,12 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
 
   PB.registerFullLinkTimeOptimizationLastEPCallback(
       [this](ModulePassManager &PM, OptimizationLevel Level) {
+        // When we are using -fgpu-rdc, we can only run accelerator code
+        // selection after linking to prevent, otherwise we end up removing
+        // potentially reachable symbols that were exported as external in other
+        // modules.
+        if (EnableHipStdPar)
+          PM.addPass(HipStdParAcceleratorCodeSelectionPass());
         // We want to support the -lto-partitions=N option as "best effort".
         // For that, we need to lower LDS earlier in the pipeline before the
         // module is partitioned for codegen.