llvm · jhuber6 · Jan 7, 2025 · Jan 6, 2025 · Jan 7, 2025 · Jan 7, 2025
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -35,9 +35,10 @@ void NVPTXSubtarget::anchor() {}
 NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU,
                                                                 StringRef FS) {
     // Provide the default CPU if we don't have one.
-    TargetName = std::string(CPU.empty() ? "sm_30" : CPU);
+    TargetName = std::string(CPU);
 
-    ParseSubtargetFeatures(TargetName, /*TuneCPU*/ TargetName, FS);
+    ParseSubtargetFeatures(CPU.empty() ? "sm_30" : CPU,
+                           /*TuneCPU=*/CPU.empty() ? "sm_30" : CPU, FS);
 
     // Re-map SM version numbers, SmVersion carries the regular SMs which do
     // have relative order, while FullSmVersion allows distinguishing sm_90 from

diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -111,7 +111,10 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   // - 0 represents base GPU model,
   // - non-zero value identifies particular architecture-accelerated variant.
   bool hasAAFeatures() const { return getFullSmVersion() % 10; }
-  std::string getTargetName() const { return TargetName; }
+  std::string getTargetName() const {
+    return TargetName.empty() ? "sm_30" : TargetName;
+  }
+  bool hasTargetName() const { return !TargetName.empty(); }
 
   // Get maximum value of required alignments among the supported data types.
   // From the PTX ISA doc, section 8.2.3:

diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -255,7 +255,10 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
   PB.registerPipelineStartEPCallback(
       [this](ModulePassManager &PM, OptimizationLevel Level) {
         FunctionPassManager FPM;
-        FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
+        // We do not want to fold out calls to nvvm.reflect early if the user
+        // has not provided a target architecture just yet.
+        if (Subtarget.hasTargetName())
+          FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
         // Note: NVVMIntrRangePass was causing numerical discrepancies at one
         // point, if issues crop up, consider disabling.
         FPM.addPass(NVVMIntrRangePass());

diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -21,6 +21,7 @@
 #include "NVPTX.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/CodeGen/CommandFlags.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
@@ -219,7 +220,12 @@ bool NVVMReflect::runOnFunction(Function &F) {
   return runNVVMReflect(F, SmVersion);
 }
 
-NVVMReflectPass::NVVMReflectPass() : NVVMReflectPass(0) {}
+NVVMReflectPass::NVVMReflectPass() {
+  // Get the CPU string from the command line if not provided.
+  StringRef SM = codegen::getMCPU();
+  if (!SM.consume_front("sm_") || SM.consumeInteger(10, SmVersion))
+    SmVersion = 0;
+}
 
 PreservedAnalyses NVVMReflectPass::run(Function &F,
                                        FunctionAnalysisManager &AM) {

diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll
@@ -1,9 +1,9 @@
 ; Libdevice in recent CUDA versions relies on __CUDA_ARCH reflecting GPU type.
 ; Verify that __nvvm_reflect() is replaced with an appropriate value.
 ;
-; RUN: opt %s -S -passes='default<O2>' -mtriple=nvptx64 -mcpu=sm_20 \
+; RUN: opt %s -S -passes='nvvm-reflect' -mtriple=nvptx64 -mcpu=sm_20 \
 ; RUN:   | FileCheck %s --check-prefixes=COMMON,SM20
-; RUN: opt %s -S -passes='default<O2>' -mtriple=nvptx64 -mcpu=sm_35 \
+; RUN: opt %s -S -passes='nvvm-reflect' -mtriple=nvptx64 -mcpu=sm_35 \
 ; RUN:   | FileCheck %s --check-prefixes=COMMON,SM35
 
 @"$str" = private addrspace(1) constant [12 x i8] c"__CUDA_ARCH\00"

diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-ocl.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-ocl.ll
@@ -1,8 +1,8 @@
 ; Verify that __nvvm_reflect_ocl() is replaced with an appropriate value
 ;
-; RUN: opt %s -S -passes='default<O2>' -mtriple=nvptx64 -mcpu=sm_20 \
+; RUN: opt %s -S -passes='nvvm-reflect' -mtriple=nvptx64 -mcpu=sm_20 \
 ; RUN:   | FileCheck %s --check-prefixes=COMMON,SM20
-; RUN: opt %s -S -passes='default<O2>' -mtriple=nvptx64 -mcpu=sm_35 \
+; RUN: opt %s -S -passes='nvvm-reflect' -mtriple=nvptx64 -mcpu=sm_35 \
 ; RUN:   | FileCheck %s --check-prefixes=COMMON,SM35
 
 @"$str" = private addrspace(4) constant [12 x i8] c"__CUDA_ARCH\00"

diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll
@@ -3,12 +3,12 @@
 
 ; RUN: cat %s > %t.noftz
 ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 0}' >> %t.noftz
-; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='default<O2>' \
+; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect,simplifycfg' \
 ; RUN:   | FileCheck %s --check-prefix=USE_FTZ_0 --check-prefix=CHECK
 
 ; RUN: cat %s > %t.ftz
 ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}' >> %t.ftz
-; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='default<O2>' \
+; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect,simplifycfg' \
 ; RUN:   | FileCheck %s --check-prefix=USE_FTZ_1 --check-prefix=CHECK
 
 @str = private unnamed_addr addrspace(4) constant [11 x i8] c"__CUDA_FTZ\00"
@@ -43,7 +43,7 @@ exit:
 
 declare i32 @llvm.nvvm.reflect(ptr)
 
-; CHECK-LABEL: define noundef i32 @intrinsic
+; CHECK-LABEL: define i32 @intrinsic
 define i32 @intrinsic() {
 ; CHECK-NOT: call i32 @llvm.nvvm.reflect
 ; USE_FTZ_0: ret i32 0

diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll
@@ -3,12 +3,12 @@
 
 ; RUN: cat %s > %t.noftz
 ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 0}' >> %t.noftz
-; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='default<O2>' \
+; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect,simplifycfg' \
 ; RUN:   | FileCheck %s --check-prefix=USE_FTZ_0 --check-prefix=CHECK
 
 ; RUN: cat %s > %t.ftz
 ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}' >> %t.ftz
-; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='default<O2>' \
+; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect,simplifycfg' \
 ; RUN:   | FileCheck %s --check-prefix=USE_FTZ_1 --check-prefix=CHECK
 
 @str = private unnamed_addr addrspace(4) constant [11 x i8] c"__CUDA_FTZ\00"
@@ -43,7 +43,8 @@ exit:
 
 declare i32 @llvm.nvvm.reflect(ptr)
 
-; CHECK-LABEL: define noundef i32 @intrinsic
+; CHECK-LABEL: define i32 @intrinsic
+
 define i32 @intrinsic() {
 ; CHECK-NOT: call i32 @llvm.nvvm.reflect
 ; USE_FTZ_0: ret i32 0