[Clang] Add -f[no-]cuda-prec-sqrt flag

Lai-YT · Lai-YT · commit 523ad696c4b8 · 2025-04-03T20:35:20.000+08:00
NVCC provides the `-prec-sqrt` flag to control whether a precise or
approximate square root function is used. However, LLVM previously
always used the approximate version.

With this change, Clang introduces the `-f[no-]cuda-prec-sqrt` flag,
allowing users to choose between the precise and the approximate square
root. The flag defaults to off (approximate) to maintain existing behavior.
diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h
@@ -317,6 +317,10 @@ class CodeGenOptions : public CodeGenOptionsBase {
   /// CUDA runtime back-end for incorporating them into host-side object file.
   std::string CudaGpuBinaryFileName;
 
+  /// Whether CUDA device code should use a precise square root (true,
+  /// -fcuda-prec-sqrt) instead of the approximate one (false, the default).
+  bool CudaPreciseSqrt;
+
   /// List of filenames passed in using the -fembed-offload-object option. These
   /// are offloading binaries containing device images and metadata.
   std::vector<std::string> OffloadObjects;
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
@@ -1279,6 +1279,11 @@ def fcuda_flush_denormals_to_zero : Flag<["-"], "fcuda-flush-denormals-to-zero">
   Alias<fgpu_flush_denormals_to_zero>;
 def fno_cuda_flush_denormals_to_zero : Flag<["-"], "fno-cuda-flush-denormals-to-zero">,
   Alias<fno_gpu_flush_denormals_to_zero>;
+defm cuda_prec_sqrt : BoolFOption<"cuda-prec-sqrt",
+  CodeGenOpts<"CudaPreciseSqrt">, DefaultFalse,
+  PosFlag<SetTrue, [], [ClangOption, CC1Option], "Enable">,
+  NegFlag<SetFalse, [], [ClangOption], "Disable">,
+  BothFlags<[], [ClangOption], " precise square root for CUDA device code.">>;
 def : Flag<["-"], "fcuda-rdc">, Alias<fgpu_rdc>;
 def : Flag<["-"], "fno-cuda-rdc">, Alias<fno_gpu_rdc>;
 defm cuda_short_ptr : BoolFOption<"cuda-short-ptr",
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -19,6 +19,7 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Config/llvm-config.h" // for LLVM_HOST_TRIPLE
 #include "llvm/Option/ArgList.h"
+#include "llvm/Option/Option.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/FormatAdapters.h"
 #include "llvm/Support/FormatVariadic.h"
@@ -862,6 +863,10 @@ void CudaToolChain::addClangTargetOptions(
   if (CudaInstallation.version() >= CudaVersion::CUDA_90)
     CC1Args.push_back("-fcuda-allow-variadic-functions");
 
+  if (DriverArgs.hasFlag(options::OPT_fcuda_prec_sqrt,
+                         options::OPT_fno_cuda_prec_sqrt, false))
+    CC1Args.append({"-fcuda-prec-sqrt"});
+
   if (DriverArgs.hasFlag(options::OPT_fcuda_short_ptr,
                          options::OPT_fno_cuda_short_ptr, false))
     CC1Args.append({"-mllvm", "--nvptx-short-ptr"});
diff --git a/clang/test/Driver/cuda-prec-sqrt.cu b/clang/test/Driver/cuda-prec-sqrt.cu
@@ -0,0 +1,6 @@
+// Checks that the -fcuda-prec-sqrt flag is passed to the cc1 frontend.
+
+// RUN: %clang -### --target=x86_64-linux-gnu -c -fcuda-prec-sqrt -nocudainc -nocudalib --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 | FileCheck %s
+
+// CHECK: "-triple" "nvptx64-nvidia-cuda"
+// CHECK-SAME: "-fcuda-prec-sqrt"