diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h index e39a73bdb13ac..1ca4360d67820 100644 --- a/clang/include/clang/Basic/CodeGenOptions.h +++ b/clang/include/clang/Basic/CodeGenOptions.h @@ -317,6 +317,10 @@ class CodeGenOptions : public CodeGenOptionsBase { /// CUDA runtime back-end for incorporating them into host-side object file. std::string CudaGpuBinaryFileName; + /// Whether a precise or approximate square root should be used for CUDA + /// device code. + bool CudaPreciseSqrt; + /// List of filenames passed in using the -fembed-offload-object option. These /// are offloading binaries containing device images and metadata. std::vector OffloadObjects; diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index e69b804de63b5..88ec378222840 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1279,6 +1279,11 @@ def fcuda_flush_denormals_to_zero : Flag<["-"], "fcuda-flush-denormals-to-zero"> Alias; def fno_cuda_flush_denormals_to_zero : Flag<["-"], "fno-cuda-flush-denormals-to-zero">, Alias; +defm cuda_prec_sqrt : BoolFOption<"cuda-prec-sqrt", + CodeGenOpts<"CudaPreciseSqrt">, DefaultFalse, + PosFlag, + NegFlag, + BothFlags<[], [ClangOption], " precise square root for CUDA device code.">>; def : Flag<["-"], "fcuda-rdc">, Alias; def : Flag<["-"], "fno-cuda-rdc">, Alias; defm cuda_short_ptr : BoolFOption<"cuda-short-ptr", diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 8f9cf965af2b9..7f99a951ab97f 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -1286,6 +1286,10 @@ void CodeGenModule::Release() { } if (LangOpts.CUDAIsDevice && getTriple().isNVPTX()) { + // Indicate whether __nvvm_reflect should be configured to use precise + // square root. (This corresponds to its "__CUDA_PREC_SQRT" property.) + getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-prec-sqrt", + CodeGenOpts.CudaPreciseSqrt); // Indicate whether __nvvm_reflect should be configured to flush denormal // floating point values to 0. (This corresponds to its "__CUDA_FTZ" // property.) diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index 06b0b0913d24e..00048e9217518 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -19,6 +19,7 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/Config/llvm-config.h" // for LLVM_HOST_TRIPLE #include "llvm/Option/ArgList.h" +#include "llvm/Option/Option.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/FormatAdapters.h" #include "llvm/Support/FormatVariadic.h" @@ -862,6 +863,10 @@ void CudaToolChain::addClangTargetOptions( if (CudaInstallation.version() >= CudaVersion::CUDA_90) CC1Args.push_back("-fcuda-allow-variadic-functions"); + if (DriverArgs.hasFlag(options::OPT_fcuda_prec_sqrt, + options::OPT_fno_cuda_prec_sqrt, false)) + CC1Args.append({"-fcuda-prec-sqrt"}); + if (DriverArgs.hasFlag(options::OPT_fcuda_short_ptr, options::OPT_fno_cuda_short_ptr, false)) CC1Args.append({"-mllvm", "--nvptx-short-ptr"}); diff --git a/clang/test/CodeGenCUDA/prec-sqrt.cu b/clang/test/CodeGenCUDA/prec-sqrt.cu new file mode 100644 index 0000000000000..88c7692e8bb0a --- /dev/null +++ b/clang/test/CodeGenCUDA/prec-sqrt.cu @@ -0,0 +1,15 @@ +// RUN: %clang_cc1 -fcuda-is-device \ +// RUN: -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \ +// RUN: FileCheck -check-prefixes=NO-PREC-SQRT %s + +// RUN: %clang_cc1 -fcuda-is-device -fcuda-prec-sqrt \ +// RUN: -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \ +// RUN: FileCheck -check-prefixes=PREC-SQRT %s + +#include "Inputs/cuda.h" + +extern "C" __device__ void foo() {} + + +// NO-PREC-SQRT: !{i32 4, !"nvvm-reflect-prec-sqrt", i32 0} +// PREC-SQRT: !{i32 4, !"nvvm-reflect-prec-sqrt", i32 1} diff --git a/clang/test/Driver/cuda-prec-sqrt.cu b/clang/test/Driver/cuda-prec-sqrt.cu new file mode 100644 index 0000000000000..563c41b75d49a --- /dev/null +++ b/clang/test/Driver/cuda-prec-sqrt.cu @@ -0,0 +1,6 @@ +// Checks that the -fcuda-prec-sqrt flag is passed to the cc1 frontend. + +// RUN: %clang -### --target=x86_64-linux-gnu -c -fcuda-prec-sqrt -nocudainc -nocudalib --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 | FileCheck %s + +// CHECK: "-triple" "nvptx64-nvidia-cuda" +// CHECK-SAME: "-fcuda-prec-sqrt" diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp index 20b8bef1899b4..593c98ea036c5 100644 --- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp +++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp @@ -173,6 +173,12 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { if (auto *Flag = mdconst::extract_or_null( F.getParent()->getModuleFlag("nvvm-reflect-ftz"))) ReflectVal = Flag->getSExtValue(); + } else if (ReflectArg == "__CUDA_PREC_SQRT") { + // Try to pull __CUDA_PREC_SQRT from the nvvm-reflect-prec-sqrt module + // flag. + if (auto *Flag = mdconst::extract_or_null( + F.getParent()->getModuleFlag("nvvm-reflect-prec-sqrt"))) + ReflectVal = Flag->getSExtValue(); } else if (ReflectArg == "__CUDA_ARCH") { ReflectVal = SmVersion * 10; } diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-sqrt.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-sqrt.ll new file mode 100644 index 0000000000000..5b584547f836b --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-sqrt.ll @@ -0,0 +1,28 @@ +; We run nvvm-reflect (and then optimize) this module twice, once with metadata +; that enables precise sqrt, and again with metadata that disables it. + +; RUN: cat %s > %t.noprec +; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-prec-sqrt", i32 0}' >> %t.noprec +; RUN: opt %t.noprec -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect' \ +; RUN: | FileCheck %s --check-prefix=PREC_SQRT_0 --check-prefix=CHECK + +; RUN: cat %s > %t.prec +; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-prec-sqrt", i32 1}' >> %t.prec +; RUN: opt %t.prec -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect' \ +; RUN: | FileCheck %s --check-prefix=PREC_SQRT_1 --check-prefix=CHECK + +@.str = private unnamed_addr constant [17 x i8] c"__CUDA_PREC_SQRT\00", align 1 + +declare i32 @__nvvm_reflect(ptr) + +; CHECK-LABEL: @foo +define i32 @foo() { + ; CHECK-NOT: call i32 @__nvvm_reflect + %reflect = call i32 @__nvvm_reflect(ptr @.str) + ; PREC_SQRT_0: ret i32 0 + ; PREC_SQRT_1: ret i32 1 + ret i32 %reflect +} + +!llvm.module.flags = !{!0} +; A module flag is added to the end of this file by the RUN lines at the top.