From eec98b6755d0aba625c033b104f1065b98b9d3ce Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Mon, 30 Sep 2024 12:15:47 +0100
Subject: [PATCH] [SYCL][NVPTX] Enable approximate div/sqrt with -ffast-math

The generation of approximate div/sqrt in the NVPTX backend is driven by
the "unsafe-fp-math" function attribute, as there is currently no
suitable instruction-level representation (flag, metadata, etc.) for
this optimization.

The problem with this function attribute is that when inlining it must
be set on *both* caller/callee functions, otherwise it is wiped.

Since CUDA's devicelib bytecode library has hundreds of functions with
unsafe-fp-math explicitly disabled, if we inline those functions into
SYCL kernels, we disable the ability for the backend to generate
approximate functions, not just inside the devicelib function but across
the entire kernel.

This might explain why some performance reports we've received suggest
that inlining certain maths functions can make things worse even when
the CUDA compiler does the same thing (e.g., #14358, though this needs
to be verified).

For this reason, presumably, the NVPTX backend has two codegen options
that override the function attribute and always generate approximate
div/sqrt instructions. This patch thus explicitly sets these options
when compiling SYCL for NVPTX GPUs. It does not do so for regular C/C++
or CUDA code to limit the wider impact on existing code.
---
 clang/lib/Driver/ToolChains/Cuda.cpp       |  9 +++++++++
 clang/test/Driver/sycl-nvptx-fast-math.cpp | 18 ++++++++++++++++++
 2 files changed, 27 insertions(+)
 create mode 100644 clang/test/Driver/sycl-nvptx-fast-math.cpp

diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index a2b6db69b9284..9f15612355e6c 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -946,6 +946,15 @@ void CudaToolChain::addClangTargetOptions(
 
     if (DriverArgs.hasArg(options::OPT_fsycl_fp32_prec_sqrt))
       CC1Args.push_back("-fcuda-prec-sqrt");
+
+    bool FastRelaxedMath = DriverArgs.hasFlag(
+        options::OPT_ffast_math, options::OPT_fno_fast_math, false);
+    bool UnsafeMathOpt =
+        DriverArgs.hasFlag(options::OPT_funsafe_math_optimizations,
+                           options::OPT_fno_unsafe_math_optimizations, false);
+    if (FastRelaxedMath || UnsafeMathOpt)
+      CC1Args.append({"-mllvm", "--nvptx-prec-divf32=0", "-mllvm",
+                      "--nvptx-prec-sqrtf32=0"});
   } else {
     CC1Args.append(
         {"-fcuda-is-device", "-mllvm", "-enable-memcpyopt-without-libcalls"});
diff --git a/clang/test/Driver/sycl-nvptx-fast-math.cpp b/clang/test/Driver/sycl-nvptx-fast-math.cpp
new file mode 100644
index 0000000000000..b4593a3700148
--- /dev/null
+++ b/clang/test/Driver/sycl-nvptx-fast-math.cpp
@@ -0,0 +1,18 @@
+// REQUIRES: nvptx-registered-target
+
+// RUN: %clang -### -nocudalib \
+// RUN:   -fsycl -fsycl-targets=nvptx64-nvidia-cuda %s 2>&1 \
+// RUN: | FileCheck --check-prefix=CHECK-DEFAULT %s
+
+// RUN: %clang -### -nocudalib \
+// RUN:   -fsycl -fsycl-targets=nvptx64-nvidia-cuda -ffast-math %s 2>&1 \
+// RUN: | FileCheck --check-prefix=CHECK-FAST %s
+
+// RUN: %clang -### -nocudalib \
+// RUN:   -fsycl -fsycl-targets=nvptx64-nvidia-cuda -funsafe-math-optimizations %s 2>&1 \
+// RUN: | FileCheck --check-prefix=CHECK-FAST %s
+
+// CHECK-FAST: "-mllvm" "--nvptx-prec-divf32=0" "-mllvm" "--nvptx-prec-sqrtf32=0"
+
+// CHECK-DEFAULT-NOT: "--nvptx-prec-divf32=0"
+// CHECK-DEFAULT-NOT: "--nvptx-prec-sqrtf32=0"