From eec98b6755d0aba625c033b104f1065b98b9d3ce Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Mon, 30 Sep 2024 12:15:47 +0100 Subject: [PATCH] [SYCL][NVPTX] Enable approximate div/sqrt with -ffast-math The generation of approximate div/sqrt in the NVPTX backend is driven by the "unsafe-fp-math" function attribute, as there is currently no suitable instruction-level representation (flag, metadata, etc.) for this optimization. The problem with this function attribute is that when inlining it must be set on *both* caller/callee functions, otherwise it is wiped. Since CUDA's devicelib bytecode library has hundreds of functions with unsafe-fp-math explicitly disabled, if we inline those functions into SYCL kernels, we disable the ability for the backend to generate approximate functions, not just inside the devicelib function but across the entire kernel. This might explain why some performance reports we've received suggest that inlining certain maths functions can make things worse even when the CUDA compiler does the same thing (e.g., #14358 though this needs to be verified). For this reason, presumably, the NVPTX backend has two codegen options that override the function attribute and always generate approximate div/sqrt instructions. This patch thus explicitly sets these options when compiling SYCL for NVPTX GPUs. It does not do so for regular C/C++ or CUDA code to limit the wider impact on existing code. 
--- clang/lib/Driver/ToolChains/Cuda.cpp | 9 +++++++++ clang/test/Driver/sycl-nvptx-fast-math.cpp | 19 +++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 clang/test/Driver/sycl-nvptx-fast-math.cpp diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index a2b6db69b9284..9f15612355e6c 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -946,6 +946,15 @@ void CudaToolChain::addClangTargetOptions( if (DriverArgs.hasArg(options::OPT_fsycl_fp32_prec_sqrt)) CC1Args.push_back("-fcuda-prec-sqrt"); + + bool FastRelaxedMath = DriverArgs.hasFlag( + options::OPT_ffast_math, options::OPT_fno_fast_math, false); + bool UnsafeMathOpt = + DriverArgs.hasFlag(options::OPT_funsafe_math_optimizations, + options::OPT_fno_unsafe_math_optimizations, false); + if (FastRelaxedMath || UnsafeMathOpt) + CC1Args.append({"-mllvm", "--nvptx-prec-divf32=0", "-mllvm", + "--nvptx-prec-sqrtf32=0"}); } else { CC1Args.append( {"-fcuda-is-device", "-mllvm", "-enable-memcpyopt-without-libcalls"}); diff --git a/clang/test/Driver/sycl-nvptx-fast-math.cpp b/clang/test/Driver/sycl-nvptx-fast-math.cpp new file mode 100644 index 0000000000000..b4593a3700148 --- /dev/null +++ b/clang/test/Driver/sycl-nvptx-fast-math.cpp @@ -0,0 +1,19 @@ +// SYCL NVPTX: fast-math flags must add --nvptx-prec-{div,sqrt}f32=0. +// REQUIRES: nvptx-registered-target + +// RUN: %clang -### -nocudalib \ +// RUN: -fsycl -fsycl-targets=nvptx64-nvidia-cuda %s 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-DEFAULT %s + +// RUN: %clang -### -nocudalib \ +// RUN: -fsycl -fsycl-targets=nvptx64-nvidia-cuda -ffast-math %s 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-FAST %s + +// RUN: %clang -### -nocudalib \ +// RUN: -fsycl -fsycl-targets=nvptx64-nvidia-cuda -funsafe-math-optimizations %s 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-FAST %s + +// CHECK-FAST: "-mllvm" "--nvptx-prec-divf32=0" "-mllvm" "--nvptx-prec-sqrtf32=0" + +// CHECK-DEFAULT-NOT: "--nvptx-prec-divf32=0" +// CHECK-DEFAULT-NOT: "--nvptx-prec-sqrtf32=0"