Skip to content

Commit e8e7ae8

Browse files
authored
[SYCL] Enable FTZ operations for CUDA/PTX backend via -fcuda-flush-denormals-to-zero (#6411)
Gives a small performance benefit for GROMACS on NVIDIA targets.
1 parent b05f256 commit e8e7ae8

File tree

4 files changed

+32
-3
lines changed

4 files changed

+32
-3
lines changed

clang/lib/CodeGen/CodeGenModule.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -840,13 +840,15 @@ void CodeGenModule::Release() {
840840
llvm::MDString::get(Ctx, CodeGenOpts.MemoryProfileOutput));
841841
}
842842

843-
if ((LangOpts.CUDAIsDevice || LangOpts.isSYCL()) && getTriple().isNVPTX()) {
843+
if ((LangOpts.CUDAIsDevice || LangOpts.SYCLIsDevice) && getTriple().isNVPTX()) {
844844
// Indicate whether __nvvm_reflect should be configured to flush denormal
845845
// floating point values to 0. (This corresponds to its "__CUDA_FTZ"
846846
// property.)
847847
getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz",
848-
CodeGenOpts.FP32DenormalMode.Output !=
849-
llvm::DenormalMode::IEEE);
848+
(CodeGenOpts.FP32DenormalMode.Output !=
849+
llvm::DenormalMode::IEEE) ||
850+
(CodeGenOpts.FPDenormalMode.Output !=
851+
llvm::DenormalMode::IEEE));
850852
getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-prec-sqrt",
851853
getTarget().getTargetOpts().NVVMCudaPrecSqrt);
852854
}

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3014,6 +3014,11 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
30143014
RoundingMathPresent = false;
30153015
break;
30163016

3017+
case options::OPT_fcuda_flush_denormals_to_zero:
3018+
case options::OPT_fgpu_flush_denormals_to_zero:
3019+
DenormalFP32Math = llvm::DenormalMode::getPreserveSign();
3020+
break;
3021+
30173022
case options::OPT_fdenormal_fp_math_EQ:
30183023
DenormalFPMath = llvm::parseDenormalFPAttribute(A->getValue());
30193024
DenormalFP32Math = DenormalFPMath;
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
// RUN: %clang_cc1 -fcuda-is-device -fdenormal-fp-math-f32=preserve-sign \
2+
// RUN: -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \
3+
// RUN: FileCheck -check-prefixes=FTZ32,PTXFTZ32 %s
4+
5+
// RUN: %clang_cc1 -fcuda-is-device -fdenormal-fp-math=preserve-sign \
6+
// RUN: -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \
7+
// RUN: FileCheck -check-prefixes=FTZ,PTXFTZ %s
8+
9+
// CHECK-LABEL: define void @_Z3foov() #0
10+
// Intentionally empty: provides a single function definition (mangled as
// _Z3foov) whose attribute group #0 is inspected by the attribute checks below.
void foo() {}
11+
12+
// FTZ32: attributes #0 = {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign"
13+
// PTXFTZ32:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}}
14+
// PTXFTZ32:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 1}
15+
16+
// FTZ: attributes #0 = {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign"
17+
// PTXFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}}
18+
// PTXFTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 1}

clang/test/Driver/ftz-cuda.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
// RUN: %clang -### -fcuda-flush-denormals-to-zero -c %s 2>&1 \
2+
// RUN: | FileCheck --check-prefix=CHECK-FTZ %s
3+
// CHECK-FTZ: "-cc1"
4+
// CHECK-FTZ: "-fdenormal-fp-math-f32=preserve-sign,preserve-sign"

0 commit comments

Comments
 (0)