Generate fma from mad when allowed by compile options (#1438)

rjodinchr · web-flow · commit c4f91b91f195 · 2025-01-16T17:02:56.000+01:00
diff --git a/lib/Builtins.cpp b/lib/Builtins.cpp
@@ -686,6 +686,9 @@ Builtins::getExtInstEnum(const Builtins::FunctionInfo &func_info) {
     return glsl::ExtInst::ExtInstPackHalf2x16;
   case Builtins::kSpirvUnpack:
     return glsl::ExtInst::ExtInstUnpackHalf2x16;
+  case Builtins::kMad:
+    // Only floating-point kMad should be able to get here
+    return glsl::ExtInst::ExtInstFma;
   default:
     break;
   }
diff --git a/lib/LongVectorLoweringPass.cpp b/lib/LongVectorLoweringPass.cpp
@@ -235,6 +235,7 @@ Function *getBIFScalarVersion(Function &Builtin) {
   case clspv::Builtins::kLdexp:
   case clspv::Builtins::kLog:
   case clspv::Builtins::kLog2:
+  case clspv::Builtins::kMad:
   case clspv::Builtins::kMax:
   case clspv::Builtins::kMin:
   case clspv::Builtins::kMix:
diff --git a/lib/ReplaceOpenCLBuiltinPass.cpp b/lib/ReplaceOpenCLBuiltinPass.cpp
@@ -2116,6 +2116,13 @@ bool ReplaceOpenCLBuiltinPass::replaceSignbit(Function &F, bool is_vec) {
 
 bool ReplaceOpenCLBuiltinPass::replaceMul(Function &F, bool is_float,
                                           bool is_mad) {
+  // floating-point mad can be handled later in the flow if fma is allowed
+  if (is_float && is_mad &&
+      (clspv::Option::UseNativeBuiltins().count(
+           clspv::Builtins::BuiltinType::kFma) > 0 ||
+       clspv::Option::ClMadEnable() || clspv::Option::UnsafeMath())) {
+    return false;
+  }
   return replaceCallsWithValue(F, [&](CallInst *CI) -> llvm::Value * {
     // The multiply instruction to use.
     auto MulInst = is_float ? Instruction::FMul : Instruction::Mul;
diff --git a/test/LongVectorLowering/mad-float-optimization.ll b/test/LongVectorLowering/mad-float-optimization.ll
@@ -0,0 +1,24 @@
+; RUN: clspv-opt --passes=long-vector-lowering %s -o %t.ll
+; RUN: FileCheck %s < %t.ll
+
+; CHECK-COUNT-8: call spir_func float @_Z3madfff(
+
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-G1"
+target triple = "spir-unknown-unknown"
+
+define spir_kernel void @foo(ptr addrspace(1) align 32 %a) {
+entry:
+  %arrayidx = getelementptr inbounds <8 x float>, ptr addrspace(1) %a, i32 1
+  %0 = load <8 x float>, ptr addrspace(1) %arrayidx, align 32
+  %arrayidx1 = getelementptr inbounds <8 x float>, ptr addrspace(1) %a, i32 2
+  %1 = load <8 x float>, ptr addrspace(1) %arrayidx1, align 32
+  %arrayidx2 = getelementptr inbounds <8 x float>, ptr addrspace(1) %a, i32 3
+  %2 = load <8 x float>, ptr addrspace(1) %arrayidx2, align 32
+  %call = call spir_func <8 x float> @_Z3madDv8_fS_S_(<8 x float> %0, <8 x float> %1, <8 x float> %2)
+  %arrayidx3 = getelementptr inbounds <8 x float>, ptr addrspace(1) %a, i32 0
+  store <8 x float> %call, ptr addrspace(1) %arrayidx3, align 32
+  ret void
+}
+
+declare spir_func <8 x float> @_Z3madDv8_fS_S_(<8 x float>, <8 x float>, <8 x float>)
+
diff --git a/test/mad-float-optimization.cl b/test/mad-float-optimization.cl
@@ -0,0 +1,27 @@
+// RUN: clspv %target %s -o %t.unsafe.spv --cl-unsafe-math-optimizations
+// RUN: spirv-dis -o %t.unsafe.spvasm %t.unsafe.spv
+// RUN: FileCheck %s < %t.unsafe.spvasm
+// RUN: spirv-val --target-env spv1.0 %t.unsafe.spv
+
+// RUN: clspv %target %s -o %t.mad.spv --cl-mad-enable
+// RUN: spirv-dis -o %t.mad.spvasm %t.mad.spv
+// RUN: FileCheck %s < %t.mad.spvasm
+// RUN: spirv-val --target-env spv1.0 %t.mad.spv
+
+// RUN: clspv %target %s -o %t.native.spv --use-native-builtins=fma
+// RUN: spirv-dis -o %t.native.spvasm %t.native.spv
+// RUN: FileCheck %s < %t.native.spvasm
+// RUN: spirv-val --target-env spv1.0 %t.native.spv
+
+// CHECK: OpExtInst {{.*}} Fma
+
+// RUN: clspv %target %s -o %t.spv
+// RUN: spirv-dis -o %t.spvasm %t.spv
+// RUN: FileCheck %s --check-prefix=NOOPT < %t.spvasm
+// RUN: spirv-val --target-env spv1.0 %t.spv
+
+// NOOPT-NOT: OpExtInst {{.*}} Fma
+
+void kernel foo(global float* a) {
+    a[0] = mad(a[1], a[2], a[3]);
+}