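Rename PromoteVECREDUCE to PromoteFloatVECREDUCE. Set the FP_ROUND TRUNC flag when safe (i.e. when rounding the promoted result cannot change its value) and add associated tests.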

paulwalker-arm · paulwalker-arm · commit 9dc5f72f2e12 · 2025-06-12T14:51:59.000Z
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -188,7 +188,12 @@ class VectorLegalizer {
   void PromoteSETCC(SDNode *Node, SmallVectorImpl<SDValue> &Results);
 
   void PromoteSTRICT(SDNode *Node, SmallVectorImpl<SDValue> &Results);
-  void PromoteVECREDUCE(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+
+  /// Calculate the reduction using a type of higher precision and round the
+  /// result to match the original type. Setting NonArithmetic signifies that
+  /// rounding the result does not change its value.
+  void PromoteFloatVECREDUCE(SDNode *Node, SmallVectorImpl<SDValue> &Results,
+                             bool NonArithmetic);
 
 public:
   VectorLegalizer(SelectionDAG& dag) :
@@ -683,8 +688,9 @@ void VectorLegalizer::PromoteSTRICT(SDNode *Node,
   Results.push_back(Round.getValue(1));
 }
 
-void VectorLegalizer::PromoteVECREDUCE(SDNode *Node,
-                                       SmallVectorImpl<SDValue> &Results) {
+void VectorLegalizer::PromoteFloatVECREDUCE(SDNode *Node,
+                                            SmallVectorImpl<SDValue> &Results,
+                                            bool NonArithmetic) {
   MVT OpVT = Node->getOperand(0).getSimpleValueType();
   assert(OpVT.isFloatingPoint() && "Expected floating point reduction!");
   MVT NewOpVT = TLI.getTypeToPromoteTo(Node->getOpcode(), OpVT);
@@ -694,8 +700,9 @@ void VectorLegalizer::PromoteVECREDUCE(SDNode *Node,
   SDValue Rdx =
       DAG.getNode(Node->getOpcode(), DL, NewOpVT.getVectorElementType(), NewOp,
                   Node->getFlags());
-  SDValue Res = DAG.getNode(ISD::FP_ROUND, DL, Node->getValueType(0), Rdx,
-                            DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
+  SDValue Res =
+      DAG.getNode(ISD::FP_ROUND, DL, Node->getValueType(0), Rdx,
+                  DAG.getIntPtrConstant(NonArithmetic, DL, /*isTarget=*/true));
   Results.push_back(Res);
 }
 
@@ -731,11 +738,13 @@ void VectorLegalizer::Promote(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
     PromoteSTRICT(Node, Results);
     return;
   case ISD::VECREDUCE_FADD:
+    PromoteFloatVECREDUCE(Node, Results, /*NonArithmetic=*/false);
+    return;
   case ISD::VECREDUCE_FMAX:
   case ISD::VECREDUCE_FMAXIMUM:
   case ISD::VECREDUCE_FMIN:
   case ISD::VECREDUCE_FMINIMUM:
-    PromoteVECREDUCE(Node, Results);
+    PromoteFloatVECREDUCE(Node, Results, /*NonArithmetic=*/true);
     return;
   case ISD::FP_ROUND:
   case ISD::FP_EXTEND:
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -11432,7 +11432,7 @@ SDValue TargetLowering::expandVecReduce(SDNode *Node, SelectionDAG &DAG) const {
   }
 
   if (VT.isScalableVector())
-    report_fatal_error(
+    reportFatalInternalError(
         "Expanding reductions for scalable vectors is undefined.");
 
   EVT EltVT = VT.getVectorElementType();
diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-reductions.ll b/llvm/test/CodeGen/AArch64/sve-bf16-reductions.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mattr=+sve,+bf16            < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s
+; RUN: llc -mattr=+sve,+bf16            < %s | FileCheck %s -check-prefixes=CHECK,SVE
+; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s -check-prefixes=CHECK,SME
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -214,6 +214,50 @@ define bfloat @fminimumv_nxv8bf16(<vscale x 8 x bfloat> %a) {
   ret bfloat %res
 }
 
+; The reduction is performed at a higher precision. Because add operations
+; can utilise that precision, the result must still be rounded to bfloat even
+; if it is immediately extended again.
+define float @promoted_fadd(<vscale x 4 x bfloat> %a) {
+; SVE-LABEL: promoted_fadd:
+; SVE:       // %bb.0:
+; SVE-NEXT:    lsl z0.s, z0.s, #16
+; SVE-NEXT:    ptrue p0.s
+; SVE-NEXT:    faddv s0, p0, z0.s
+; SVE-NEXT:    bfcvt h0, s0
+; SVE-NEXT:    shll v0.4s, v0.4h, #16
+; SVE-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; SVE-NEXT:    ret
+;
+; SME-LABEL: promoted_fadd:
+; SME:       // %bb.0:
+; SME-NEXT:    lsl z0.s, z0.s, #16
+; SME-NEXT:    ptrue p0.s
+; SME-NEXT:    faddv s0, p0, z0.s
+; SME-NEXT:    bfcvt h0, s0
+; SME-NEXT:    fmov w8, s0
+; SME-NEXT:    lsl w8, w8, #16
+; SME-NEXT:    fmov s0, w8
+; SME-NEXT:    ret
+  %rdx = call fast bfloat @llvm.vector.reduce.fadd.nxv4bf16(bfloat zeroinitializer, <vscale x 4 x bfloat> %a)
+  %res = fpext bfloat %rdx to float
+  ret float %res
+}
+
+; The reduction is performed at a higher precision. Because min/max operations
+; don't utilise that precision, the result can be used directly without rounding.
+define float @promoted_fmax(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: promoted_fmax:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fmaxnmv s0, p0, z0.s
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ret
+  %rdx = call bfloat @llvm.vector.reduce.fmax.nxv4bf16(<vscale x 4 x bfloat> %a)
+  %res = fpext bfloat %rdx to float
+  ret float %res
+}
+
 declare bfloat @llvm.vector.reduce.fadd.nxv2bf16(bfloat, <vscale x 2 x bfloat>)
 declare bfloat @llvm.vector.reduce.fadd.nxv4bf16(bfloat, <vscale x 4 x bfloat>)
 declare bfloat @llvm.vector.reduce.fadd.nxv8bf16(bfloat, <vscale x 8 x bfloat>)

Original file line number	Diff line number	Diff line change
`@@ -11432,7 +11432,7 @@ SDValue TargetLowering::expandVecReduce(SDNode *Node, SelectionDAG &DAG) const {`
`11432`	`11432`	`}`
`11433`	`11433`
`11434`	`11434`	`if (VT.isScalableVector())`
`11435`		`- report_fatal_error(`
	`11435`	`+ reportFatalInternalError(`
`11436`	`11436`	`"Expanding reductions for scalable vectors is undefined.");`
`11437`	`11437`
`11438`	`11438`	`EVT EltVT = VT.getVectorElementType();`