[NVPTX] lower VECREDUCE max/min to 3-input on sm_100+

Prince781 · Prince781 · commit f4f7fe14a94a · 2025-07-28T19:48:58.000-07:00
Add support for 3-input fmaxnum/fminnum/fmaximum/fminimum introduced in
PTX 8.8 for sm_100+. Other improvements are:
- Fix lowering of fmaxnum/fminnum so that they are not reassociated.
  According to the IEEE 754 definition of these (maxNum/minNum), they are not
  associative due to how they handle sNaNs. A quick example:
  a = 1.0, b = 1.0, c = sNaN
  maxNum(a, maxNum(b, c)) = maxNum(a, qNaN) = a = 1.0
  maxNum(maxNum(a, b), c) = maxNum(1.0, sNaN) = qNaN
- Use a tree reduction when 3-input operations are supported and the
  reduction has the `reassoc`.
- If not on sm_100+/PTX 8.8, fallback to 2-input operations and use the
  default shuffle reduction.
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -850,6 +850,19 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   if (STI.allowFP16Math() || STI.hasBF16Math())
     setTargetDAGCombine(ISD::SETCC);
 
+  // Vector reduction operations. These may be turned into sequential, shuffle,
+  // or tree reductions depending on what instructions are available for each
+  // type.
+  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
+    MVT EltVT = VT.getVectorElementType();
+    if (EltVT == MVT::f16 || EltVT == MVT::bf16 || EltVT == MVT::f32 ||
+        EltVT == MVT::f64) {
+      setOperationAction({ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMIN,
+                          ISD::VECREDUCE_FMAXIMUM, ISD::VECREDUCE_FMINIMUM},
+                         VT, Custom);
+    }
+  }
+
   // Promote fp16 arithmetic if fp16 hardware isn't available or the
   // user passed --nvptx-no-fp16-math. The flag is useful because,
   // although sm_53+ GPUs have some sort of FP16 support in
@@ -1093,6 +1106,10 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(NVPTXISD::BFI)
     MAKE_CASE(NVPTXISD::PRMT)
     MAKE_CASE(NVPTXISD::FCOPYSIGN)
+    MAKE_CASE(NVPTXISD::FMAXNUM3)
+    MAKE_CASE(NVPTXISD::FMINNUM3)
+    MAKE_CASE(NVPTXISD::FMAXIMUM3)
+    MAKE_CASE(NVPTXISD::FMINIMUM3)
     MAKE_CASE(NVPTXISD::DYNAMIC_STACKALLOC)
     MAKE_CASE(NVPTXISD::STACKRESTORE)
     MAKE_CASE(NVPTXISD::STACKSAVE)
@@ -1900,6 +1917,191 @@ static SDValue getPRMT(SDValue A, SDValue B, uint64_t Selector, SDLoc DL,
   return getPRMT(A, B, DAG.getConstant(Selector, DL, MVT::i32), DL, DAG, Mode);
 }
 
+/// A generic routine for constructing a tree reduction on a vector operand.
+/// This method groups elements bottom-up, progressively building each level.
+/// Unlike the shuffle reduction used in DAGTypeLegalizer and ExpandReductions,
+/// adjacent elements are combined first, leading to shorter live ranges. This
+/// approach makes the most sense if the shuffle reduction would use the same
+/// amount of registers.
+///
+/// The flags on the original reduction operation will be propagated to
+/// each scalar operation.
+static SDValue BuildTreeReduction(
+    const SmallVector<SDValue> &Elements, EVT EltTy,
+    ArrayRef<std::pair<unsigned /*NodeType*/, unsigned /*NumInputs*/>> Ops,
+    const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG) {
+  // Build the reduction tree at each level, starting with all the elements.
+  SmallVector<SDValue> Level = Elements;
+
+  unsigned OpIdx = 0;
+  while (Level.size() > 1) {
+    // Try to reduce this level using the current operator.
+    const auto [DefaultScalarOp, DefaultGroupSize] = Ops[OpIdx];
+
+    // Build the next level by partially reducing all elements.
+    SmallVector<SDValue> ReducedLevel;
+    unsigned I = 0, E = Level.size();
+    for (; I + DefaultGroupSize <= E; I += DefaultGroupSize) {
+      // Reduce elements in groups of [DefaultGroupSize], as much as possible.
+      ReducedLevel.push_back(DAG.getNode(
+          DefaultScalarOp, DL, EltTy,
+          ArrayRef<SDValue>(Level).slice(I, DefaultGroupSize), Flags));
+    }
+
+    if (I < E) {
+      // Handle leftover elements.
+
+      if (ReducedLevel.empty()) {
+        // We didn't reduce anything at this level. We need to pick a smaller
+        // operator.
+        ++OpIdx;
+        assert(OpIdx < Ops.size() && "no smaller operators for reduction");
+        continue;
+      }
+
+      // We reduced some things but there's still more left, meaning the
+      // operator's number of inputs doesn't evenly divide this level size. Move
+      // these elements to the next level.
+      for (; I < E; ++I)
+        ReducedLevel.push_back(Level[I]);
+    }
+
+    // Process the next level.
+    Level = ReducedLevel;
+  }
+
+  return *Level.begin();
+}
+
+/// Lower reductions to either a sequence of operations or a tree if
+/// reassociations are allowed. This method will use larger operations like
+/// max3/min3 when the target supports them.
+SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  const SDNodeFlags Flags = Op->getFlags();
+  SDValue Vector = Op.getOperand(0);
+  SDValue Accumulator;
+
+  EVT EltTy = Vector.getValueType().getVectorElementType();
+  const bool CanUseMinMax3 = EltTy == MVT::f32 && STI.getSmVersion() >= 100 &&
+                             STI.getPTXVersion() >= 88;
+
+  // A list of SDNode opcodes with equivalent semantics, sorted descending by
+  // number of inputs they take.
+  SmallVector<std::pair<unsigned /*Op*/, unsigned /*NumIn*/>, 2> ScalarOps;
+
+  // Whether we can lower to scalar operations in an arbitrary order.
+  bool IsAssociative = allowUnsafeFPMath(DAG.getMachineFunction());
+
+  // Whether the data type and operation can be represented with fewer ops and
+  // registers in a shuffle reduction.
+  bool PrefersShuffle;
+
+  switch (Op->getOpcode()) {
+  case ISD::VECREDUCE_FMAX:
+    if (CanUseMinMax3) {
+      ScalarOps.push_back({NVPTXISD::FMAXNUM3, 3});
+      // Can't use fmaxnum3 in shuffle reduction
+      PrefersShuffle = false;
+    } else {
+      // Prefer max.{,b}f16x2 for v2{,b}f16
+      PrefersShuffle = EltTy == MVT::f16 || EltTy == MVT::bf16;
+    }
+    ScalarOps.push_back({ISD::FMAXNUM, 2});
+    // Definition of maxNum in IEEE 754 2008 is non-associative due to handling
+    // of sNaN inputs.
+    IsAssociative = Flags.hasNoNaNs();
+    break;
+  case ISD::VECREDUCE_FMIN:
+    if (CanUseMinMax3) {
+      ScalarOps.push_back({NVPTXISD::FMINNUM3, 3});
+      // Can't use fminnum3 in shuffle reduction
+      PrefersShuffle = false;
+    } else {
+      // Prefer min.{,b}f16x2 for v2{,b}f16
+      PrefersShuffle = EltTy == MVT::f16 || EltTy == MVT::bf16;
+    }
+    ScalarOps.push_back({ISD::FMINNUM, 2});
+    // Definition of minNum in IEEE 754 2008 is non-associative due to handling
+    // of sNaN inputs.
+    IsAssociative = Flags.hasNoNaNs();
+    break;
+  case ISD::VECREDUCE_FMAXIMUM:
+    if (CanUseMinMax3) {
+      ScalarOps.push_back({NVPTXISD::FMAXIMUM3, 3});
+      // Can't use fmax3 in shuffle reduction
+      PrefersShuffle = false;
+    } else {
+      // Prefer max.{,b}f16x2 for v2{,b}f16
+      PrefersShuffle = EltTy == MVT::f16 || EltTy == MVT::bf16;
+    }
+    ScalarOps.push_back({ISD::FMAXIMUM, 2});
+    IsAssociative = true;
+    break;
+  case ISD::VECREDUCE_FMINIMUM:
+    if (CanUseMinMax3) {
+      ScalarOps.push_back({NVPTXISD::FMINIMUM3, 3});
+      // Can't use fmin3 in shuffle reduction
+      PrefersShuffle = false;
+    } else {
+      // Prefer min.{,b}f16x2 for v2{,b}f16
+      PrefersShuffle = EltTy == MVT::f16 || EltTy == MVT::bf16;
+    }
+    ScalarOps.push_back({ISD::FMINIMUM, 2});
+    IsAssociative = true;
+    break;
+  default:
+    llvm_unreachable("unhandled vecreduce operation");
+  }
+
+  // We don't expect an accumulator for reassociative vector reduction ops.
+  assert((!IsAssociative || !Accumulator) && "unexpected accumulator");
+
+  // If shuffle reduction is preferred, leave it to SelectionDAG.
+  if (IsAssociative && PrefersShuffle)
+    return SDValue();
+
+  // Otherwise, handle the reduction here.
+  SmallVector<SDValue> Elements;
+  DAG.ExtractVectorElements(Vector, Elements);
+
+  // Lower to tree reduction.
+  if (IsAssociative)
+    return BuildTreeReduction(Elements, EltTy, ScalarOps, DL, Flags, DAG);
+
+  // Lower to sequential reduction.
+  EVT VectorTy = Vector.getValueType();
+  const unsigned NumElts = VectorTy.getVectorNumElements();
+  for (unsigned OpIdx = 0, I = 0; I < NumElts; ++OpIdx) {
+    // Try to reduce the remaining sequence as much as possible using the
+    // current operator.
+    assert(OpIdx < ScalarOps.size() && "no smaller operators for reduction");
+    const auto [DefaultScalarOp, DefaultGroupSize] = ScalarOps[OpIdx];
+
+    if (!Accumulator) {
+      // Try to initialize the accumulator using the current operator.
+      if (I + DefaultGroupSize <= NumElts) {
+        Accumulator = DAG.getNode(
+            DefaultScalarOp, DL, EltTy,
+            ArrayRef(Elements).slice(I, I + DefaultGroupSize), Flags);
+        I += DefaultGroupSize;
+      }
+    }
+
+    if (Accumulator) {
+      for (; I + (DefaultGroupSize - 1) <= NumElts; I += DefaultGroupSize - 1) {
+        SmallVector<SDValue> Operands = {Accumulator};
+        for (unsigned K = 0; K < DefaultGroupSize - 1; ++K)
+          Operands.push_back(Elements[I + K]);
+        Accumulator = DAG.getNode(DefaultScalarOp, DL, EltTy, Operands, Flags);
+      }
+    }
+  }
+
+  return Accumulator;
+}
+
 SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
   // Handle bitcasting from v2i8 without hitting the default promotion
   // strategy which goes through stack memory.
@@ -2779,6 +2981,11 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return LowerVECTOR_SHUFFLE(Op, DAG);
   case ISD::CONCAT_VECTORS:
     return LowerCONCAT_VECTORS(Op, DAG);
+  case ISD::VECREDUCE_FMAX:
+  case ISD::VECREDUCE_FMIN:
+  case ISD::VECREDUCE_FMAXIMUM:
+  case ISD::VECREDUCE_FMINIMUM:
+    return LowerVECREDUCE(Op, DAG);
   case ISD::STORE:
     return LowerSTORE(Op, DAG);
   case ISD::LOAD:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -64,6 +64,11 @@ enum NodeType : unsigned {
   UNPACK_VECTOR,
 
   FCOPYSIGN,
+  FMAXNUM3,
+  FMINNUM3,
+  FMAXIMUM3,
+  FMINIMUM3,
+
   DYNAMIC_STACKALLOC,
   STACKRESTORE,
   STACKSAVE,
@@ -286,6 +291,7 @@ class NVPTXTargetLowering : public TargetLowering {
 
   SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -358,6 +358,36 @@ multiclass FMINIMUMMAXIMUM<string OpcStr, bit NaN, SDNode OpNode> {
                Requires<[hasBF16Math, hasSM<80>, hasPTX<70>]>;
 }
 
+// Template for 3-input minimum/maximum instructions
+// (sm_100+/PTX 8.8 and f32 only)
+//
+// Also defines ftz (flush subnormal inputs and results to sign-preserving
+// zero) variants for fp32 functions.
+multiclass FMINIMUMMAXIMUM3<string OpcStr, bit NaN, SDNode OpNode> {
+  defvar nan_str = !if(NaN, ".NaN", "");
+   def f32rrr :
+     BasicFlagsNVPTXInst<(outs B32:$dst),
+               (ins B32:$a, B32:$b, B32:$c),
+               (ins FTZFlag:$ftz),
+               OpcStr # "$ftz" # nan_str # ".f32",
+               [(set f32:$dst, (OpNode f32:$a, f32:$b, f32:$c))]>,
+               Requires<[hasPTX<88>, hasSM<100>]>;
+   def f32rri :
+     BasicFlagsNVPTXInst<(outs B32:$dst),
+               (ins B32:$a, B32:$b, f32imm:$c),
+               (ins FTZFlag:$ftz),
+               OpcStr # "$ftz" # nan_str # ".f32",
+               [(set f32:$dst, (OpNode f32:$a, f32:$b, fpimm:$c))]>,
+               Requires<[hasPTX<88>, hasSM<100>]>;
+   def f32rii :
+     BasicFlagsNVPTXInst<(outs B32:$dst),
+               (ins B32:$a, f32imm:$b, f32imm:$c),
+               (ins FTZFlag:$ftz),
+               OpcStr # "$ftz" # nan_str # ".f32",
+               [(set f32:$dst, (OpNode f32:$a, fpimm:$b, fpimm:$c))]>,
+               Requires<[hasPTX<88>, hasSM<100>]>;
+}
+
 // Template for instructions which take three FP args.  The
 // instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
 //
@@ -1027,6 +1057,20 @@ defm MAX : FMINIMUMMAXIMUM<"max", /* NaN */ false, fmaxnum>;
 defm MIN_NAN : FMINIMUMMAXIMUM<"min", /* NaN */ true, fminimum>;
 defm MAX_NAN : FMINIMUMMAXIMUM<"max", /* NaN */ true, fmaximum>;
 
+def nvptx_fminnum3 : SDNode<"NVPTXISD::FMINNUM3", SDTFPTernaryOp,
+                            [SDNPCommutative]>;
+def nvptx_fmaxnum3 : SDNode<"NVPTXISD::FMAXNUM3", SDTFPTernaryOp,
+                             [SDNPCommutative]>;
+def nvptx_fminimum3 : SDNode<"NVPTXISD::FMINIMUM3", SDTFPTernaryOp,
+                             [SDNPCommutative]>;
+def nvptx_fmaximum3 : SDNode<"NVPTXISD::FMAXIMUM3", SDTFPTernaryOp,
+                             [SDNPCommutative]>;
+
+defm FMIN3 : FMINIMUMMAXIMUM3<"min", /* NaN */ false, nvptx_fminnum3>;
+defm FMAX3 : FMINIMUMMAXIMUM3<"max", /* NaN */ false, nvptx_fmaxnum3>;
+defm FMINNAN3 : FMINIMUMMAXIMUM3<"min", /* NaN */ true, nvptx_fminimum3>;
+defm FMAXNAN3 : FMINIMUMMAXIMUM3<"max", /* NaN */ true, nvptx_fmaximum3>;
+
 defm FABS  : F2<"abs", fabs>;
 defm FNEG  : F2<"neg", fneg>;
 defm FABS_H: F2_Support_Half<"abs", fabs>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -87,6 +87,13 @@ class NVPTXTTIImpl final : public BasicTTIImplBase<NVPTXTTIImpl> {
   }
   unsigned getMinVectorRegisterBitWidth() const override { return 32; }
 
+  bool shouldExpandReduction(const IntrinsicInst *II) const override {
+    // Turn off ExpandReductions pass for NVPTX, which doesn't have advanced
+    // swizzling operations. Our backend/Selection DAG can expand these
+    // reductions with less movs.
+    return false;
+  }
+
   // We don't want to prevent inlining because of target-cpu and -features
   // attributes that were added to newer versions of LLVM/Clang: There are
   // no incompatible functions in PTX, ptxas will throw errors in such cases.
diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll