@@ -850,6 +850,19 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   if (STI.allowFP16Math() || STI.hasBF16Math())
     setTargetDAGCombine(ISD::SETCC);
 
+  // Vector reduction operations. These may be turned into sequential, shuffle,
+  // or tree reductions depending on what instructions are available for each
+  // type.
+  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
+    MVT EltVT = VT.getVectorElementType();
+    if (EltVT == MVT::f16 || EltVT == MVT::bf16 || EltVT == MVT::f32 ||
+        EltVT == MVT::f64) {
+      setOperationAction({ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMIN,
+                          ISD::VECREDUCE_FMAXIMUM, ISD::VECREDUCE_FMINIMUM},
+                         VT, Custom);
+    }
+  }
+
   // Promote fp16 arithmetic if fp16 hardware isn't available or the
   // user passed --nvptx-no-fp16-math. The flag is useful because,
   // although sm_53+ GPUs have some sort of FP16 support in
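
Not part of the patch: a minimal standalone C++ sketch of the three
association orders the new comment refers to. std::max stands in for the
scalar min/max operators; the values are illustrative only.

    #include <algorithm>
    #include <array>
    #include <cstdio>

    int main() {
      std::array<float, 8> V = {3, 1, 4, 1, 5, 9, 2, 6};

      // Sequential: a single accumulator consumes one element at a time.
      float Seq = V[0];
      for (int I = 1; I < 8; ++I)
        Seq = std::max(Seq, V[I]);

      // Shuffle: fold the upper half onto the lower half at each step (the
      // shape DAGTypeLegalizer and ExpandReductions produce).
      std::array<float, 8> S = V;
      for (int N = 8; N > 1; N /= 2)
        for (int I = 0; I < N / 2; ++I)
          S[I] = std::max(S[I], S[I + N / 2]);

      // Tree: combine adjacent elements first, giving shorter live ranges.
      std::array<float, 8> T = V;
      for (int N = 8; N > 1; N /= 2)
        for (int I = 0; I < N / 2; ++I)
          T[I] = std::max(T[2 * I], T[2 * I + 1]);

      std::printf("seq=%g shuffle=%g tree=%g\n", Seq, S[0], T[0]);
      return 0;
    }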
@@ -1093,6 +1106,10 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(NVPTXISD::BFI)
     MAKE_CASE(NVPTXISD::PRMT)
     MAKE_CASE(NVPTXISD::FCOPYSIGN)
+    MAKE_CASE(NVPTXISD::FMAXNUM3)
+    MAKE_CASE(NVPTXISD::FMINNUM3)
+    MAKE_CASE(NVPTXISD::FMAXIMUM3)
+    MAKE_CASE(NVPTXISD::FMINIMUM3)
     MAKE_CASE(NVPTXISD::DYNAMIC_STACKALLOC)
     MAKE_CASE(NVPTXISD::STACKRESTORE)
     MAKE_CASE(NVPTXISD::STACKSAVE)
@@ -1900,6 +1917,191 @@ static SDValue getPRMT(SDValue A, SDValue B, uint64_t Selector, SDLoc DL,
   return getPRMT(A, B, DAG.getConstant(Selector, DL, MVT::i32), DL, DAG, Mode);
 }
 
+/// A generic routine for constructing a tree reduction on a vector operand.
+/// This method groups elements bottom-up, progressively building each level.
+/// Unlike the shuffle reduction used in DAGTypeLegalizer and ExpandReductions,
+/// adjacent elements are combined first, leading to shorter live ranges. This
+/// approach makes the most sense if the shuffle reduction would use the same
+/// amount of registers.
+///
+/// The flags on the original reduction operation will be propagated to
+/// each scalar operation.
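+///
+/// For example, reducing 7 elements with Ops = {(fmax3, 3), (fmax, 2)}
+/// builds:
+///   level 1: a = fmax3(e0, e1, e2), b = fmax3(e3, e4, e5); e6 carries over
+///   level 2: fmax3(a, b, e6)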
+static SDValue BuildTreeReduction(
+    const SmallVector<SDValue> &Elements, EVT EltTy,
+    ArrayRef<std::pair<unsigned /*NodeType*/, unsigned /*NumInputs*/>> Ops,
+    const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG) {
+  // Build the reduction tree at each level, starting with all the elements.
+  SmallVector<SDValue> Level = Elements;
+
+  unsigned OpIdx = 0;
+  while (Level.size() > 1) {
+    // Try to reduce this level using the current operator.
+    const auto [DefaultScalarOp, DefaultGroupSize] = Ops[OpIdx];
+
+    // Build the next level by partially reducing all elements.
+    SmallVector<SDValue> ReducedLevel;
+    unsigned I = 0, E = Level.size();
+    for (; I + DefaultGroupSize <= E; I += DefaultGroupSize) {
+      // Reduce elements in groups of [DefaultGroupSize], as much as possible.
+      ReducedLevel.push_back(DAG.getNode(
+          DefaultScalarOp, DL, EltTy,
+          ArrayRef<SDValue>(Level).slice(I, DefaultGroupSize), Flags));
+    }
+
+    if (I < E) {
+      // Handle leftover elements.
+
+      if (ReducedLevel.empty()) {
+        // We didn't reduce anything at this level. We need to pick a smaller
+        // operator.
+        ++OpIdx;
+        assert(OpIdx < Ops.size() && "no smaller operators for reduction");
+        continue;
+      }
+
+      // We reduced some things but there's still more left, meaning the
+      // operator's number of inputs doesn't evenly divide this level size.
+      // Move these elements to the next level.
+      for (; I < E; ++I)
+        ReducedLevel.push_back(Level[I]);
+    }
+
+    // Process the next level.
+    Level = ReducedLevel;
+  }
+
+  return *Level.begin();
+}
+
+/// Lower reductions to either a sequence of operations or a tree if
+/// reassociations are allowed. This method will use larger operations like
+/// max3/min3 when the target supports them.
+SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  const SDNodeFlags Flags = Op->getFlags();
+  SDValue Vector = Op.getOperand(0);
+  SDValue Accumulator;
+
+  EVT EltTy = Vector.getValueType().getVectorElementType();
+  const bool CanUseMinMax3 = EltTy == MVT::f32 && STI.getSmVersion() >= 100 &&
+                             STI.getPTXVersion() >= 88;
+
+  // A list of SDNode opcodes with equivalent semantics, sorted descending by
+  // number of inputs they take.
+  SmallVector<std::pair<unsigned /*Op*/, unsigned /*NumIn*/>, 2> ScalarOps;
+
+  // Whether we can lower to scalar operations in an arbitrary order.
+  bool IsAssociative = allowUnsafeFPMath(DAG.getMachineFunction());
+
+  // Whether the data type and operation can be represented with fewer ops and
+  // registers in a shuffle reduction.
+  bool PrefersShuffle;
+
+  switch (Op->getOpcode()) {
+  case ISD::VECREDUCE_FMAX:
+    if (CanUseMinMax3) {
+      ScalarOps.push_back({NVPTXISD::FMAXNUM3, 3});
+      // Can't use fmaxnum3 in shuffle reduction
+      PrefersShuffle = false;
+    } else {
+      // Prefer max.{,b}f16x2 for v2{,b}f16
+      PrefersShuffle = EltTy == MVT::f16 || EltTy == MVT::bf16;
+    }
+    ScalarOps.push_back({ISD::FMAXNUM, 2});
+    // Definition of maxNum in IEEE 754 2008 is non-associative due to handling
+    // of sNaN inputs.
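+    // e.g. maxNum(maxNum(sNaN, 1.0), 2.0) is 2.0, while
+    // maxNum(sNaN, maxNum(1.0, 2.0)) is NaN.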
+    IsAssociative = Flags.hasNoNaNs();
+    break;
+  case ISD::VECREDUCE_FMIN:
+    if (CanUseMinMax3) {
+      ScalarOps.push_back({NVPTXISD::FMINNUM3, 3});
+      // Can't use fminnum3 in shuffle reduction
+      PrefersShuffle = false;
+    } else {
+      // Prefer min.{,b}f16x2 for v2{,b}f16
+      PrefersShuffle = EltTy == MVT::f16 || EltTy == MVT::bf16;
+    }
+    ScalarOps.push_back({ISD::FMINNUM, 2});
+    // Definition of minNum in IEEE 754 2008 is non-associative due to handling
+    // of sNaN inputs.
+    IsAssociative = Flags.hasNoNaNs();
+    break;
+  case ISD::VECREDUCE_FMAXIMUM:
+    if (CanUseMinMax3) {
+      ScalarOps.push_back({NVPTXISD::FMAXIMUM3, 3});
+      // Can't use fmax3 in shuffle reduction
+      PrefersShuffle = false;
+    } else {
+      // Prefer max.{,b}f16x2 for v2{,b}f16
+      PrefersShuffle = EltTy == MVT::f16 || EltTy == MVT::bf16;
+    }
+    ScalarOps.push_back({ISD::FMAXIMUM, 2});
+    IsAssociative = true;
+    break;
+  case ISD::VECREDUCE_FMINIMUM:
+    if (CanUseMinMax3) {
+      ScalarOps.push_back({NVPTXISD::FMINIMUM3, 3});
+      // Can't use fmin3 in shuffle reduction
+      PrefersShuffle = false;
+    } else {
+      // Prefer min.{,b}f16x2 for v2{,b}f16
+      PrefersShuffle = EltTy == MVT::f16 || EltTy == MVT::bf16;
+    }
+    ScalarOps.push_back({ISD::FMINIMUM, 2});
+    IsAssociative = true;
+    break;
+  default:
+    llvm_unreachable("unhandled vecreduce operation");
+  }
+
+  // We don't expect an accumulator for reassociative vector reduction ops.
+  assert((!IsAssociative || !Accumulator) && "unexpected accumulator");
+
+  // If shuffle reduction is preferred, leave it to SelectionDAG.
+  if (IsAssociative && PrefersShuffle)
+    return SDValue();
+
+  // Otherwise, handle the reduction here.
+  SmallVector<SDValue> Elements;
+  DAG.ExtractVectorElements(Vector, Elements);
+
+  // Lower to tree reduction.
+  if (IsAssociative)
+    return BuildTreeReduction(Elements, EltTy, ScalarOps, DL, Flags, DAG);
+
+  // Lower to sequential reduction.
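+  // For example, a 6 x f32 reduction with {fmax3, fmax} available becomes:
+  //   A = fmax3(e0, e1, e2); A = fmax3(A, e3, e4); A = fmax(A, e5).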
+  EVT VectorTy = Vector.getValueType();
+  const unsigned NumElts = VectorTy.getVectorNumElements();
+  for (unsigned OpIdx = 0, I = 0; I < NumElts; ++OpIdx) {
+    // Try to reduce the remaining sequence as much as possible using the
+    // current operator.
+    assert(OpIdx < ScalarOps.size() && "no smaller operators for reduction");
+    const auto [DefaultScalarOp, DefaultGroupSize] = ScalarOps[OpIdx];
+
+    if (!Accumulator) {
+      // Try to initialize the accumulator using the current operator.
+      if (I + DefaultGroupSize <= NumElts) {
+        Accumulator =
+            DAG.getNode(DefaultScalarOp, DL, EltTy,
+                        ArrayRef(Elements).slice(I, DefaultGroupSize), Flags);
+        I += DefaultGroupSize;
+      }
+    }
+
+    if (Accumulator) {
+      for (; I + (DefaultGroupSize - 1) <= NumElts; I += DefaultGroupSize - 1) {
+        SmallVector<SDValue> Operands = {Accumulator};
+        for (unsigned K = 0; K < DefaultGroupSize - 1; ++K)
+          Operands.push_back(Elements[I + K]);
+        Accumulator = DAG.getNode(DefaultScalarOp, DL, EltTy, Operands, Flags);
+      }
+    }
+  }
+
+  return Accumulator;
+}
+
 SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
   // Handle bitcasting from v2i8 without hitting the default promotion
   // strategy which goes through stack memory.
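
Not part of the patch: a host-side C++ analogue of BuildTreeReduction's level
loop above, folding plain floats instead of SDValues, to show the fallback
from a 3-input operator to a 2-input one when a level becomes too small. The
op table mirrors the ScalarOps list built in LowerVECREDUCE; all names here
are illustrative.

    #include <algorithm>
    #include <cassert>
    #include <cstdio>
    #include <utility>
    #include <vector>

    // Stand-in for emitting one fmax/fmax3 node over a group of inputs.
    static float Fold(std::vector<float> Group) {
      float R = Group[0];
      for (size_t I = 1; I < Group.size(); ++I)
        R = std::max(R, Group[I]);
      return R;
    }

    static float TreeReduce(std::vector<float> Level,
                            std::vector<std::pair<unsigned, unsigned>> Ops) {
      unsigned OpIdx = 0;
      while (Level.size() > 1) {
        const auto [Op, GroupSize] = Ops[OpIdx];
        (void)Op; // the real lowering selects the opcode here
        std::vector<float> Reduced;
        size_t I = 0, E = Level.size();
        for (; I + GroupSize <= E; I += GroupSize)
          Reduced.push_back(Fold(std::vector<float>(
              Level.begin() + I, Level.begin() + I + GroupSize)));
        if (I < E) {
          if (Reduced.empty()) { // level too small: pick a smaller operator
            ++OpIdx;
            assert(OpIdx < Ops.size() && "no smaller operators for reduction");
            continue;
          }
          for (; I < E; ++I) // carry leftover elements to the next level
            Reduced.push_back(Level[I]);
        }
        Level = std::move(Reduced);
      }
      return Level.front();
    }

    int main() {
      // 7 elements: level 1 applies the 3-input op twice and carries e6;
      // level 2 folds the remaining 3 values with one more 3-input op.
      std::vector<float> V = {3, 1, 4, 1, 5, 9, 2};
      std::printf("%g\n", TreeReduce(V, {{/*fmax3*/ 0, 3}, {/*fmax*/ 1, 2}}));
      return 0;
    }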
@@ -2779,6 +2981,11 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return LowerVECTOR_SHUFFLE(Op, DAG);
   case ISD::CONCAT_VECTORS:
     return LowerCONCAT_VECTORS(Op, DAG);
+  case ISD::VECREDUCE_FMAX:
+  case ISD::VECREDUCE_FMIN:
+  case ISD::VECREDUCE_FMAXIMUM:
+  case ISD::VECREDUCE_FMINIMUM:
+    return LowerVECREDUCE(Op, DAG);
   case ISD::STORE:
     return LowerSTORE(Op, DAG);
   case ISD::LOAD:
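
Not part of the patch: a sketch of how the reduction intrinsics that reach
these new cases can be produced with LLVM's IRBuilder. The function name
"reduce_fmax" and module name are illustrative. CreateFPMaxReduce emits a
call to llvm.vector.reduce.fmax, which SelectionDAG turns into
ISD::VECREDUCE_FMAX.

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    int main() {
      LLVMContext Ctx;
      Module M("reduce-demo", Ctx);

      // float @reduce_fmax(<4 x float> %v)
      auto *VecTy = FixedVectorType::get(Type::getFloatTy(Ctx), 4);
      auto *FnTy = FunctionType::get(Type::getFloatTy(Ctx), {VecTy}, false);
      Function *F =
          Function::Create(FnTy, Function::ExternalLinkage, "reduce_fmax", M);

      IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));
      // Emits a call to llvm.vector.reduce.fmax.v4f32.
      Value *R = B.CreateFPMaxReduce(F->getArg(0));
      B.CreateRet(R);

      M.print(outs(), nullptr);
      return 0;
    }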