@@ -15841,11 +15841,27 @@ static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
1584115841      return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
1584215842    }
1584315843
15844-     // Vectors that are less than 64 bits get widened to neatly fit a 64 bit
15845-     // register, so e.g. <4 x i1> gets lowered to <4 x i16>. Sign extending to
15844+     // Results of setcc operations get widened to 128 bits for xor reduce if
15845+     // their input operands are 128 bits wide, otherwise vectors that are less
15846+     // than 64 bits get widened to neatly fit a 64 bit register, so e.g.
15847+     // <4 x i1> gets lowered to either <4 x i16> or <4 x i32>. Sign extending to
1584615848    // this element size leads to the best codegen, since e.g. setcc results
1584715849    // might need to be truncated otherwise.
15848-     EVT ExtendedVT = MVT::getIntegerVT(std::max(64u / NumElems, 8u));
15850+     unsigned ExtendedWidth = 64;
15851+     if (ScalarOpcode == ISD::XOR && Vec.getOpcode() == ISD::SETCC &&
15852+         Vec.getOperand(0).getValueSizeInBits() >= 128) {
15853+       ExtendedWidth = 128;
15854+     }
15855+     EVT ExtendedVT = MVT::getIntegerVT(std::max(ExtendedWidth / NumElems, 8u));
15856+ 
15857+     // For reduce-and operations that are lowered to an fcmp, invert the bool
15858+     // vector first, so that "all lanes set" becomes "all lanes zero".
15859+     if (ScalarOpcode == ISD::AND && NumElems < 16) {
15860+       Vec = DAG.getNode(
15861+           ISD::XOR, DL, VecVT, Vec,
15862+           DAG.getSplatVector(
15863+               VecVT, DL, DAG.getConstant(APInt::getAllOnes(32), DL, MVT::i32)));
15864+     }
1584915865
1585015866    // any_ext doesn't work with umin/umax, so only use it for uadd.
1585115867    unsigned ExtendOp =
@@ -15854,10 +15870,36 @@ static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
1585415870        ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
1585515871    switch (ScalarOpcode) {
1585615872    case ISD::AND:
15857-       Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
15873+       if (NumElems < 16) {
15874+         // Check if all lanes of the negated bool vector value are zero by
15875+         // comparing against 0.0 with ordered and equal predicate. The only
15876+         // non-zero bit pattern that compares ordered and equal to 0.0 is -0.0,
15877+         // where only the sign bit is set. However the bool vector is
15878+         // sign-extended so that each bit in a lane is either zero or one,
15879+         // meaning that it is impossible to get the bit pattern of -0.0.
15880+         assert(Extended.getValueSizeInBits() == 64);
15881+         Extended = DAG.getBitcast(MVT::f64, Extended);
15882+         Result =
15883+             DAG.getSetCC(DL, MVT::i32, Extended,
15884+                          DAG.getConstantFP(0.0, DL, MVT::f64), ISD::SETOEQ);
15885+       } else {
15886+         Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
15887+       }
1585815888      break;
1585915889    case ISD::OR:
15860-       Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
15890+       if (NumElems < 16) {
15891+         // Check if any lane of the bool vector is set by comparing against 0.0.
15892+         // NaN bit patterns are handled by using the 'unordered or not equal'
15893+         // predicate. As in the reduce-and case, -0.0 doesn't have to be
15894+         // handled here (see explanation above).
15895+         assert(Extended.getValueSizeInBits() == 64);
15896+         Extended = DAG.getBitcast(MVT::f64, Extended);
15897+         Result =
15898+             DAG.getSetCC(DL, MVT::i32, Extended,
15899+                          DAG.getConstantFP(0.0, DL, MVT::f64), ISD::SETUNE);
15900+       } else {
15901+         Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
15902+       }
1586115903      break;
1586215904    case ISD::XOR:
1586315905      Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
0 commit comments