@@ -28998,6 +28998,35 @@ static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
2899828998 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
2899928999 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
2900029000}
29001+ static SDValue LowerVectorCTLZ_GFNI(SDValue Op, SelectionDAG &DAG,
29002+ const X86Subtarget &Subtarget) {
29003+ SDLoc dl(Op);
29004+ MVT VT = Op.getSimpleValueType();
29005+ SDValue Input = Op.getOperand(0);
29006+
29007+ if (!VT.isVector() || VT.getVectorElementType() != MVT::i8)
29008+ return SDValue();
29009+ SmallVector<SDValue, 16> MatrixVals;
29010+ for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
29011+ uint8_t mask = 1 << (7 - (i % 8));
29012+ MatrixVals.push_back(DAG.getConstant(mask, dl, MVT::i8));
29013+ }
29014+
29015+ SDValue Matrix = DAG.getBuildVector(VT, dl, MatrixVals);
29016+ SDValue Reversed = DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, Input, Matrix,
29017+ DAG.getTargetConstant(0, dl, MVT::i8));
29018+ SDValue AddMask = DAG.getConstant(0xFF, dl, MVT::i8);
29019+
29020+ SDValue AddVec = DAG.getSplatBuildVector(VT, dl, AddMask);
29021+ SDValue Summed = DAG.getNode(ISD::ADD, dl, VT, Reversed, AddVec);
29022+ SDValue NotSummed = DAG.getNode(ISD::XOR, dl, VT, Summed, AddVec);
29023+ SDValue Filtered = DAG.getNode(ISD::AND, dl, VT, NotSummed, Reversed);
29024+ SDValue FinalMatrix = DAG.getBuildVector(VT, dl, MatrixVals);
29025+ SDValue LZCNT =
29026+ DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, Filtered, FinalMatrix,
29027+ DAG.getTargetConstant(8, dl, MVT::i8));
29028+ return LZCNT;
29029+ }
2900129030
2900229031static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
2900329032 SelectionDAG &DAG) {
@@ -29007,6 +29036,9 @@ static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
2900729036 SDLoc dl(Op);
2900829037 unsigned Opc = Op.getOpcode();
2900929038
29039+ if (VT.isVector() && VT.getScalarType() == MVT::i8 && Subtarget.hasGFNI())
29040+ return LowerVectorCTLZ_GFNI(Op, DAG, Subtarget);
29041+
2901029042 if (VT.isVector())
2901129043 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
2901229044
0 commit comments