@@ -28988,7 +28988,7 @@ static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
2898828988 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
2898928989
2899028990 // Decompose 256-bit ops into smaller 128-bit ops.
28991- if (VT.is256BitVector() && !Subtarget.hasInt256())
28991+ if (VT.is256BitVector() && !Subtarget.hasInt256())
2899228992 return splitVectorIntUnary(Op, DAG, DL);
2899328993
2899428994 // Decompose 512-bit ops into smaller 256-bit ops.
@@ -28998,6 +28998,7 @@ static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
2899828998 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
2899928999 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
2900029000}
29001+
2900129002static SDValue LowerVectorCTLZ_GFNI(SDValue Op, const SDLoc &DL,
2900229003 SelectionDAG &DAG,
2900329004 const X86Subtarget &Subtarget) {
@@ -29007,20 +29008,11 @@ static SDValue LowerVectorCTLZ_GFNI(SDValue Op, const SDLoc &DL,
2900729008 assert(VT.isVector() && VT.getVectorElementType() == MVT::i8 &&
2900829009 "Expected vXi8 input for GFNI-based CTLZ lowering");
2900929010
29010- // Step 1: Bit-reverse input
2901129011 SDValue Reversed = DAG.getNode(ISD::BITREVERSE, DL, VT, Input);
2901229012
29013- // Step 2: Add 0xFF
29014- SDValue AddVec = DAG.getAllOnesConstant(DL, VT);
29015- SDValue Summed = DAG.getNode(ISD::ADD, DL, VT, Reversed, AddVec);
29016-
29017- // Step 3: Not(Summed)
29018- SDValue NotSummed = DAG.getNOT(DL, Summed, VT);
29013+ SDValue Neg = DAG.getNegative(Reversed, DL, VT);
29014+ SDValue Filtered = DAG.getNode(ISD::AND, DL, VT, Reversed, Neg);
2901929015
29020- // Step 4: AND with Reversed
29021- SDValue Filtered = DAG.getNode(ISD::AND, DL, VT, NotSummed, Reversed);
29022-
29023- // Step 5: Apply CTTZ LUT using GF2P8AFFINEQB
2902429016 MVT VT64 = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
2902529017 SDValue CTTZConst = DAG.getConstant(0xAACCF0FF00000000ULL, DL, VT64);
2902629018 SDValue CTTZMatrix = DAG.getBitcast(VT, CTTZConst);
@@ -29031,7 +29023,6 @@ static SDValue LowerVectorCTLZ_GFNI(SDValue Op, const SDLoc &DL,
2903129023 return LZCNT;
2903229024}
2903329025
29034-
2903529026static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
2903629027 SelectionDAG &DAG) {
2903729028 MVT VT = Op.getSimpleValueType();
0 commit comments