@@ -29628,6 +29628,62 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
2962829628 DAG.getNode(Opc, dl, ExtVT, R, Amt));
2962929629 }
2963029630
29631+ // GFNI - we can perform SHL with a GF multiplication, and can convert
29632+ // SRL/SRA to a SHL.
// This path handles v16i8 unconditionally, v32i8 only when the subtarget has
// Int256 and does not have XOP, and v64i8 only with BWI. In addition the
// subtarget must report both GFNI and SSSE3. Each opcode branch below returns
// the lowered value; any other opcode falls through to the code that follows.
29633+ if (VT == MVT::v16i8 ||
29634+ (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
29635+ (VT == MVT::v64i8 && Subtarget.hasBWI())) {
29636+ if (Subtarget.hasGFNI() && Subtarget.hasSSSE3()) {
// Builds the nodes for a per-element left shift of Val by Amt. Amt, VT, DAG
// and dl are captured by reference from the enclosing scope.
29637+ auto GFShiftLeft = [&](SDValue Val) {
29638+ // Use PSHUFB as a LUT from the shift amount to create a per-element
29639+ // byte mask for the shift value and an index. Shift amounts that select
29640+ // one of the zero table entries (8..15) give a zero mask and zero index.
// Reading each 64-bit constant from its least-significant byte, entry i
// (0..7) of the mask table is 0xFF >> i and entry i of the index table is
// 1 << i. The second 64-bit half of every 128-bit chunk is zero.
// NOTE(review): the result for amounts of 16 or more depends on how PSHUFB
// treats out-of-range selector bytes - confirm against the instruction
// reference if such amounts can reach this point.
29641+ SmallVector<APInt, 8> MaskBits, IdxBits;
// One (table, zero) pair of 64-bit elements per 128 bits of VT.
29642+ for (unsigned I = 0, E = VT.getSizeInBits() / 128; I != E; ++I) {
29643+ MaskBits.push_back(APInt(64, 0x0103070F1F3F7FFFULL));
29644+ IdxBits.push_back(APInt(64, 0x8040201008040201ULL));
29645+ MaskBits.push_back(APInt::getZero(64));
29646+ IdxBits.push_back(APInt::getZero(64));
29647+ }
29648+
// CVT is the i64-element vector type with the same total width as VT; the
// constants are built in that type and bitcast to VT.
29649+ MVT CVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
29650+ SDValue Mask =
29651+ DAG.getBitcast(VT, getConstVector(MaskBits, CVT, DAG, dl));
29652+ SDValue Idx = DAG.getBitcast(VT, getConstVector(IdxBits, CVT, DAG, dl));
// Select the per-element mask and multiplier, using Amt as the selector.
29653+ Mask = DAG.getNode(X86ISD::PSHUFB, dl, VT, Mask, Amt);
29654+ Idx = DAG.getNode(X86ISD::PSHUFB, dl, VT, Idx, Amt);
// Keep only the bits of Val allowed by the mask, then multiply by the
// selected power of two. Presumably the masking ensures the product still
// fits in 8 bits so the GF(2^8) reduction cannot change it - TODO confirm
// against the GF2P8MULB definition.
29655+ Mask = DAG.getNode(ISD::AND, dl, VT, Val, Mask);
29656+ return DAG.getNode(X86ISD::GF2P8MULB, dl, VT, Mask, Idx);
29657+ };
29658+
29659+ if (Opc == ISD::SHL)
29660+ return GFShiftLeft(R);
29661+
29662+ // srl(x,y)
29663+ // --> bitreverse(shl(bitreverse(x),y))
29664+ if (Opc == ISD::SRL) {
29665+ R = DAG.getNode(ISD::BITREVERSE, dl, VT, R);
29666+ R = GFShiftLeft(R);
29667+ return DAG.getNode(ISD::BITREVERSE, dl, VT, R);
29668+ }
29669+
29670+ // sra(x,y)
29671+ // --> sub(xor(srl(x,y), m),m)
29672+ // --> sub(xor(bitreverse(shl(bitreverse(x),y)), m),m)
29673+ // where m = srl(signbit, amt) --> bitreverse(shl(lsb, amt))
29674+ if (Opc == ISD::SRA) {
// LSB is a VT constant built from the 8-bit value with only bit 0 set. M is
// formed from it with the same Amt that is used to shift R.
29675+ SDValue LSB = DAG.getConstant(APInt::getOneBitSet(8, 0), dl, VT);
29676+ SDValue M = DAG.getNode(ISD::BITREVERSE, dl, VT, GFShiftLeft(LSB));
// Logical right shift of R, built the same way as in the SRL case above.
29677+ R = DAG.getNode(ISD::BITREVERSE, dl, VT, R);
29678+ R = GFShiftLeft(R);
29679+ R = DAG.getNode(ISD::BITREVERSE, dl, VT, R);
// Apply the xor/sub step from the formula above to turn the logical shift
// into an arithmetic one.
29680+ R = DAG.getNode(ISD::XOR, dl, VT, R, M);
29681+ R = DAG.getNode(ISD::SUB, dl, VT, R, M);
29682+ return R;
29683+ }
29684+ }
29685+ }
29686+
2963129687 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
2963229688 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
2963329689 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
@@ -55807,6 +55863,15 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
5580755863 ConcatSubOperand(VT, Ops, 0));
5580855864 }
5580955865 break;
55866+ case X86ISD::GF2P8MULB:
// Rebuild the multiply directly at type VT, but only when the inputs are not
// a splat and VT is 256-bit, or 512-bit on a subtarget that uses AVX512
// registers. Otherwise this case just breaks without returning a node.
// NOTE(review): ConcatSubOperand(VT, Ops, N) presumably yields the VT-wide
// concatenation of operand N of each entry in Ops - verify against its
// definition, which is outside this hunk.
55867+ if (!IsSplat &&
55868+ (VT.is256BitVector() ||
55869+ (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
55870+ return DAG.getNode(Op0.getOpcode(), DL, VT,
55871+ ConcatSubOperand(VT, Ops, 0),
55872+ ConcatSubOperand(VT, Ops, 1));
55873+ }
55874+ break;
5581055875 case X86ISD::GF2P8AFFINEQB:
5581155876 if (!IsSplat &&
5581255877 (VT.is256BitVector() ||
0 commit comments