@@ -58503,14 +58503,23 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
       const APInt &SrcIdx0 = Src0.getConstantOperandAPInt(1);
       const APInt &SrcIdx1 = Src1.getConstantOperandAPInt(1);
       // concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128.
-      // Only concat of subvector high halves which vperm2x128 is best at.
+      // Only concat of subvector high halves, which vperm2x128 is best at,
+      // or if it should fold into a subvector broadcast.
       if (VT.is256BitVector() && SrcVT0.is256BitVector() &&
-          SrcVT1.is256BitVector() && SrcIdx0 == (NumSrcElts0 / 2) &&
-          SrcIdx1 == (NumSrcElts1 / 2)) {
-        return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
-                           DAG.getBitcast(VT, Src0.getOperand(0)),
-                           DAG.getBitcast(VT, Src1.getOperand(0)),
-                           DAG.getTargetConstant(0x31, DL, MVT::i8));
+          SrcVT1.is256BitVector()) {
+        assert((SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
+               (SrcIdx1 == 0 || SrcIdx1 == (NumSrcElts1 / 2)) &&
+               "Bad subvector index");
+        if ((SrcIdx0 == (NumSrcElts0 / 2) && SrcIdx1 == (NumSrcElts1 / 2)) ||
+            (IsSplat && ISD::isNormalLoad(Src0.getOperand(0).getNode()))) {
+          unsigned Index = 0;
+          Index |= SrcIdx0 == 0 ? 0x00 : 0x01;
+          Index |= SrcIdx1 == 0 ? 0x20 : 0x30;
+          return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
+                             DAG.getBitcast(VT, Src0.getOperand(0)),
+                             DAG.getBitcast(VT, Src1.getOperand(0)),
+                             DAG.getTargetConstant(Index, DL, MVT::i8));
+        }
       }
       // Widen extract_subvector
       // concat(extract_subvector(x,lo), extract_subvector(x,hi))
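
For reference, the VPERM2X128 imm8 carries one 128-bit lane selector per result half: bits [1:0] pick the lane feeding the low half and bits [5:4] the lane feeding the high half, where values 0/1 address the low/high lane of the first source and 2/3 the low/high lane of the second. A minimal standalone sketch of the encoding built above (plain C++, not LLVM API code):

#include <cassert>
#include <cstdint>

// Sketch of the VPERM2X128 imm8 encoding (not LLVM API code). Each result
// half carries a 2-bit lane selector: values 0/1 address the low/high
// 128-bit lane of the first source, 2/3 the low/high lane of the second.
static uint8_t vperm2x128Imm(bool Src0Hi, bool Src1Hi) {
  uint8_t Imm = 0;
  Imm |= Src0Hi ? 0x01 : 0x00; // bits [1:0]: lane of src0 -> result low half
  Imm |= Src1Hi ? 0x30 : 0x20; // bits [5:4]: lane of src1 -> result high half
  return Imm;
}

int main() {
  assert(vperm2x128Imm(true, true) == 0x31);   // old hardcoded high/high case
  assert(vperm2x128Imm(false, false) == 0x20); // low/low, now reachable for
                                               // splatted subvector loads
  return 0;
}

Under this encoding the old hardcoded 0x31 is just the high/high case; computing the immediate from SrcIdx0/SrcIdx1 additionally covers 0x20/0x21/0x30 for the splat-of-load patterns the new guard admits.
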
@@ -58672,7 +58681,6 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
       break;
     }
     case X86ISD::SHUFP: {
-      // TODO: Add SHUFPD support if/when necessary.
       if (!IsSplat &&
           (VT == MVT::v8f32 ||
            (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) &&
@@ -58741,18 +58749,6 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
             DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
         return DAG.getBitcast(VT, Res);
       }
-      if (!IsSplat && (VT == MVT::v4f64 || VT == MVT::v8f64)) {
-        unsigned NumSubElts = Op0.getValueType().getVectorNumElements();
-        uint64_t Mask = (1ULL << NumSubElts) - 1;
-        uint64_t Idx = 0;
-        for (unsigned I = 0; I != NumOps; ++I) {
-          uint64_t SubIdx = Ops[I].getConstantOperandVal(1);
-          Idx |= (SubIdx & Mask) << (I * NumSubElts);
-        }
-        return DAG.getNode(X86ISD::VPERMILPI, DL, VT,
-                           ConcatSubOperand(VT, Ops, 0),
-                           DAG.getTargetConstant(Idx, DL, MVT::i8));
-      }
       break;
     case X86ISD::VPERMILPV:
       if (!IsSplat && (VT.is256BitVector() ||
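
For context on the block removed above: VPERMILPD consumes one selector bit per f64 element, each bit choosing between the two elements of that element's own 128-bit lane, so per-subvector immediates concatenate by plain shifting. A standalone sketch of that packing (plain C++, hypothetical helper name, not LLVM code); the per-lane fold itself appears to be superseded by the more general SHUFPD concatenation added further below:

#include <cassert>
#include <cstdint>

// Sketch of the immediate packing done by the removed VPERMILPI fold
// (hypothetical helper, not LLVM API code). VPERMILPD uses one bit per
// f64 element, selecting element 0 or 1 within that element's 128-bit
// lane, so per-subvector immediates concatenate by shifting.
static uint64_t concatPermilPDImm(const uint64_t SubImms[], unsigned NumOps,
                                  unsigned NumSubElts) {
  uint64_t Mask = (1ULL << NumSubElts) - 1;
  uint64_t Idx = 0;
  for (unsigned I = 0; I != NumOps; ++I)
    Idx |= (SubImms[I] & Mask) << (I * NumSubElts);
  return Idx;
}

int main() {
  // Two v2f64 VPERMILPD ops with imms 0b10 and 0b01 concatenate into a
  // single v4f64 imm of 0b0110.
  const uint64_t SubImms[2] = {0b10, 0b01};
  assert(concatPermilPDImm(SubImms, 2, 2) == 0b0110);
  return 0;
}
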
@@ -59323,6 +59319,59 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
     return DAG.getBitcast(VT, Res);
   }
 
+  // We can always convert per-lane vXf64 shuffles into VSHUFPD.
+  if (!IsSplat &&
+      (VT == MVT::v4f64 || (VT == MVT::v8f64 && Subtarget.useAVX512Regs())) &&
+      all_of(Ops, [](SDValue Op) {
+        return Op.hasOneUse() && (Op.getOpcode() == X86ISD::MOVDDUP ||
+                                  Op.getOpcode() == X86ISD::SHUFP ||
+                                  Op.getOpcode() == X86ISD::VPERMILPI ||
+                                  Op.getOpcode() == X86ISD::BLENDI ||
+                                  Op.getOpcode() == X86ISD::UNPCKL ||
+                                  Op.getOpcode() == X86ISD::UNPCKH);
+      })) {
+    // Collect the individual per-lane v2f64/v4f64 shuffles.
+    MVT OpVT = Ops[0].getSimpleValueType();
+    unsigned NumOpElts = OpVT.getVectorNumElements();
+    SmallVector<SmallVector<SDValue, 2>, 4> SrcOps(NumOps);
+    SmallVector<SmallVector<int, 8>, 4> SrcMasks(NumOps);
+    if (all_of(seq<int>(NumOps), [&](int I) {
+          return getTargetShuffleMask(Ops[I], /*AllowSentinelZero=*/false,
+                                      SrcOps[I], SrcMasks[I]) &&
+                 !is128BitLaneCrossingShuffleMask(OpVT, SrcMasks[I]) &&
+                 SrcMasks[I].size() == NumOpElts &&
+                 all_of(SrcOps[I], [&OpVT](SDValue V) {
+                   return V.getValueType() == OpVT;
+                 });
+        })) {
+      // Concatenate the shuffle masks into a SHUFPD mask and collect subops.
+      bool Unary = true;
+      unsigned SHUFPDMask = 0;
+      SmallVector<SDValue, 4> LHS(NumOps), RHS(NumOps);
+      for (unsigned I = 0; I != NumOps; ++I) {
+        LHS[I] = SrcOps[I][SrcMasks[I][0] / NumOpElts];
+        RHS[I] = SrcOps[I][SrcMasks[I][1] / NumOpElts];
+        Unary &= LHS[I] == RHS[I];
+        for (unsigned J = 0; J != NumOpElts; ++J)
+          SHUFPDMask |= (SrcMasks[I][J] & 1) << ((I * NumOpElts) + J);
+      }
+      // Concat SHUFPD LHS/RHS operands - if they match then it will become a
+      // PERMILPD mask and we can always profitably concatenate them.
+      SDValue Concat0 =
+          combineConcatVectorOps(DL, VT, LHS, DAG, Subtarget, Depth + 1);
+      SDValue Concat1 =
+          combineConcatVectorOps(DL, VT, RHS, DAG, Subtarget, Depth + 1);
+      if (Unary || Concat0 || Concat1) {
+        Concat0 =
+            Concat0 ? Concat0 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS);
+        Concat1 =
+            Concat1 ? Concat1 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS);
+        return DAG.getNode(X86ISD::SHUFP, DL, VT, Concat0, Concat1,
+                           DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
+      }
+    }
+  }
+
   return SDValue();
 }
 
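The SHUFPD immediate assembled in the new block follows a one-bit-per-element scheme: within each 128-bit lane, the even result element is selected from the first operand and the odd result element from the second, each by the low bit of the corresponding shuffle-mask entry. A small worked sketch of the mask accumulation loop (plain C++, hypothetical mask values):

#include <cassert>

// Worked sketch of the SHUFPD immediate accumulation above (plain C++,
// hypothetical mask values). Each f64 result element J of subop I gets
// one selector bit at position I * NumOpElts + J; the bit is the low bit
// of the shuffle mask entry, i.e. whether the element comes from the odd
// or even position of its 128-bit lane.
int main() {
  const unsigned NumOps = 2, NumOpElts = 2;
  // Per-subop v2f64 masks: {1, 3} selects a[1] then b[1];
  // {0, 2} selects a[0] then b[0].
  const int SrcMasks[NumOps][NumOpElts] = {{1, 3}, {0, 2}};
  unsigned SHUFPDMask = 0;
  for (unsigned I = 0; I != NumOps; ++I)
    for (unsigned J = 0; J != NumOpElts; ++J)
      SHUFPDMask |= (SrcMasks[I][J] & 1) << (I * NumOpElts + J);
  assert(SHUFPDMask == 0b0011); // Lane 0 takes odd elements, lane 1 even.
  return 0;
}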