
Commit b0e77d5

[RISCV] Lower the shufflevector equivalent of vector.splice
We can lower a vector splice to a vslidedown and a vslideup. The majority of the matching code here came from X86's code for matching PALIGNR and VPALIGND/Q. The slidedown and slideup lowering doesn't really require the shuffle to be a concatenation, but concatenation happened to be an interesting pattern with existing analysis code I could reuse.

This helps with cases where the scalar loop optimizer forwarded a load result from a previous loop iteration. For example, this happens if the loop uses x[i] and x[i+1] on the same iteration: the scalar optimizer forwards the x[i+1] load from the previous iteration to satisfy x[i] on the current one. When this gets vectorized, one element of the vector is forwarded from the previous iteration and concatenated with the elements loaded on this iteration.

Whether that's more efficient than doing a shifted load, or reloading the single scalar and using vslide1up, is an interesting question. But that's not something the backend can help with.

Reviewed By: khchen

Differential Revision: https://reviews.llvm.org/D119039
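As a rough illustration of the scalar source pattern described above (a hypothetical sketch; the function and array names are invented, and whether the vectorizer forms exactly this shuffle depends on the rest of the pipeline):

// Hypothetical C++ loop: each iteration reads x[i] and x[i + 1]. The scalar
// optimizer can forward the x[i + 1] load to satisfy x[i] on the next
// iteration; once vectorized, that forwarded element from the previous
// vector iteration gets concatenated with the freshly loaded elements,
// which shows up as the splice-like shufflevector this patch lowers.
void sum_adjacent(const float *x, float *y, int n) {
  for (int i = 0; i + 1 < n; ++i)
    y[i] = x[i] + x[i + 1];
}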

3 files changed, 194 insertions(+), 0 deletions(-)


llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 93 additions & 0 deletions
@@ -2514,6 +2514,72 @@ static bool isInterleaveShuffle(ArrayRef<int> Mask, MVT VT, bool &SwapSources,
   return true;
 }
 
+static int isElementRotate(SDValue &V1, SDValue &V2, ArrayRef<int> Mask) {
+  int Size = Mask.size();
+
+  // We need to detect various ways of spelling a rotation:
+  //   [11, 12, 13, 14, 15,  0,  1,  2]
+  //   [-1, 12, 13, 14, -1, -1,  1, -1]
+  //   [-1, -1, -1, -1, -1, -1,  1,  2]
+  //   [ 3,  4,  5,  6,  7,  8,  9, 10]
+  //   [-1,  4,  5,  6, -1, -1,  9, -1]
+  //   [-1,  4,  5,  6, -1, -1, -1, -1]
+  int Rotation = 0;
+  SDValue Lo, Hi;
+  for (int i = 0; i != Size; ++i) {
+    int M = Mask[i];
+    if (M < 0)
+      continue;
+
+    // Determine where a rotate vector would have started.
+    int StartIdx = i - (M % Size);
+    // The identity rotation isn't interesting, stop.
+    if (StartIdx == 0)
+      return -1;
+
+    // If we found the tail of a vector the rotation must be the missing
+    // front. If we found the head of a vector, it must be how much of the
+    // head.
+    int CandidateRotation = StartIdx < 0 ? -StartIdx : Size - StartIdx;
+
+    if (Rotation == 0)
+      Rotation = CandidateRotation;
+    else if (Rotation != CandidateRotation)
+      // The rotations don't match, so we can't match this mask.
+      return -1;
+
+    // Compute which value this mask is pointing at.
+    SDValue MaskV = M < Size ? V1 : V2;
+
+    // Compute which of the two target values this index should be assigned to.
+    // This reflects whether the high elements are remaining or the low
+    // elements are remaining.
+    SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
+
+    // Either set up this value if we've not encountered it before, or check
+    // that it remains consistent.
+    if (!TargetV)
+      TargetV = MaskV;
+    else if (TargetV != MaskV)
+      // This may be a rotation, but it pulls from the inputs in some
+      // unsupported interleaving.
+      return -1;
+  }
+
+  // Check that we successfully analyzed the mask, and normalize the results.
+  assert(Rotation != 0 && "Failed to locate a viable rotation!");
+  assert((Lo || Hi) && "Failed to find a rotated input vector!");
+
+  // Make sure we've found a value for both halves.
+  if (!Lo || !Hi)
+    return -1;
+
+  V1 = Lo;
+  V2 = Hi;
+
+  return Rotation;
+}
+
 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
                                    const RISCVSubtarget &Subtarget) {
   SDValue V1 = Op.getOperand(0);
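For reference, a small standalone C++ sketch of the same mask analysis, with the two shuffle sources modeled as plain integer tags instead of SDValues (the names and the driver below are illustrative only, not part of the patch):

#include <cassert>
#include <vector>

// Standalone model of isElementRotate above: sources are tagged 0 (first
// input) and 1 (second input). Returns the rotation amount, or -1 if the
// mask is not a single consistent rotation of a two-source concatenation.
static int modelElementRotate(const std::vector<int> &Mask, int &LoSrc,
                              int &HiSrc) {
  int Size = (int)Mask.size();
  int Rotation = 0;
  LoSrc = HiSrc = -1;
  for (int i = 0; i != Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int StartIdx = i - (M % Size);
    if (StartIdx == 0)
      return -1; // identity rotation, not interesting
    int Candidate = StartIdx < 0 ? -StartIdx : Size - StartIdx;
    if (Rotation == 0)
      Rotation = Candidate;
    else if (Rotation != Candidate)
      return -1; // inconsistent rotation amounts
    int Src = M < Size ? 0 : 1;                 // which input this element reads
    int &Target = StartIdx < 0 ? HiSrc : LoSrc; // which slide it must feed
    if (Target < 0)
      Target = Src;
    else if (Target != Src)
      return -1; // one contiguous region would mix both inputs
  }
  return (LoSrc >= 0 && HiSrc >= 0) ? Rotation : -1;
}

int main() {
  int Lo = -1, Hi = -1;
  // Mask from the splice_binary2 test in the fp test file below:
  // tail of the second source followed by the head of the first.
  std::vector<int> Mask = {13, 14, 15, 0, 1, 2, 3, 4};
  int Rot = modelElementRotate(Mask, Lo, Hi);
  // Expect a rotation of 5: the second source feeds the slidedown (low half
  // of the result) and the first source feeds the slideup (high half).
  assert(Rot == 5 && Hi == 1 && Lo == 0);
  (void)Rot;
  return 0;
}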
@@ -2619,6 +2685,33 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
     return convertFromScalableVector(VT, SlideDown, DAG, Subtarget);
   }
 
+  // Match shuffles that concatenate two vectors, rotate the concatenation,
+  // and then extract the original number of elements from the rotated result.
+  // This is equivalent to vector.splice or X86's PALIGNR instruction. Lower
+  // it to a SLIDEDOWN and a SLIDEUP.
+  // FIXME: We don't really need it to be a concatenation. We just need two
+  // regions with contiguous elements that need to be shifted down and up.
+  int Rotation = isElementRotate(V1, V2, Mask);
+  if (Rotation > 0) {
+    // We found a rotation. We need to slide V2 down by Rotation, using
+    // (NumElts - Rotation) for VL. Then we need to slide V1 up by
+    // (NumElts - Rotation) using NumElts for VL.
+    V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
+    V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
+
+    unsigned InvRotate = NumElts - Rotation;
+    SDValue SlideDown =
+        DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, ContainerVT,
+                    DAG.getUNDEF(ContainerVT), V2,
+                    DAG.getConstant(Rotation, DL, XLenVT),
+                    TrueMask, DAG.getConstant(InvRotate, DL, XLenVT));
+    SDValue SlideUp =
+        DAG.getNode(RISCVISD::VSLIDEUP_VL, DL, ContainerVT, SlideDown, V1,
+                    DAG.getConstant(InvRotate, DL, XLenVT),
+                    TrueMask, VL);
+    return convertFromScalableVector(VT, SlideUp, DAG, Subtarget);
+  }
+
   // Detect an interleave shuffle and lower to
   // (vmaccu.vx (vwaddu.vx lohalf(V1), lohalf(V2)), lohalf(V2), (2^eltbits - 1))
   bool SwapSources;
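To make the slide amounts concrete, a worked example using the splice_binary2 mask from the fp test below (this arithmetic is the editor's illustration, not text from the patch): for <8 x double> with mask <13, 14, 15, 0, 1, 2, 3, 4>, isElementRotate returns Rotation = 5, so InvRotate = NumElts - Rotation = 8 - 5 = 3. The lowering slides the second shuffle source down by 5 with VL = 3, placing its tail in result elements 0..2, then slides the first source up by 3 with VL = 8 (tail undisturbed), filling elements 3..7 — which matches the vslidedown.vi/vslideup.vi pair in the generated checks.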

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll

Lines changed: 51 additions & 0 deletions
@@ -255,3 +255,54 @@ define <8 x float> @slidedown_v8f32(<8 x float> %x) {
   %s = shufflevector <8 x float> %x, <8 x float> poison, <8 x i32> <i32 3, i32 undef, i32 5, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x float> %s
 }
+
+define <8 x float> @splice_unary(<8 x float> %x) {
+; CHECK-LABEL: splice_unary:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 7, e32, m2, ta, mu
+; CHECK-NEXT:    vslidedown.vi v10, v8, 1
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, tu, mu
+; CHECK-NEXT:    vslideup.vi v10, v8, 7
+; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    ret
+  %s = shufflevector <8 x float> %x, <8 x float> poison, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0>
+  ret <8 x float> %s
+}
+
+define <8 x double> @splice_unary2(<8 x double> %x) {
+; CHECK-LABEL: splice_unary2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e64, m4, ta, mu
+; CHECK-NEXT:    vslidedown.vi v12, v8, 6
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, tu, mu
+; CHECK-NEXT:    vslideup.vi v12, v8, 2
+; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    ret
+  %s = shufflevector <8 x double> %x, <8 x double> poison, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+  ret <8 x double> %s
+}
+
+define <8 x float> @splice_binary(<8 x float> %x, <8 x float> %y) {
+; CHECK-LABEL: splice_binary:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 6, e32, m2, ta, mu
+; CHECK-NEXT:    vslidedown.vi v8, v8, 2
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, tu, mu
+; CHECK-NEXT:    vslideup.vi v8, v10, 6
+; CHECK-NEXT:    ret
+  %s = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 9>
+  ret <8 x float> %s
+}
+
+define <8 x double> @splice_binary2(<8 x double> %x, <8 x double> %y) {
+; CHECK-LABEL: splice_binary2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 3, e64, m4, ta, mu
+; CHECK-NEXT:    vslidedown.vi v12, v12, 5
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, tu, mu
+; CHECK-NEXT:    vslideup.vi v12, v8, 3
+; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    ret
+  %s = shufflevector <8 x double> %x, <8 x double> %y, <8 x i32> <i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4>
+  ret <8 x double> %s
+}

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll

Lines changed: 50 additions & 0 deletions
@@ -554,3 +554,53 @@ define <8 x i32> @slidedown_v8i32(<8 x i32> %x) {
   %s = shufflevector <8 x i32> %x, <8 x i32> poison, <8 x i32> <i32 3, i32 undef, i32 5, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i32> %s
 }
+
+define <8 x i16> @splice_unary(<8 x i16> %x) {
+; CHECK-LABEL: splice_unary:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, mu
+; CHECK-NEXT:    vslidedown.vi v9, v8, 2
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, tu, mu
+; CHECK-NEXT:    vslideup.vi v9, v8, 6
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %s = shufflevector <8 x i16> %x, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1>
+  ret <8 x i16> %s
+}
+
+define <8 x i32> @splice_unary2(<8 x i32> %x) {
+; CHECK-LABEL: splice_unary2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 3, e32, m2, ta, mu
+; CHECK-NEXT:    vslidedown.vi v10, v8, 5
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, tu, mu
+; CHECK-NEXT:    vslideup.vi v10, v8, 3
+; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    ret
+  %s = shufflevector <8 x i32> %x, <8 x i32> poison, <8 x i32> <i32 undef, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4>
+  ret <8 x i32> %s
+}
+
+define <8 x i16> @splice_binary(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: splice_binary:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, mu
+; CHECK-NEXT:    vslidedown.vi v8, v8, 2
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, tu, mu
+; CHECK-NEXT:    vslideup.vi v8, v9, 6
+; CHECK-NEXT:    ret
+  %s = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 9>
+  ret <8 x i16> %s
+}
+
+define <8 x i32> @splice_binary2(<8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: splice_binary2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 3, e32, m2, ta, mu
+; CHECK-NEXT:    vslidedown.vi v8, v8, 5
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, tu, mu
+; CHECK-NEXT:    vslideup.vi v8, v10, 3
+; CHECK-NEXT:    ret
+  %s = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
+  ret <8 x i32> %s
+}
