Conversation

MacDue (Member) commented Nov 18, 2024

This now tries to widen the shuffle before generating a possibly expensive SVE TBL, which may allow the shuffle to be matched as something cheaper, such as a ZIP1.
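
As a concrete, hedged illustration (adapted from the concat_v4f16 test updated by this PR; only the function signature and the emitted assembly appear in the diff, so the shufflevector body below is an assumed sketch of a standard two-operand concatenation):

target triple = "aarch64-unknown-linux-gnu"

; Assumed body of the concat_v4f16 test: concatenate two <2 x half> values
; into a <4 x half>. Before this change the streaming-compatible SVE2 path
; built an index vector via constant-pool loads and a mad, then used
; "tbl z0.h"; after widening the shuffle and folding the redundant ZIP1s,
; the updated test expects a single "zip1 z0.s, z0.s, z1.s".
define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2) {
  %res = shufflevector <2 x half> %op1, <2 x half> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x half> %res
}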

MacDue marked this pull request as ready for review November 19, 2024 13:09

llvmbot commented Nov 19, 2024

@llvm/pr-subscribers-backend-aarch64

Author: Benjamin Maxwell (MacDue)

Changes
  • Avoid using TBL for small vectors (that can be lowered with a couple of ZIP1s); see the IR sketch after this list
  • Fold redundant ZIP1s
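
The IR below sketches the first bullet (a hedged reconstruction: only the function signature and the updated CHECK lines of this test appear in the diff, so the body and shuffle mask are inferred from the test name and the emitted mov/zip1 sequence):

; Assumed body of test_rev_elts_fail from
; sve-streaming-mode-fixed-length-permute-rev.ll: swap the two i64 elements
; within each 128-bit half of a <4 x i64>. After legalization each half is a
; two-element shuffle that fits in 128 bits, so the new size check makes
; GenerateFixedLengthSVETBL bail out, and the updated test expects a mov/zip1
; pair per half instead of an index vector plus tbl.
define void @test_rev_elts_fail(ptr %a) {
  %v = load <4 x i64>, ptr %a
  %rev = shufflevector <4 x i64> %v, <4 x i64> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  store <4 x i64> %rev, ptr %a
  ret void
}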

Full diff: https://github.com/llvm/llvm-project/pull/116662.diff

4 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+83)
  • (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll (+84-106)
  • (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll (+9-29)
  • (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll (+6-5)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9d1c3d4eddc880..c6b0f5876f4607 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -24720,6 +24720,80 @@ static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
                      Op0ExtV, Op1ExtV, Op->getOperand(2));
 }
 
+static SDValue skipElementSizePreservingCast(SDValue Op, EVT VT) {
+  if (Op->getOpcode() == ISD::BITCAST)
+    Op = Op->getOperand(0);
+  EVT OpVT = Op.getValueType();
+  if (OpVT.isVector() && OpVT.getVectorElementType().getSizeInBits() ==
+                             VT.getVectorElementType().getSizeInBits())
+    return Op;
+  return SDValue();
+}
+
+static SDValue performZIP1Combine(SDNode *N, SelectionDAG &DAG) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  EVT EltVT = VT.getVectorElementType();
+  SDValue Op0 = skipElementSizePreservingCast(N->getOperand(0), VT);
+  SDValue Op1 = skipElementSizePreservingCast(N->getOperand(1), VT);
+  if (Op0 && Op1 && Op0->getOpcode() == ISD::INSERT_VECTOR_ELT &&
+      Op1->getOpcode() == ISD::INSERT_VECTOR_ELT) {
+    SDValue Op00 = Op0->getOperand(0);
+    SDValue Op10 = Op1->getOperand(0);
+    if (Op00.isUndef() && Op10.isUndef() &&
+        Op0->getConstantOperandVal(2) == 0 &&
+        Op1->getConstantOperandVal(2) == 0) {
+      SDValue Op01 = Op0->getOperand(1);
+      SDValue Op11 = Op1->getOperand(1);
+      if (Op01->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+          Op11->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+        SDValue Op010 = skipElementSizePreservingCast(Op01->getOperand(0), VT);
+        SDValue Op110 = skipElementSizePreservingCast(Op11->getOperand(0), VT);
+        unsigned StartExtractIdx = Op01->getConstantOperandVal(1);
+        if (Op010 && Op010 == Op110 &&
+            Op11->getConstantOperandVal(1) == StartExtractIdx + 1 &&
+            StartExtractIdx % 2 == 0) {
+          //       t0: nxv16i8 = ...
+          //     t1: i32 = extract_vector_elt t0, Constant:i64<n>
+          //     t2: i32 = extract_vector_elt t0, Constant:i64<n + 1>
+          //   t3: nxv16i8 = insert_vector_elt(undef, t1, 0)
+          //   t4: nxv16i8 = insert_vector_elt(undef, t2, 0)
+          // t5: nxv16i8 = zip1(t3, t4)
+          //
+          // ->
+          //         t0: nxv16i8 = ...
+          //       t1: nxv8i16 = bitcast t0
+          //     t2: i32 = extract_vector_elt t1, Constant:i64<n / 2>
+          //   t3: nxv8i16 = insert_vector_elt(undef, t2, 0)
+          // t4: nxv16i8 = bitcast t3
+          //
+          // Where n % 2 == 0
+          SDValue Result;
+          if (StartExtractIdx == 0)
+            Result = Op010;
+          else if (EltVT.getSizeInBits() < 64) {
+            unsigned LargeEltBits = EltVT.getSizeInBits() * 2;
+            EVT LargeEltVT = MVT::getVectorVT(
+                MVT::getIntegerVT(LargeEltBits),
+                VT.getVectorElementCount().divideCoefficientBy(2));
+            EVT ExtractVT = MVT::getIntegerVT(std::max(LargeEltBits, 32U));
+            SDValue Extract =
+                DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT,
+                            DAG.getBitcast(LargeEltVT, Op010),
+                            DAG.getVectorIdxConstant(StartExtractIdx / 2, DL));
+            Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, LargeEltVT,
+                                 DAG.getUNDEF(LargeEltVT), Extract,
+                                 DAG.getVectorIdxConstant(0, DL));
+          }
+          if (Result)
+            return DAG.getBitcast(VT, Result);
+        }
+      }
+    }
+  }
+  return SDValue();
+}
+
 static SDValue
 performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                                SelectionDAG &DAG) {
@@ -26161,6 +26235,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
 
     break;
   }
+  case AArch64ISD::ZIP1:
+    return performZIP1Combine(N, DAG);
   case ISD::XOR:
     return performXorCombine(N, DAG, DCI, Subtarget);
   case ISD::MUL:
@@ -29030,7 +29106,14 @@ static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
   if (!IsSingleOp && !Subtarget.hasSVE2())
     return SDValue();
 
+  // Small vectors (with few extracts) can be lowered more efficiently as a
+  // sequence of ZIPs.
   EVT VTOp1 = Op.getOperand(0).getValueType();
+  unsigned NumElts = VT.getVectorNumElements();
+  if (VT.isPow2VectorType() && VT.getFixedSizeInBits() <= 128 &&
+      (NumElts <= 2 || (NumElts <= 4 && !Op2.isUndef())))
+    return SDValue();
+
   unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
   unsigned IndexLen = MinSVESize / BitsPerElt;
   unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
index 20659cde83ee00..45285f5f6b6938 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
@@ -140,64 +140,52 @@ define <8 x i8> @shuffle_index_indices_from_both_ops(ptr %a, ptr %b) {
 ;
 ; SVE2_128_NOMAX-LABEL: shuffle_index_indices_from_both_ops:
 ; SVE2_128_NOMAX:       // %bb.0:
-; SVE2_128_NOMAX-NEXT:    ldr d0, [x1]
-; SVE2_128_NOMAX-NEXT:    ldr d1, [x0]
-; SVE2_128_NOMAX-NEXT:    mov z2.b, z0.b[3]
-; SVE2_128_NOMAX-NEXT:    mov z3.b, z0.b[2]
-; SVE2_128_NOMAX-NEXT:    mov z4.b, z0.b[1]
-; SVE2_128_NOMAX-NEXT:    mov z1.b, z1.b[1]
-; SVE2_128_NOMAX-NEXT:    mov z5.b, z0.b[7]
-; SVE2_128_NOMAX-NEXT:    mov z6.b, z0.b[6]
-; SVE2_128_NOMAX-NEXT:    mov z0.b, z0.b[4]
-; SVE2_128_NOMAX-NEXT:    zip1 z2.b, z3.b, z2.b
-; SVE2_128_NOMAX-NEXT:    zip1 z1.b, z1.b, z4.b
-; SVE2_128_NOMAX-NEXT:    zip1 z3.b, z6.b, z5.b
-; SVE2_128_NOMAX-NEXT:    zip1 z0.b, z0.b, z0.b
-; SVE2_128_NOMAX-NEXT:    zip1 z1.h, z1.h, z2.h
-; SVE2_128_NOMAX-NEXT:    zip1 z0.h, z0.h, z3.h
-; SVE2_128_NOMAX-NEXT:    zip1 z0.s, z1.s, z0.s
+; SVE2_128_NOMAX-NEXT:    ldr d0, [x0]
+; SVE2_128_NOMAX-NEXT:    ldr d1, [x1]
+; SVE2_128_NOMAX-NEXT:    mov z2.b, z1.b[4]
+; SVE2_128_NOMAX-NEXT:    mov z3.b, z1.b[1]
+; SVE2_128_NOMAX-NEXT:    mov z0.b, z0.b[1]
+; SVE2_128_NOMAX-NEXT:    mov z4.h, z1.h[3]
+; SVE2_128_NOMAX-NEXT:    mov z1.h, z1.h[1]
+; SVE2_128_NOMAX-NEXT:    zip1 z2.b, z2.b, z2.b
+; SVE2_128_NOMAX-NEXT:    zip1 z0.b, z0.b, z3.b
+; SVE2_128_NOMAX-NEXT:    zip1 z2.h, z2.h, z4.h
+; SVE2_128_NOMAX-NEXT:    zip1 z0.h, z0.h, z1.h
+; SVE2_128_NOMAX-NEXT:    zip1 z0.s, z0.s, z2.s
 ; SVE2_128_NOMAX-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2_128_NOMAX-NEXT:    ret
 ;
 ; SVE2_NOMIN_NOMAX-LABEL: shuffle_index_indices_from_both_ops:
 ; SVE2_NOMIN_NOMAX:       // %bb.0:
-; SVE2_NOMIN_NOMAX-NEXT:    ldr d0, [x1]
-; SVE2_NOMIN_NOMAX-NEXT:    ldr d1, [x0]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z2.b, z0.b[3]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z3.b, z0.b[2]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z4.b, z0.b[1]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z1.b, z1.b[1]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z5.b, z0.b[7]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z6.b, z0.b[6]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z0.b, z0.b[4]
-; SVE2_NOMIN_NOMAX-NEXT:    zip1 z2.b, z3.b, z2.b
-; SVE2_NOMIN_NOMAX-NEXT:    zip1 z1.b, z1.b, z4.b
-; SVE2_NOMIN_NOMAX-NEXT:    zip1 z3.b, z6.b, z5.b
-; SVE2_NOMIN_NOMAX-NEXT:    zip1 z0.b, z0.b, z0.b
-; SVE2_NOMIN_NOMAX-NEXT:    zip1 z1.h, z1.h, z2.h
-; SVE2_NOMIN_NOMAX-NEXT:    zip1 z0.h, z0.h, z3.h
-; SVE2_NOMIN_NOMAX-NEXT:    zip1 z0.s, z1.s, z0.s
+; SVE2_NOMIN_NOMAX-NEXT:    ldr d0, [x0]
+; SVE2_NOMIN_NOMAX-NEXT:    ldr d1, [x1]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z2.b, z1.b[4]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z3.b, z1.b[1]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z0.b, z0.b[1]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z4.h, z1.h[3]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z1.h, z1.h[1]
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z2.b, z2.b, z2.b
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z0.b, z0.b, z3.b
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z2.h, z2.h, z4.h
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z0.h, z0.h, z1.h
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z0.s, z0.s, z2.s
 ; SVE2_NOMIN_NOMAX-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2_NOMIN_NOMAX-NEXT:    ret
 ;
 ; SVE2_MIN_256_NOMAX-LABEL: shuffle_index_indices_from_both_ops:
 ; SVE2_MIN_256_NOMAX:       // %bb.0:
-; SVE2_MIN_256_NOMAX-NEXT:    ldr d0, [x1]
-; SVE2_MIN_256_NOMAX-NEXT:    ldr d1, [x0]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z2.b, z0.b[3]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z3.b, z0.b[2]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z4.b, z0.b[1]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z1.b, z1.b[1]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z5.b, z0.b[7]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z6.b, z0.b[6]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z0.b, z0.b[4]
-; SVE2_MIN_256_NOMAX-NEXT:    zip1 z2.b, z3.b, z2.b
-; SVE2_MIN_256_NOMAX-NEXT:    zip1 z1.b, z1.b, z4.b
-; SVE2_MIN_256_NOMAX-NEXT:    zip1 z3.b, z6.b, z5.b
-; SVE2_MIN_256_NOMAX-NEXT:    zip1 z0.b, z0.b, z0.b
-; SVE2_MIN_256_NOMAX-NEXT:    zip1 z1.h, z1.h, z2.h
-; SVE2_MIN_256_NOMAX-NEXT:    zip1 z0.h, z0.h, z3.h
-; SVE2_MIN_256_NOMAX-NEXT:    zip1 z0.s, z1.s, z0.s
+; SVE2_MIN_256_NOMAX-NEXT:    ldr d0, [x0]
+; SVE2_MIN_256_NOMAX-NEXT:    ldr d1, [x1]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z2.b, z1.b[4]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z3.b, z1.b[1]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z0.b, z0.b[1]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z4.h, z1.h[3]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z1.h, z1.h[1]
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z2.b, z2.b, z2.b
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z0.b, z0.b, z3.b
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z2.h, z2.h, z4.h
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z0.h, z0.h, z1.h
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z0.s, z0.s, z2.s
 ; SVE2_MIN_256_NOMAX-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2_MIN_256_NOMAX-NEXT:    ret
   %op1 = load <8 x i8>, ptr %a
@@ -230,58 +218,52 @@ define <8 x i8> @shuffle_index_poison_value(ptr %a, ptr %b) {
 ;
 ; SVE2_128_NOMAX-LABEL: shuffle_index_poison_value:
 ; SVE2_128_NOMAX:       // %bb.0:
-; SVE2_128_NOMAX-NEXT:    ldr d0, [x1]
-; SVE2_128_NOMAX-NEXT:    ldr d1, [x0]
-; SVE2_128_NOMAX-NEXT:    mov z2.b, z0.b[3]
-; SVE2_128_NOMAX-NEXT:    mov z3.b, z0.b[2]
-; SVE2_128_NOMAX-NEXT:    mov z4.b, z0.b[1]
-; SVE2_128_NOMAX-NEXT:    mov z1.b, z1.b[1]
-; SVE2_128_NOMAX-NEXT:    mov z5.b, z0.b[4]
-; SVE2_128_NOMAX-NEXT:    mov z0.b, z0.b[6]
-; SVE2_128_NOMAX-NEXT:    zip1 z2.b, z3.b, z2.b
-; SVE2_128_NOMAX-NEXT:    zip1 z1.b, z1.b, z4.b
-; SVE2_128_NOMAX-NEXT:    zip1 z3.b, z5.b, z5.b
-; SVE2_128_NOMAX-NEXT:    zip1 z1.h, z1.h, z2.h
-; SVE2_128_NOMAX-NEXT:    zip1 z0.h, z3.h, z0.h
-; SVE2_128_NOMAX-NEXT:    zip1 z0.s, z1.s, z0.s
+; SVE2_128_NOMAX-NEXT:    ldr d0, [x0]
+; SVE2_128_NOMAX-NEXT:    ldr d1, [x1]
+; SVE2_128_NOMAX-NEXT:    mov z2.b, z1.b[4]
+; SVE2_128_NOMAX-NEXT:    mov z3.b, z1.b[1]
+; SVE2_128_NOMAX-NEXT:    mov z0.b, z0.b[1]
+; SVE2_128_NOMAX-NEXT:    mov z4.b, z1.b[6]
+; SVE2_128_NOMAX-NEXT:    mov z1.h, z1.h[1]
+; SVE2_128_NOMAX-NEXT:    zip1 z2.b, z2.b, z2.b
+; SVE2_128_NOMAX-NEXT:    zip1 z0.b, z0.b, z3.b
+; SVE2_128_NOMAX-NEXT:    zip1 z2.h, z2.h, z4.h
+; SVE2_128_NOMAX-NEXT:    zip1 z0.h, z0.h, z1.h
+; SVE2_128_NOMAX-NEXT:    zip1 z0.s, z0.s, z2.s
 ; SVE2_128_NOMAX-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2_128_NOMAX-NEXT:    ret
 ;
 ; SVE2_NOMIN_NOMAX-LABEL: shuffle_index_poison_value:
 ; SVE2_NOMIN_NOMAX:       // %bb.0:
-; SVE2_NOMIN_NOMAX-NEXT:    ldr d0, [x1]
-; SVE2_NOMIN_NOMAX-NEXT:    ldr d1, [x0]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z2.b, z0.b[3]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z3.b, z0.b[2]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z4.b, z0.b[1]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z1.b, z1.b[1]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z5.b, z0.b[4]
-; SVE2_NOMIN_NOMAX-NEXT:    mov z0.b, z0.b[6]
-; SVE2_NOMIN_NOMAX-NEXT:    zip1 z2.b, z3.b, z2.b
-; SVE2_NOMIN_NOMAX-NEXT:    zip1 z1.b, z1.b, z4.b
-; SVE2_NOMIN_NOMAX-NEXT:    zip1 z3.b, z5.b, z5.b
-; SVE2_NOMIN_NOMAX-NEXT:    zip1 z1.h, z1.h, z2.h
-; SVE2_NOMIN_NOMAX-NEXT:    zip1 z0.h, z3.h, z0.h
-; SVE2_NOMIN_NOMAX-NEXT:    zip1 z0.s, z1.s, z0.s
+; SVE2_NOMIN_NOMAX-NEXT:    ldr d0, [x0]
+; SVE2_NOMIN_NOMAX-NEXT:    ldr d1, [x1]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z2.b, z1.b[4]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z3.b, z1.b[1]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z0.b, z0.b[1]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z4.b, z1.b[6]
+; SVE2_NOMIN_NOMAX-NEXT:    mov z1.h, z1.h[1]
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z2.b, z2.b, z2.b
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z0.b, z0.b, z3.b
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z2.h, z2.h, z4.h
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z0.h, z0.h, z1.h
+; SVE2_NOMIN_NOMAX-NEXT:    zip1 z0.s, z0.s, z2.s
 ; SVE2_NOMIN_NOMAX-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2_NOMIN_NOMAX-NEXT:    ret
 ;
 ; SVE2_MIN_256_NOMAX-LABEL: shuffle_index_poison_value:
 ; SVE2_MIN_256_NOMAX:       // %bb.0:
-; SVE2_MIN_256_NOMAX-NEXT:    ldr d0, [x1]
-; SVE2_MIN_256_NOMAX-NEXT:    ldr d1, [x0]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z2.b, z0.b[3]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z3.b, z0.b[2]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z4.b, z0.b[1]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z1.b, z1.b[1]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z5.b, z0.b[4]
-; SVE2_MIN_256_NOMAX-NEXT:    mov z0.b, z0.b[6]
-; SVE2_MIN_256_NOMAX-NEXT:    zip1 z2.b, z3.b, z2.b
-; SVE2_MIN_256_NOMAX-NEXT:    zip1 z1.b, z1.b, z4.b
-; SVE2_MIN_256_NOMAX-NEXT:    zip1 z3.b, z5.b, z5.b
-; SVE2_MIN_256_NOMAX-NEXT:    zip1 z1.h, z1.h, z2.h
-; SVE2_MIN_256_NOMAX-NEXT:    zip1 z0.h, z3.h, z0.h
-; SVE2_MIN_256_NOMAX-NEXT:    zip1 z0.s, z1.s, z0.s
+; SVE2_MIN_256_NOMAX-NEXT:    ldr d0, [x0]
+; SVE2_MIN_256_NOMAX-NEXT:    ldr d1, [x1]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z2.b, z1.b[4]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z3.b, z1.b[1]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z0.b, z0.b[1]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z4.b, z1.b[6]
+; SVE2_MIN_256_NOMAX-NEXT:    mov z1.h, z1.h[1]
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z2.b, z2.b, z2.b
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z0.b, z0.b, z3.b
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z2.h, z2.h, z4.h
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z0.h, z0.h, z1.h
+; SVE2_MIN_256_NOMAX-NEXT:    zip1 z0.s, z0.s, z2.s
 ; SVE2_MIN_256_NOMAX-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2_MIN_256_NOMAX-NEXT:    ret
   %op1 = load <8 x i8>, ptr %a
@@ -338,22 +320,18 @@ define <8 x i8> @shuffle_op1_poison(ptr %a, ptr %b) {
 define <8 x i8> @negative_test_shuffle_index_size_op_both_maxhw(ptr %a, ptr %b) "target-features"="+sve2" vscale_range(16,16) {
 ; CHECK-LABEL: negative_test_shuffle_index_size_op_both_maxhw:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x1]
-; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    mov z2.b, z0.b[3]
-; CHECK-NEXT:    mov z3.b, z0.b[2]
-; CHECK-NEXT:    mov z4.b, z0.b[1]
-; CHECK-NEXT:    mov z1.b, z1.b[1]
-; CHECK-NEXT:    mov z5.b, z0.b[7]
-; CHECK-NEXT:    mov z6.b, z0.b[6]
-; CHECK-NEXT:    mov z0.b, z0.b[4]
-; CHECK-NEXT:    zip1 z2.b, z3.b, z2.b
-; CHECK-NEXT:    zip1 z1.b, z1.b, z4.b
-; CHECK-NEXT:    zip1 z3.b, z6.b, z5.b
-; CHECK-NEXT:    zip1 z0.b, z0.b, z0.b
-; CHECK-NEXT:    zip1 z1.h, z1.h, z2.h
-; CHECK-NEXT:    zip1 z0.h, z0.h, z3.h
-; CHECK-NEXT:    zip1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    mov z2.b, z1.b[4]
+; CHECK-NEXT:    mov z3.b, z1.b[1]
+; CHECK-NEXT:    mov z0.b, z0.b[1]
+; CHECK-NEXT:    mov z4.h, z1.h[3]
+; CHECK-NEXT:    mov z1.h, z1.h[1]
+; CHECK-NEXT:    zip1 z2.b, z2.b, z2.b
+; CHECK-NEXT:    zip1 z0.b, z0.b, z3.b
+; CHECK-NEXT:    zip1 z2.h, z2.h, z4.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    zip1 z0.s, z0.s, z2.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
   %op1 = load <8 x i8>, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
index 6e2ecfca9e963e..619840fc6afb28 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve2 -force-streaming-compatible  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
-; RUN: llc -mattr=+sme -force-streaming  < %s | FileCheck %s --check-prefixes=CHECK,SME
+; RUN: llc -mattr=+sve2 -force-streaming-compatible  < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: llc -mattr=+sme -force-streaming  < %s | FileCheck %s --check-prefixes=CHECK
 ; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -406,33 +406,13 @@ define void @concat_v8i64(ptr %a, ptr %b, ptr %c) {
 ;
 
 define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2)  {
-; SVE2-LABEL: concat_v4f16:
-; SVE2:       // %bb.0:
-; SVE2-NEXT:    cnth x8
-; SVE2-NEXT:    adrp x9, .LCPI15_0
-; SVE2-NEXT:    adrp x10, .LCPI15_1
-; SVE2-NEXT:    mov z2.h, w8
-; SVE2-NEXT:    ldr q3, [x9, :lo12:.LCPI15_0]
-; SVE2-NEXT:    ldr q4, [x10, :lo12:.LCPI15_1]
-; SVE2-NEXT:    ptrue p0.h, vl8
-; SVE2-NEXT:    // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1
-; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1
-; SVE2-NEXT:    mad z2.h, p0/m, z3.h, z4.h
-; SVE2-NEXT:    tbl z0.h, { z0.h, z1.h }, z2.h
-; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
-; SVE2-NEXT:    ret
-;
-; SME-LABEL: concat_v4f16:
-; SME:       // %bb.0:
-; SME-NEXT:    // kill: def $d1 killed $d1 def $z1
-; SME-NEXT:    // kill: def $d0 killed $d0 def $z0
-; SME-NEXT:    mov z2.h, z1.h[1]
-; SME-NEXT:    mov z3.h, z0.h[1]
-; SME-NEXT:    zip1 z1.h, z1.h, z2.h
-; SME-NEXT:    zip1 z0.h, z0.h, z3.h
-; SME-NEXT:    zip1 z0.s, z0.s, z1.s
-; SME-NEXT:    // kill: def $d0 killed $d0 killed $z0
-; SME-NEXT:    ret
+; CHECK-LABEL: concat_v4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: concat_v4f16:
 ; NONEON-NOSVE:       // %bb.0:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
index a33e8537edf4ee..1b083d80ef3e68 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
@@ -643,11 +643,12 @@ define void @test_revhv32i16(ptr %a) {
 define void @test_rev_elts_fail(ptr %a) {
 ; CHECK-LABEL: test_rev_elts_fail:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    index z0.d, #1, #-1
-; CHECK-NEXT:    ldp q1, q2, [x0]
-; CHECK-NEXT:    tbl z1.d, { z1.d }, z0.d
-; CHECK-NEXT:    tbl z0.d, { z2.d }, z0.d
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    mov z2.d, z0.d[1]
+; CHECK-NEXT:    mov z3.d, z1.d[1]
+; CHECK-NEXT:    zip1 z0.d, z2.d, z0.d
+; CHECK-NEXT:    zip1 z1.d, z3.d, z1.d
+; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: test_rev_elts_fail:

A Collaborator commented on the added line "// Try to widen the shuffle before generating a possibly expensive SVE TBL.":

Could there be any value for the patterns above to move this up a bit so that it's executed earlier in the function?

MacDue (Member, Author) replied:

I don't think it changes the result (as all the matching above is looking for single-instruction replacements). Also, I placed it here to be consistent with the non-SVE lowering, which attempts this just before generating the Neon TBL too.

MacDue merged commit cc721db into llvm:main Nov 22, 2024
6 of 8 checks passed
MacDue deleted the shuffle_vec branch November 22, 2024 10:15