[AArch64][Codegen] Improve small shufflevector/concat lowering for SME #116662
@llvm/pr-subscribers-backend-aarch64

Author: Benjamin Maxwell (MacDue)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/116662.diff

4 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9d1c3d4eddc880..c6b0f5876f4607 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -24720,6 +24720,80 @@ static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
Op0ExtV, Op1ExtV, Op->getOperand(2));
}
+static SDValue skipElementSizePreservingCast(SDValue Op, EVT VT) {
+ if (Op->getOpcode() == ISD::BITCAST)
+ Op = Op->getOperand(0);
+ EVT OpVT = Op.getValueType();
+ if (OpVT.isVector() && OpVT.getVectorElementType().getSizeInBits() ==
+ VT.getVectorElementType().getSizeInBits())
+ return Op;
+ return SDValue();
+}
+
+static SDValue performZIP1Combine(SDNode *N, SelectionDAG &DAG) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ EVT EltVT = VT.getVectorElementType();
+ SDValue Op0 = skipElementSizePreservingCast(N->getOperand(0), VT);
+ SDValue Op1 = skipElementSizePreservingCast(N->getOperand(1), VT);
+ if (Op0 && Op1 && Op0->getOpcode() == ISD::INSERT_VECTOR_ELT &&
+ Op1->getOpcode() == ISD::INSERT_VECTOR_ELT) {
+ SDValue Op00 = Op0->getOperand(0);
+ SDValue Op10 = Op1->getOperand(0);
+ if (Op00.isUndef() && Op10.isUndef() &&
+ Op0->getConstantOperandVal(2) == 0 &&
+ Op1->getConstantOperandVal(2) == 0) {
+ SDValue Op01 = Op0->getOperand(1);
+ SDValue Op11 = Op1->getOperand(1);
+ if (Op01->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Op11->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ SDValue Op010 = skipElementSizePreservingCast(Op01->getOperand(0), VT);
+ SDValue Op110 = skipElementSizePreservingCast(Op11->getOperand(0), VT);
+ unsigned StartExtractIdx = Op01->getConstantOperandVal(1);
+ if (Op010 && Op010 == Op110 &&
+ Op11->getConstantOperandVal(1) == StartExtractIdx + 1 &&
+ StartExtractIdx % 2 == 0) {
+ // t0: nxv16i8 = ...
+ // t1: i32 = extract_vector_elt t0, Constant:i64<n>
+ // t2: i32 = extract_vector_elt t0, Constant:i64<n + 1>
+ // t3: nxv16i8 = insert_vector_elt(undef, t1, 0)
+ // t4: nxv16i8 = insert_vector_elt(undef, t2, 0)
+ // t5: nxv16i8 = zip1(t3, t4)
+ //
+ // ->
+ // t0: nxv16i8 = ...
+ // t1: nxv8i16 = bitcast t0
+ // t2: i32 = extract_vector_elt t1, Constant:i64<n / 2>
+ // t3: nxv8i16 = insert_vector_elt(undef, t2, 0)
+ // t4: nxv16i8 = bitcast t3
+ //
+ // Where n % 2 == 0
+ SDValue Result;
+ if (StartExtractIdx == 0)
+ Result = Op010;
+ else if (EltVT.getSizeInBits() < 64) {
+ unsigned LargeEltBits = EltVT.getSizeInBits() * 2;
+ EVT LargeEltVT = MVT::getVectorVT(
+ MVT::getIntegerVT(LargeEltBits),
+ VT.getVectorElementCount().divideCoefficientBy(2));
+ EVT ExtractVT = MVT::getIntegerVT(std::max(LargeEltBits, 32U));
+ SDValue Extract =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT,
+ DAG.getBitcast(LargeEltVT, Op010),
+ DAG.getVectorIdxConstant(StartExtractIdx / 2, DL));
+ Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, LargeEltVT,
+ DAG.getUNDEF(LargeEltVT), Extract,
+ DAG.getVectorIdxConstant(0, DL));
+ }
+ if (Result)
+ return DAG.getBitcast(VT, Result);
+ }
+ }
+ }
+ }
+ return SDValue();
+}
+
static SDValue
performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
@@ -26161,6 +26235,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
break;
}
+ case AArch64ISD::ZIP1:
+ return performZIP1Combine(N, DAG);
case ISD::XOR:
return performXorCombine(N, DAG, DCI, Subtarget);
case ISD::MUL:
@@ -29030,7 +29106,14 @@ static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
if (!IsSingleOp && !Subtarget.hasSVE2())
return SDValue();
+ // Small vectors (with few extracts) can be lowered more efficiently as a
+ // sequence of ZIPs.
EVT VTOp1 = Op.getOperand(0).getValueType();
+ unsigned NumElts = VT.getVectorNumElements();
+ if (VT.isPow2VectorType() && VT.getFixedSizeInBits() <= 128 &&
+ (NumElts <= 2 || (NumElts <= 4 && !Op2.isUndef())))
+ return SDValue();
+
unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
unsigned IndexLen = MinSVESize / BitsPerElt;
unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
index 20659cde83ee00..45285f5f6b6938 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
@@ -140,64 +140,52 @@ define <8 x i8> @shuffle_index_indices_from_both_ops(ptr %a, ptr %b) {
;
; SVE2_128_NOMAX-LABEL: shuffle_index_indices_from_both_ops:
; SVE2_128_NOMAX: // %bb.0:
-; SVE2_128_NOMAX-NEXT: ldr d0, [x1]
-; SVE2_128_NOMAX-NEXT: ldr d1, [x0]
-; SVE2_128_NOMAX-NEXT: mov z2.b, z0.b[3]
-; SVE2_128_NOMAX-NEXT: mov z3.b, z0.b[2]
-; SVE2_128_NOMAX-NEXT: mov z4.b, z0.b[1]
-; SVE2_128_NOMAX-NEXT: mov z1.b, z1.b[1]
-; SVE2_128_NOMAX-NEXT: mov z5.b, z0.b[7]
-; SVE2_128_NOMAX-NEXT: mov z6.b, z0.b[6]
-; SVE2_128_NOMAX-NEXT: mov z0.b, z0.b[4]
-; SVE2_128_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b
-; SVE2_128_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b
-; SVE2_128_NOMAX-NEXT: zip1 z3.b, z6.b, z5.b
-; SVE2_128_NOMAX-NEXT: zip1 z0.b, z0.b, z0.b
-; SVE2_128_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h
-; SVE2_128_NOMAX-NEXT: zip1 z0.h, z0.h, z3.h
-; SVE2_128_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s
+; SVE2_128_NOMAX-NEXT: ldr d0, [x0]
+; SVE2_128_NOMAX-NEXT: ldr d1, [x1]
+; SVE2_128_NOMAX-NEXT: mov z2.b, z1.b[4]
+; SVE2_128_NOMAX-NEXT: mov z3.b, z1.b[1]
+; SVE2_128_NOMAX-NEXT: mov z0.b, z0.b[1]
+; SVE2_128_NOMAX-NEXT: mov z4.h, z1.h[3]
+; SVE2_128_NOMAX-NEXT: mov z1.h, z1.h[1]
+; SVE2_128_NOMAX-NEXT: zip1 z2.b, z2.b, z2.b
+; SVE2_128_NOMAX-NEXT: zip1 z0.b, z0.b, z3.b
+; SVE2_128_NOMAX-NEXT: zip1 z2.h, z2.h, z4.h
+; SVE2_128_NOMAX-NEXT: zip1 z0.h, z0.h, z1.h
+; SVE2_128_NOMAX-NEXT: zip1 z0.s, z0.s, z2.s
; SVE2_128_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0
; SVE2_128_NOMAX-NEXT: ret
;
; SVE2_NOMIN_NOMAX-LABEL: shuffle_index_indices_from_both_ops:
; SVE2_NOMIN_NOMAX: // %bb.0:
-; SVE2_NOMIN_NOMAX-NEXT: ldr d0, [x1]
-; SVE2_NOMIN_NOMAX-NEXT: ldr d1, [x0]
-; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z0.b[3]
-; SVE2_NOMIN_NOMAX-NEXT: mov z3.b, z0.b[2]
-; SVE2_NOMIN_NOMAX-NEXT: mov z4.b, z0.b[1]
-; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z1.b[1]
-; SVE2_NOMIN_NOMAX-NEXT: mov z5.b, z0.b[7]
-; SVE2_NOMIN_NOMAX-NEXT: mov z6.b, z0.b[6]
-; SVE2_NOMIN_NOMAX-NEXT: mov z0.b, z0.b[4]
-; SVE2_NOMIN_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b
-; SVE2_NOMIN_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b
-; SVE2_NOMIN_NOMAX-NEXT: zip1 z3.b, z6.b, z5.b
-; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.b, z0.b, z0.b
-; SVE2_NOMIN_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h
-; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.h, z0.h, z3.h
-; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s
+; SVE2_NOMIN_NOMAX-NEXT: ldr d0, [x0]
+; SVE2_NOMIN_NOMAX-NEXT: ldr d1, [x1]
+; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z1.b[4]
+; SVE2_NOMIN_NOMAX-NEXT: mov z3.b, z1.b[1]
+; SVE2_NOMIN_NOMAX-NEXT: mov z0.b, z0.b[1]
+; SVE2_NOMIN_NOMAX-NEXT: mov z4.h, z1.h[3]
+; SVE2_NOMIN_NOMAX-NEXT: mov z1.h, z1.h[1]
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z2.b, z2.b, z2.b
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.b, z0.b, z3.b
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z2.h, z2.h, z4.h
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.h, z0.h, z1.h
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.s, z0.s, z2.s
; SVE2_NOMIN_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0
; SVE2_NOMIN_NOMAX-NEXT: ret
;
; SVE2_MIN_256_NOMAX-LABEL: shuffle_index_indices_from_both_ops:
; SVE2_MIN_256_NOMAX: // %bb.0:
-; SVE2_MIN_256_NOMAX-NEXT: ldr d0, [x1]
-; SVE2_MIN_256_NOMAX-NEXT: ldr d1, [x0]
-; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[3]
-; SVE2_MIN_256_NOMAX-NEXT: mov z3.b, z0.b[2]
-; SVE2_MIN_256_NOMAX-NEXT: mov z4.b, z0.b[1]
-; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z1.b[1]
-; SVE2_MIN_256_NOMAX-NEXT: mov z5.b, z0.b[7]
-; SVE2_MIN_256_NOMAX-NEXT: mov z6.b, z0.b[6]
-; SVE2_MIN_256_NOMAX-NEXT: mov z0.b, z0.b[4]
-; SVE2_MIN_256_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b
-; SVE2_MIN_256_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b
-; SVE2_MIN_256_NOMAX-NEXT: zip1 z3.b, z6.b, z5.b
-; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.b, z0.b, z0.b
-; SVE2_MIN_256_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h
-; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.h, z0.h, z3.h
-; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s
+; SVE2_MIN_256_NOMAX-NEXT: ldr d0, [x0]
+; SVE2_MIN_256_NOMAX-NEXT: ldr d1, [x1]
+; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z1.b[4]
+; SVE2_MIN_256_NOMAX-NEXT: mov z3.b, z1.b[1]
+; SVE2_MIN_256_NOMAX-NEXT: mov z0.b, z0.b[1]
+; SVE2_MIN_256_NOMAX-NEXT: mov z4.h, z1.h[3]
+; SVE2_MIN_256_NOMAX-NEXT: mov z1.h, z1.h[1]
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z2.b, z2.b, z2.b
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.b, z0.b, z3.b
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z2.h, z2.h, z4.h
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.h, z0.h, z1.h
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.s, z0.s, z2.s
; SVE2_MIN_256_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0
; SVE2_MIN_256_NOMAX-NEXT: ret
%op1 = load <8 x i8>, ptr %a
@@ -230,58 +218,52 @@ define <8 x i8> @shuffle_index_poison_value(ptr %a, ptr %b) {
;
; SVE2_128_NOMAX-LABEL: shuffle_index_poison_value:
; SVE2_128_NOMAX: // %bb.0:
-; SVE2_128_NOMAX-NEXT: ldr d0, [x1]
-; SVE2_128_NOMAX-NEXT: ldr d1, [x0]
-; SVE2_128_NOMAX-NEXT: mov z2.b, z0.b[3]
-; SVE2_128_NOMAX-NEXT: mov z3.b, z0.b[2]
-; SVE2_128_NOMAX-NEXT: mov z4.b, z0.b[1]
-; SVE2_128_NOMAX-NEXT: mov z1.b, z1.b[1]
-; SVE2_128_NOMAX-NEXT: mov z5.b, z0.b[4]
-; SVE2_128_NOMAX-NEXT: mov z0.b, z0.b[6]
-; SVE2_128_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b
-; SVE2_128_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b
-; SVE2_128_NOMAX-NEXT: zip1 z3.b, z5.b, z5.b
-; SVE2_128_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h
-; SVE2_128_NOMAX-NEXT: zip1 z0.h, z3.h, z0.h
-; SVE2_128_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s
+; SVE2_128_NOMAX-NEXT: ldr d0, [x0]
+; SVE2_128_NOMAX-NEXT: ldr d1, [x1]
+; SVE2_128_NOMAX-NEXT: mov z2.b, z1.b[4]
+; SVE2_128_NOMAX-NEXT: mov z3.b, z1.b[1]
+; SVE2_128_NOMAX-NEXT: mov z0.b, z0.b[1]
+; SVE2_128_NOMAX-NEXT: mov z4.b, z1.b[6]
+; SVE2_128_NOMAX-NEXT: mov z1.h, z1.h[1]
+; SVE2_128_NOMAX-NEXT: zip1 z2.b, z2.b, z2.b
+; SVE2_128_NOMAX-NEXT: zip1 z0.b, z0.b, z3.b
+; SVE2_128_NOMAX-NEXT: zip1 z2.h, z2.h, z4.h
+; SVE2_128_NOMAX-NEXT: zip1 z0.h, z0.h, z1.h
+; SVE2_128_NOMAX-NEXT: zip1 z0.s, z0.s, z2.s
; SVE2_128_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0
; SVE2_128_NOMAX-NEXT: ret
;
; SVE2_NOMIN_NOMAX-LABEL: shuffle_index_poison_value:
; SVE2_NOMIN_NOMAX: // %bb.0:
-; SVE2_NOMIN_NOMAX-NEXT: ldr d0, [x1]
-; SVE2_NOMIN_NOMAX-NEXT: ldr d1, [x0]
-; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z0.b[3]
-; SVE2_NOMIN_NOMAX-NEXT: mov z3.b, z0.b[2]
-; SVE2_NOMIN_NOMAX-NEXT: mov z4.b, z0.b[1]
-; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z1.b[1]
-; SVE2_NOMIN_NOMAX-NEXT: mov z5.b, z0.b[4]
-; SVE2_NOMIN_NOMAX-NEXT: mov z0.b, z0.b[6]
-; SVE2_NOMIN_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b
-; SVE2_NOMIN_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b
-; SVE2_NOMIN_NOMAX-NEXT: zip1 z3.b, z5.b, z5.b
-; SVE2_NOMIN_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h
-; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.h, z3.h, z0.h
-; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s
+; SVE2_NOMIN_NOMAX-NEXT: ldr d0, [x0]
+; SVE2_NOMIN_NOMAX-NEXT: ldr d1, [x1]
+; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z1.b[4]
+; SVE2_NOMIN_NOMAX-NEXT: mov z3.b, z1.b[1]
+; SVE2_NOMIN_NOMAX-NEXT: mov z0.b, z0.b[1]
+; SVE2_NOMIN_NOMAX-NEXT: mov z4.b, z1.b[6]
+; SVE2_NOMIN_NOMAX-NEXT: mov z1.h, z1.h[1]
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z2.b, z2.b, z2.b
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.b, z0.b, z3.b
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z2.h, z2.h, z4.h
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.h, z0.h, z1.h
+; SVE2_NOMIN_NOMAX-NEXT: zip1 z0.s, z0.s, z2.s
; SVE2_NOMIN_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0
; SVE2_NOMIN_NOMAX-NEXT: ret
;
; SVE2_MIN_256_NOMAX-LABEL: shuffle_index_poison_value:
; SVE2_MIN_256_NOMAX: // %bb.0:
-; SVE2_MIN_256_NOMAX-NEXT: ldr d0, [x1]
-; SVE2_MIN_256_NOMAX-NEXT: ldr d1, [x0]
-; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[3]
-; SVE2_MIN_256_NOMAX-NEXT: mov z3.b, z0.b[2]
-; SVE2_MIN_256_NOMAX-NEXT: mov z4.b, z0.b[1]
-; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z1.b[1]
-; SVE2_MIN_256_NOMAX-NEXT: mov z5.b, z0.b[4]
-; SVE2_MIN_256_NOMAX-NEXT: mov z0.b, z0.b[6]
-; SVE2_MIN_256_NOMAX-NEXT: zip1 z2.b, z3.b, z2.b
-; SVE2_MIN_256_NOMAX-NEXT: zip1 z1.b, z1.b, z4.b
-; SVE2_MIN_256_NOMAX-NEXT: zip1 z3.b, z5.b, z5.b
-; SVE2_MIN_256_NOMAX-NEXT: zip1 z1.h, z1.h, z2.h
-; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.h, z3.h, z0.h
-; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.s, z1.s, z0.s
+; SVE2_MIN_256_NOMAX-NEXT: ldr d0, [x0]
+; SVE2_MIN_256_NOMAX-NEXT: ldr d1, [x1]
+; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z1.b[4]
+; SVE2_MIN_256_NOMAX-NEXT: mov z3.b, z1.b[1]
+; SVE2_MIN_256_NOMAX-NEXT: mov z0.b, z0.b[1]
+; SVE2_MIN_256_NOMAX-NEXT: mov z4.b, z1.b[6]
+; SVE2_MIN_256_NOMAX-NEXT: mov z1.h, z1.h[1]
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z2.b, z2.b, z2.b
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.b, z0.b, z3.b
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z2.h, z2.h, z4.h
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.h, z0.h, z1.h
+; SVE2_MIN_256_NOMAX-NEXT: zip1 z0.s, z0.s, z2.s
; SVE2_MIN_256_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0
; SVE2_MIN_256_NOMAX-NEXT: ret
%op1 = load <8 x i8>, ptr %a
@@ -338,22 +320,18 @@ define <8 x i8> @shuffle_op1_poison(ptr %a, ptr %b) {
define <8 x i8> @negative_test_shuffle_index_size_op_both_maxhw(ptr %a, ptr %b) "target-features"="+sve2" vscale_range(16,16) {
; CHECK-LABEL: negative_test_shuffle_index_size_op_both_maxhw:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: mov z2.b, z0.b[3]
-; CHECK-NEXT: mov z3.b, z0.b[2]
-; CHECK-NEXT: mov z4.b, z0.b[1]
-; CHECK-NEXT: mov z1.b, z1.b[1]
-; CHECK-NEXT: mov z5.b, z0.b[7]
-; CHECK-NEXT: mov z6.b, z0.b[6]
-; CHECK-NEXT: mov z0.b, z0.b[4]
-; CHECK-NEXT: zip1 z2.b, z3.b, z2.b
-; CHECK-NEXT: zip1 z1.b, z1.b, z4.b
-; CHECK-NEXT: zip1 z3.b, z6.b, z5.b
-; CHECK-NEXT: zip1 z0.b, z0.b, z0.b
-; CHECK-NEXT: zip1 z1.h, z1.h, z2.h
-; CHECK-NEXT: zip1 z0.h, z0.h, z3.h
-; CHECK-NEXT: zip1 z0.s, z1.s, z0.s
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: mov z2.b, z1.b[4]
+; CHECK-NEXT: mov z3.b, z1.b[1]
+; CHECK-NEXT: mov z0.b, z0.b[1]
+; CHECK-NEXT: mov z4.h, z1.h[3]
+; CHECK-NEXT: mov z1.h, z1.h[1]
+; CHECK-NEXT: zip1 z2.b, z2.b, z2.b
+; CHECK-NEXT: zip1 z0.b, z0.b, z3.b
+; CHECK-NEXT: zip1 z2.h, z2.h, z4.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
+; CHECK-NEXT: zip1 z0.s, z0.s, z2.s
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%op1 = load <8 x i8>, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
index 6e2ecfca9e963e..619840fc6afb28 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2
-; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,SME
+; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK
; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
target triple = "aarch64-unknown-linux-gnu"
@@ -406,33 +406,13 @@ define void @concat_v8i64(ptr %a, ptr %b, ptr %c) {
;
define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2) {
-; SVE2-LABEL: concat_v4f16:
-; SVE2: // %bb.0:
-; SVE2-NEXT: cnth x8
-; SVE2-NEXT: adrp x9, .LCPI15_0
-; SVE2-NEXT: adrp x10, .LCPI15_1
-; SVE2-NEXT: mov z2.h, w8
-; SVE2-NEXT: ldr q3, [x9, :lo12:.LCPI15_0]
-; SVE2-NEXT: ldr q4, [x10, :lo12:.LCPI15_1]
-; SVE2-NEXT: ptrue p0.h, vl8
-; SVE2-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1
-; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1
-; SVE2-NEXT: mad z2.h, p0/m, z3.h, z4.h
-; SVE2-NEXT: tbl z0.h, { z0.h, z1.h }, z2.h
-; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0
-; SVE2-NEXT: ret
-;
-; SME-LABEL: concat_v4f16:
-; SME: // %bb.0:
-; SME-NEXT: // kill: def $d1 killed $d1 def $z1
-; SME-NEXT: // kill: def $d0 killed $d0 def $z0
-; SME-NEXT: mov z2.h, z1.h[1]
-; SME-NEXT: mov z3.h, z0.h[1]
-; SME-NEXT: zip1 z1.h, z1.h, z2.h
-; SME-NEXT: zip1 z0.h, z0.h, z3.h
-; SME-NEXT: zip1 z0.s, z0.s, z1.s
-; SME-NEXT: // kill: def $d0 killed $d0 killed $z0
-; SME-NEXT: ret
+; CHECK-LABEL: concat_v4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: concat_v4f16:
; NONEON-NOSVE: // %bb.0:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
index a33e8537edf4ee..1b083d80ef3e68 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
@@ -643,11 +643,12 @@ define void @test_revhv32i16(ptr %a) {
define void @test_rev_elts_fail(ptr %a) {
; CHECK-LABEL: test_rev_elts_fail:
; CHECK: // %bb.0:
-; CHECK-NEXT: index z0.d, #1, #-1
-; CHECK-NEXT: ldp q1, q2, [x0]
-; CHECK-NEXT: tbl z1.d, { z1.d }, z0.d
-; CHECK-NEXT: tbl z0.d, { z2.d }, z0.d
-; CHECK-NEXT: stp q1, q0, [x0]
+; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: mov z2.d, z0.d[1]
+; CHECK-NEXT: mov z3.d, z1.d[1]
+; CHECK-NEXT: zip1 z0.d, z2.d, z0.d
+; CHECK-NEXT: zip1 z1.d, z3.d, z1.d
+; CHECK-NEXT: stp q0, q1, [x0]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: test_rev_elts_fail:
* Avoid using TBL for small vectors (which can instead be lowered with a couple of ZIP1s) — see the example below
* Fold redundant ZIP1s
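For illustration, here is a sketch of the kind of IR this helps, modeled on the concat_v4f16 test updated in this patch (the IR body is an assumption; the diff only shows the check lines). With this patch, streaming-mode codegen emits a single `zip1 z0.s, z0.s, z1.s` for it rather than materializing an index vector and using TBL:

```llvm
; Sketch (assumed IR body) of a small concat shuffle: with this patch,
; streaming-mode codegen lowers it to a single "zip1 z0.s, z0.s, z1.s"
; instead of building a TBL index vector from the constant pool.
define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2) {
  %res = shufflevector <2 x half> %op1, <2 x half> %op2,
                       <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x half> %res
}
```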
Review thread on the line `// Try to widen the shuffle before generating a possibly expensive SVE TBL.`:
Could there be any value, for the patterns above, in moving this up a bit so that it's executed earlier in the function?
MacDue (author): I don't think it changes the result (all the matching above looks for single-instruction replacements). Also, I placed it here to be consistent with the non-SVE lowering, which attempts this just before generating the Neon TBL too.
This now tries to widen the shuffle before generating a possibly expensive SVE TBL, which may allow the shuffle to be matched as something cheaper, such as a ZIP1.
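As a hypothetical illustration (this function is not one of the tests in the patch), a byte shuffle whose mask keeps adjacent pairs together can be widened to a halfword shuffle, at which point it is a plain interleave and can be matched as a single ZIP1:

```llvm
; Hypothetical example: the byte mask <0,1,8,9,2,3,10,11> keeps adjacent
; pairs together, so the shuffle can be widened to <4 x i16> elements,
; where the mask becomes <0,4,1,5> -- a ZIP1 -- avoiding an SVE TBL and
; its materialized index vector.
define <8 x i8> @widen_then_zip(<8 x i8> %a, <8 x i8> %b) {
  %res = shufflevector <8 x i8> %a, <8 x i8> %b,
         <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 2, i32 3, i32 10, i32 11>
  ret <8 x i8> %res
}
```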