
Commit 8ab4b88

[AArch64] Improve lowering for scalable masked interleaving stores
Similar to #154338, this PR aims to support lowering of certain IR to SVE's st2 and st4 instructions. The typical IR scenario looks like:

  %mask = .. @llvm.vector.interleave2(<vscale x 16 x i1> %m, <vscale x 16 x i1> %m)
  %val = .. @llvm.vector.interleave2(<vscale x 16 x i8> %v1, <vscale x 16 x i8> %v2)
  .. @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> %val, ..., <vscale x 32 x i1> %mask)

where we're interleaving both the value and the mask being passed to the wide store. When the mask interleave parts are identical we can lower this to st2b. This PR adds a DAG combine for lowering this kind of IR pattern to st2X and st4X SVE instructions.
1 parent e4f346c commit 8ab4b88
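
For context, the instruction targeted here is the same structured store that the ACLE exposes as svst2. Below is a minimal C sketch, illustrative only and not part of this commit, assuming an SVE-enabled toolchain:

  // Illustrative only: an interleaving store of two vectors under one
  // predicate. With SVE enabled this compiles to a single st2b, which is
  // what the new DAG combine now emits for the IR pattern above.
  #include <arm_sve.h>

  void store_interleaved(svbool_t pg, int8_t *base, svint8_t v1, svint8_t v2) {
    svst2_s8(pg, base, svcreate2_s8(v1, v2));
  }

The combine effectively recognizes the IR equivalent of this pattern and emits one st2b/st4b instead of separate zip and st1b sequences.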

File tree: 3 files changed, +156 −142 lines


llvm/include/llvm/CodeGen/SelectionDAGNodes.h

Lines changed: 8 additions & 0 deletions
@@ -3346,6 +3346,14 @@ namespace ISD {
          Ld->getAddressingMode() == ISD::UNINDEXED;
   }
 
+  /// Returns true if the specified node is a non-extending and unindexed
+  /// masked store.
+  inline bool isNormalMaskedStore(const SDNode *N) {
+    auto *St = dyn_cast<MaskedStoreSDNode>(N);
+    return St && !St->isTruncatingStore() &&
+           St->getAddressingMode() == ISD::UNINDEXED;
+  }
+
   /// Attempt to match a unary predicate against a scalar/splat constant or
   /// every element of a constant BUILD_VECTOR.
   /// If AllowUndef is true, then UNDEF elements will pass nullptr to Match.
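
The new ISD::isNormalMaskedStore predicate mirrors the existing isNormalLoad/isNormalStore/isNormalMaskedLoad helpers. A minimal usage sketch follows, with a hypothetical combine function that is not part of this commit:

  // Hypothetical sketch (assumes the usual SelectionDAG headers and the llvm
  // namespace): bail out early unless N is a plain, non-truncating, unindexed
  // masked store, then inspect its operands.
  static SDValue combineMaskedStoreExample(SDNode *N, SelectionDAG &DAG) {
    if (!ISD::isNormalMaskedStore(N))
      return SDValue();
    auto *MST = cast<MaskedStoreSDNode>(N);
    // MST->getValue(), MST->getMask() and MST->getBasePtr() can now be
    // treated as a simple predicated store of a full-width value.
    (void)MST;
    (void)DAG;
    return SDValue();
  }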

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 107 additions & 36 deletions
@@ -24632,6 +24632,106 @@ static SDValue performSTORECombine(SDNode *N,
   return SDValue();
 }
 
+static bool
+isSequentialConcatOfVectorInterleave(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
+  if (N->getOpcode() != ISD::CONCAT_VECTORS)
+    return false;
+
+  unsigned NumParts = N->getNumOperands();
+
+  // We should be concatenating each sequential result from a
+  // VECTOR_INTERLEAVE.
+  SDNode *InterleaveOp = N->getOperand(0).getNode();
+  if (InterleaveOp->getOpcode() != ISD::VECTOR_INTERLEAVE ||
+      InterleaveOp->getNumOperands() != NumParts)
+    return false;
+
+  for (unsigned I = 0; I < NumParts; I++) {
+    if (N->getOperand(I) != SDValue(InterleaveOp, I))
+      return false;
+  }
+
+  Ops.append(InterleaveOp->op_begin(), InterleaveOp->op_end());
+  return true;
+}
+
+static SDValue getNarrowMaskForInterleavedOps(SelectionDAG &DAG, SDLoc &DL,
+                                              SDValue WideMask,
+                                              unsigned RequiredNumParts) {
+  if (WideMask->getOpcode() == ISD::CONCAT_VECTORS) {
+    SmallVector<SDValue, 4> MaskInterleaveOps;
+    if (!isSequentialConcatOfVectorInterleave(WideMask.getNode(),
+                                              MaskInterleaveOps))
+      return SDValue();
+
+    if (MaskInterleaveOps.size() != RequiredNumParts)
+      return SDValue();
+
+    // Make sure the inputs to the vector interleave are identical.
+    if (!llvm::all_equal(MaskInterleaveOps))
+      return SDValue();
+
+    return MaskInterleaveOps[0];
+  } else if (WideMask->getOpcode() == ISD::SPLAT_VECTOR) {
+    ElementCount EC = WideMask.getValueType().getVectorElementCount();
+    assert(EC.isKnownMultipleOf(RequiredNumParts) &&
+           "Expected element count divisible by number of parts");
+    EC = EC.divideCoefficientBy(RequiredNumParts);
+    return DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::getVectorVT(MVT::i1, EC),
+                       WideMask->getOperand(0));
+  }
+  return SDValue();
+}
+
+static SDValue
+performStoreInterleaveCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+                              SelectionDAG &DAG) {
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
+  SDValue WideValue = MST->getValue();
+
+  // Bail out if the stored value has an unexpected number of uses, since we'll
+  // have to perform manual interleaving and may as well just use normal masked
+  // stores. Also, discard masked stores that are truncating or indexed.
+  if (!WideValue.hasOneUse() || !ISD::isNormalMaskedStore(MST) ||
+      !MST->getOffset().isUndef())
+    return SDValue();
+
+  SmallVector<SDValue, 4> ValueInterleaveOps;
+  if (!isSequentialConcatOfVectorInterleave(WideValue.getNode(),
+                                            ValueInterleaveOps))
+    return SDValue();
+
+  unsigned NumParts = ValueInterleaveOps.size();
+  if (NumParts != 2 && NumParts != 4)
+    return SDValue();
+
+  // At the moment we're unlikely to see a fixed-width vector deinterleave as
+  // we usually generate shuffles instead.
+  EVT SubVecTy = ValueInterleaveOps[0].getValueType();
+  if (!SubVecTy.isScalableVT() ||
+      SubVecTy.getSizeInBits().getKnownMinValue() != 128 ||
+      !DAG.getTargetLoweringInfo().isTypeLegal(SubVecTy))
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue NarrowMask =
+      getNarrowMaskForInterleavedOps(DAG, DL, MST->getMask(), NumParts);
+  if (!NarrowMask)
+    return SDValue();
+
+  const Intrinsic::ID IID =
+      NumParts == 2 ? Intrinsic::aarch64_sve_st2 : Intrinsic::aarch64_sve_st4;
+  SDValue Res;
+  SmallVector<SDValue, 8> NewStOps;
+  NewStOps.append({MST->getChain(), DAG.getConstant(IID, DL, MVT::i32)});
+  NewStOps.append(ValueInterleaveOps);
+  NewStOps.append({NarrowMask, MST->getBasePtr()});
+  return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, NewStOps);
+}
+
 static SDValue performMSTORECombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     SelectionDAG &DAG,
@@ -24641,6 +24741,9 @@ static SDValue performMSTORECombine(SDNode *N,
   SDValue Mask = MST->getMask();
   SDLoc DL(N);
 
+  if (SDValue Res = performStoreInterleaveCombine(N, DCI, DAG))
+    return Res;
+
   // If this is a UZP1 followed by a masked store, fold this into a masked
   // truncating store. We can do this even if this is already a masked
   // truncstore.
@@ -27274,43 +27377,11 @@ static SDValue performVectorDeinterleaveCombine(
     return SDValue();
 
   // Now prove that the mask is an interleave of identical masks.
-  SDValue Mask = MaskedLoad->getMask();
-  if (Mask->getOpcode() != ISD::SPLAT_VECTOR &&
-      Mask->getOpcode() != ISD::CONCAT_VECTORS)
-    return SDValue();
-
-  SDValue NarrowMask;
   SDLoc DL(N);
-  if (Mask->getOpcode() == ISD::CONCAT_VECTORS) {
-    if (Mask->getNumOperands() != NumParts)
-      return SDValue();
-
-    // We should be concatenating each sequential result from a
-    // VECTOR_INTERLEAVE.
-    SDNode *InterleaveOp = Mask->getOperand(0).getNode();
-    if (InterleaveOp->getOpcode() != ISD::VECTOR_INTERLEAVE ||
-        InterleaveOp->getNumOperands() != NumParts)
-      return SDValue();
-
-    for (unsigned I = 0; I < NumParts; I++) {
-      if (Mask.getOperand(I) != SDValue(InterleaveOp, I))
-        return SDValue();
-    }
-
-    // Make sure the inputs to the vector interleave are identical.
-    if (!llvm::all_equal(InterleaveOp->op_values()))
-      return SDValue();
-
-    NarrowMask = InterleaveOp->getOperand(0);
-  } else { // ISD::SPLAT_VECTOR
-    ElementCount EC = Mask.getValueType().getVectorElementCount();
-    assert(EC.isKnownMultipleOf(NumParts) &&
-           "Expected element count divisible by number of parts");
-    EC = EC.divideCoefficientBy(NumParts);
-    NarrowMask =
-        DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::getVectorVT(MVT::i1, EC),
-                    Mask->getOperand(0));
-  }
+  SDValue NarrowMask =
+      getNarrowMaskForInterleavedOps(DAG, DL, MaskedLoad->getMask(), NumParts);
+  if (!NarrowMask)
+    return SDValue();
 
   const Intrinsic::ID IID = NumParts == 2 ? Intrinsic::aarch64_sve_ld2_sret
                                           : Intrinsic::aarch64_sve_ld4_sret;
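
As a worked example of the splat branch in getNarrowMaskForInterleavedOps, using values that mirror the foo_st2_nxv16i8 test below (a standalone sketch, not code from this commit):

  // Sketch: for a wide splat mask of <vscale x 32 x i1> feeding a 2-part
  // interleave, the narrow predicate type is <vscale x 16 x i1>.
  #include "llvm/CodeGen/ValueTypes.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/Support/TypeSize.h"

  static llvm::EVT narrowSt2MaskVT(llvm::LLVMContext &Ctx) {
    llvm::ElementCount EC = llvm::ElementCount::getScalable(32); // nxv32i1 mask
    EC = EC.divideCoefficientBy(2);                              // two st2 parts
    return llvm::EVT::getVectorVT(Ctx, llvm::MVT::i1, EC);       // nxv16i1
  }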

llvm/test/CodeGen/AArch64/scalable_masked_interleaved_stores.ll

Lines changed: 41 additions & 106 deletions
@@ -4,12 +4,9 @@
 define void @foo_st2_nxv16i8(<vscale x 16 x i1> %mask, <vscale x 16 x i8> %val1, <vscale x 16 x i8> %val2, ptr %p) {
 ; CHECK-LABEL: foo_st2_nxv16i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: zip2 z2.b, z0.b, z1.b
-; CHECK-NEXT: zip1 z0.b, z0.b, z1.b
-; CHECK-NEXT: zip2 p1.b, p0.b, p0.b
-; CHECK-NEXT: zip1 p0.b, p0.b, p0.b
-; CHECK-NEXT: st1b { z2.b }, p1, [x0, #1, mul vl]
-; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: st2b { z0.b, z1.b }, p0, [x0]
 ; CHECK-NEXT: ret
   %interleaved.mask = tail call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask)
   %interleaved.value = tail call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %val1, <vscale x 16 x i8> %val2)
@@ -20,12 +17,9 @@ define void @foo_st2_nxv16i8(<vscale x 16 x i1> %mask, <vscale x 16 x i8> %val1,
 define void @foo_st2_nxv8i16(<vscale x 8 x i1> %mask, <vscale x 8 x i16> %val1, <vscale x 8 x i16> %val2, ptr %p) {
 ; CHECK-LABEL: foo_st2_nxv8i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: zip2 z2.h, z0.h, z1.h
-; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
-; CHECK-NEXT: zip2 p1.h, p0.h, p0.h
-; CHECK-NEXT: zip1 p0.h, p0.h, p0.h
-; CHECK-NEXT: st1h { z2.h }, p1, [x0, #1, mul vl]
-; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: st2h { z0.h, z1.h }, p0, [x0]
 ; CHECK-NEXT: ret
   %interleaved.mask = tail call <vscale x 16 x i1> @llvm.vector.interleave2.nxv16i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask)
   %interleaved.value = tail call <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16> %val1, <vscale x 8 x i16> %val2)
@@ -36,12 +30,9 @@ define void @foo_st2_nxv8i16(<vscale x 8 x i1> %mask, <vscale x 8 x i16> %val1,
 define void @foo_st2_nxv4i32(<vscale x 4 x i1> %mask, <vscale x 4 x i32> %val1, <vscale x 4 x i32> %val2, ptr %p) {
 ; CHECK-LABEL: foo_st2_nxv4i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: zip2 z2.s, z0.s, z1.s
-; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
-; CHECK-NEXT: zip2 p1.s, p0.s, p0.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p0.s
-; CHECK-NEXT: st1w { z2.s }, p1, [x0, #1, mul vl]
-; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: st2w { z0.s, z1.s }, p0, [x0]
 ; CHECK-NEXT: ret
   %interleaved.mask = tail call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask)
   %interleaved.value = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %val1, <vscale x 4 x i32> %val2)
@@ -52,12 +43,9 @@ define void @foo_st2_nxv4i32(<vscale x 4 x i1> %mask, <vscale x 4 x i32> %val1,
 define void @foo_st2_nxv2i64(<vscale x 2 x i1> %mask, <vscale x 2 x i64> %val1, <vscale x 2 x i64> %val2, ptr %p) {
 ; CHECK-LABEL: foo_st2_nxv2i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: zip2 z2.d, z0.d, z1.d
-; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
-; CHECK-NEXT: zip2 p1.d, p0.d, p0.d
-; CHECK-NEXT: zip1 p0.d, p0.d, p0.d
-; CHECK-NEXT: st1d { z2.d }, p1, [x0, #1, mul vl]
-; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: st2d { z0.d, z1.d }, p0, [x0]
 ; CHECK-NEXT: ret
   %interleaved.mask = tail call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask)
   %interleaved.value = tail call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> %val1, <vscale x 2 x i64> %val2)
@@ -68,24 +56,11 @@ define void @foo_st2_nxv2i64(<vscale x 2 x i1> %mask, <vscale x 2 x i64> %val1,
 define void @foo_st4_nxv16i8(<vscale x 16 x i1> %mask, <vscale x 16 x i8> %val1, <vscale x 16 x i8> %val2, <vscale x 16 x i8> %val3, <vscale x 16 x i8> %val4, ptr %p) {
 ; CHECK-LABEL: foo_st4_nxv16i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: zip2 z4.b, z1.b, z3.b
-; CHECK-NEXT: zip2 z5.b, z0.b, z2.b
-; CHECK-NEXT: zip1 z1.b, z1.b, z3.b
-; CHECK-NEXT: zip1 z0.b, z0.b, z2.b
-; CHECK-NEXT: zip2 p1.b, p0.b, p0.b
-; CHECK-NEXT: zip1 p0.b, p0.b, p0.b
-; CHECK-NEXT: zip2 z2.b, z5.b, z4.b
-; CHECK-NEXT: zip1 z3.b, z5.b, z4.b
-; CHECK-NEXT: zip2 p2.b, p1.b, p1.b
-; CHECK-NEXT: zip2 z4.b, z0.b, z1.b
-; CHECK-NEXT: zip1 z0.b, z0.b, z1.b
-; CHECK-NEXT: zip1 p1.b, p1.b, p1.b
-; CHECK-NEXT: zip2 p3.b, p0.b, p0.b
-; CHECK-NEXT: zip1 p0.b, p0.b, p0.b
-; CHECK-NEXT: st1b { z2.b }, p2, [x0, #3, mul vl]
-; CHECK-NEXT: st1b { z3.b }, p1, [x0, #2, mul vl]
-; CHECK-NEXT: st1b { z4.b }, p3, [x0, #1, mul vl]
-; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: st4b { z0.b - z3.b }, p0, [x0]
 ; CHECK-NEXT: ret
   %interleaved.mask = tail call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask)
   %interleaved.value = tail call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> %val1, <vscale x 16 x i8> %val2, <vscale x 16 x i8> %val3, <vscale x 16 x i8> %val4)
@@ -96,24 +71,11 @@ define void @foo_st4_nxv16i8(<vscale x 16 x i1> %mask, <vscale x 16 x i8> %val1,
 define void @foo_st4_nxv8i16(<vscale x 8 x i1> %mask, <vscale x 8 x i16> %val1, <vscale x 8 x i16> %val2, <vscale x 8 x i16> %val3, <vscale x 8 x i16> %val4, ptr %p) {
 ; CHECK-LABEL: foo_st4_nxv8i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: zip2 z4.h, z1.h, z3.h
-; CHECK-NEXT: zip2 z5.h, z0.h, z2.h
-; CHECK-NEXT: zip1 z1.h, z1.h, z3.h
-; CHECK-NEXT: zip1 z0.h, z0.h, z2.h
-; CHECK-NEXT: zip2 p1.h, p0.h, p0.h
-; CHECK-NEXT: zip1 p0.h, p0.h, p0.h
-; CHECK-NEXT: zip2 z2.h, z5.h, z4.h
-; CHECK-NEXT: zip1 z3.h, z5.h, z4.h
-; CHECK-NEXT: zip2 p2.h, p1.h, p1.h
-; CHECK-NEXT: zip2 z4.h, z0.h, z1.h
-; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
-; CHECK-NEXT: zip1 p1.h, p1.h, p1.h
-; CHECK-NEXT: zip2 p3.h, p0.h, p0.h
-; CHECK-NEXT: zip1 p0.h, p0.h, p0.h
-; CHECK-NEXT: st1h { z2.h }, p2, [x0, #3, mul vl]
-; CHECK-NEXT: st1h { z3.h }, p1, [x0, #2, mul vl]
-; CHECK-NEXT: st1h { z4.h }, p3, [x0, #1, mul vl]
-; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: st4h { z0.h - z3.h }, p0, [x0]
 ; CHECK-NEXT: ret
   %interleaved.mask = tail call <vscale x 32 x i1> @llvm.vector.interleave4.nxv32i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask)
   %interleaved.value = tail call <vscale x 32 x i16> @llvm.vector.interleave4.nxv32i16(<vscale x 8 x i16> %val1, <vscale x 8 x i16> %val2, <vscale x 8 x i16> %val3, <vscale x 8 x i16> %val4)
@@ -124,24 +86,11 @@ define void @foo_st4_nxv8i16(<vscale x 8 x i1> %mask, <vscale x 8 x i16> %val1,
 define void @foo_st4_nxv4i32(<vscale x 4 x i1> %mask, <vscale x 4 x i32> %val1, <vscale x 4 x i32> %val2, <vscale x 4 x i32> %val3, <vscale x 4 x i32> %val4, ptr %p) {
 ; CHECK-LABEL: foo_st4_nxv4i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: zip2 z4.s, z1.s, z3.s
-; CHECK-NEXT: zip2 z5.s, z0.s, z2.s
-; CHECK-NEXT: zip1 z1.s, z1.s, z3.s
-; CHECK-NEXT: zip1 z0.s, z0.s, z2.s
-; CHECK-NEXT: zip2 p1.s, p0.s, p0.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p0.s
-; CHECK-NEXT: zip2 z2.s, z5.s, z4.s
-; CHECK-NEXT: zip1 z3.s, z5.s, z4.s
-; CHECK-NEXT: zip2 p2.s, p1.s, p1.s
-; CHECK-NEXT: zip2 z4.s, z0.s, z1.s
-; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
-; CHECK-NEXT: zip1 p1.s, p1.s, p1.s
-; CHECK-NEXT: zip2 p3.s, p0.s, p0.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p0.s
-; CHECK-NEXT: st1w { z2.s }, p2, [x0, #3, mul vl]
-; CHECK-NEXT: st1w { z3.s }, p1, [x0, #2, mul vl]
-; CHECK-NEXT: st1w { z4.s }, p3, [x0, #1, mul vl]
-; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: st4w { z0.s - z3.s }, p0, [x0]
 ; CHECK-NEXT: ret
   %interleaved.mask = tail call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask)
   %interleaved.value = tail call <vscale x 16 x i32> @llvm.vector.interleave4.nxv16i32(<vscale x 4 x i32> %val1, <vscale x 4 x i32> %val2, <vscale x 4 x i32> %val3, <vscale x 4 x i32> %val4)
@@ -152,24 +101,11 @@ define void @foo_st4_nxv4i32(<vscale x 4 x i1> %mask, <vscale x 4 x i32> %val1,
 define void @foo_st4_nxv2i64(<vscale x 2 x i1> %mask, <vscale x 2 x i64> %val1, <vscale x 2 x i64> %val2, <vscale x 2 x i64> %val3, <vscale x 2 x i64> %val4, ptr %p) {
 ; CHECK-LABEL: foo_st4_nxv2i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: zip2 z4.d, z1.d, z3.d
-; CHECK-NEXT: zip2 z5.d, z0.d, z2.d
-; CHECK-NEXT: zip1 z1.d, z1.d, z3.d
-; CHECK-NEXT: zip1 z0.d, z0.d, z2.d
-; CHECK-NEXT: zip2 p1.d, p0.d, p0.d
-; CHECK-NEXT: zip1 p0.d, p0.d, p0.d
-; CHECK-NEXT: zip2 z2.d, z5.d, z4.d
-; CHECK-NEXT: zip1 z3.d, z5.d, z4.d
-; CHECK-NEXT: zip2 p2.d, p1.d, p1.d
-; CHECK-NEXT: zip2 z4.d, z0.d, z1.d
-; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
-; CHECK-NEXT: zip1 p1.d, p1.d, p1.d
-; CHECK-NEXT: zip2 p3.d, p0.d, p0.d
-; CHECK-NEXT: zip1 p0.d, p0.d, p0.d
-; CHECK-NEXT: st1d { z2.d }, p2, [x0, #3, mul vl]
-; CHECK-NEXT: st1d { z3.d }, p1, [x0, #2, mul vl]
-; CHECK-NEXT: st1d { z4.d }, p3, [x0, #1, mul vl]
-; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT: st4d { z0.d - z3.d }, p0, [x0]
 ; CHECK-NEXT: ret
   %interleaved.mask = tail call <vscale x 8 x i1> @llvm.vector.interleave4.nxv4i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask)
   %interleaved.value = tail call <vscale x 8 x i64> @llvm.vector.interleave4.nxv8i64(<vscale x 2 x i64> %val1, <vscale x 2 x i64> %val2, <vscale x 2 x i64> %val3, <vscale x 2 x i64> %val4)
@@ -181,13 +117,12 @@ define void @foo_st2_nxv16i8_mul_use_mask(<vscale x 16 x i1> %mask, <vscale x 16
 ; CHECK-LABEL: foo_st2_nxv16i8_mul_use_mask:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: zip1 p1.b, p0.b, p0.b
-; CHECK-NEXT: zip1 z2.b, z0.b, z1.b
-; CHECK-NEXT: zip2 z0.b, z0.b, z1.b
-; CHECK-NEXT: zip2 p0.b, p0.b, p0.b
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: zip2 p2.b, p0.b, p0.b
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT: // fake_use: $p1
-; CHECK-NEXT: // fake_use: $p0
-; CHECK-NEXT: st1b { z0.b }, p0, [x0, #1, mul vl]
-; CHECK-NEXT: st1b { z2.b }, p1, [x0]
+; CHECK-NEXT: // fake_use: $p2
+; CHECK-NEXT: st2b { z0.b, z1.b }, p0, [x0]
 ; CHECK-NEXT: ret
   %interleaved.mask = tail call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask)
   %interleaved.value = tail call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %val1, <vscale x 16 x i8> %val2)
@@ -199,10 +134,10 @@ define void @foo_st2_nxv16i8_mul_use_mask(<vscale x 16 x i1> %mask, <vscale x 16
 define void @foo_st2_nxv16i8_mask_of_interleaved_ones(<vscale x 16 x i8> %val1, <vscale x 16 x i8> %val2, ptr %p) {
 ; CHECK-LABEL: foo_st2_nxv16i8_mask_of_interleaved_ones:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: zip2 z2.b, z0.b, z1.b
-; CHECK-NEXT: zip1 z0.b, z0.b, z1.b
-; CHECK-NEXT: str z2, [x0, #1, mul vl]
-; CHECK-NEXT: str z0, [x0]
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: st2b { z0.b, z1.b }, p0, [x0]
 ; CHECK-NEXT: ret
   %interleaved.mask = tail call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> splat(i1 1), <vscale x 16 x i1> splat(i1 1))
   %interleaved.value = tail call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %val1, <vscale x 16 x i8> %val2)
