Skip to content

Commit a1ca690

Browse files
authored
[AArch64] recognise zip1/zip2 with flipped operands (#167235)
Currently, the following two snippets get treated very differently from each other (https://godbolt.org/z/rYGj9TGz6):

```LLVM
define <8 x i8> @foo(<8 x i8> %x, <8 x i8> %y) local_unnamed_addr #0 {
entry:
  %0 = shufflevector <8 x i8> %x, <8 x i8> %y, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  ret <8 x i8> %0
}

define <8 x i8> @bar(<8 x i8> %x, <8 x i8> %y) local_unnamed_addr #0 {
entry:
  %0 = shufflevector <8 x i8> %x, <8 x i8> %y, <8 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3>
  ret <8 x i8> %0
}
```

```
foo: // @foo
  zip1 v0.8b, v0.8b, v1.8b
  ret
.LCPI1_0:
  .byte 8 // 0x8
  .byte 0 // 0x0
  .byte 9 // 0x9
  .byte 1 // 0x1
  .byte 10 // 0xa
  .byte 2 // 0x2
  .byte 11 // 0xb
  .byte 3 // 0x3
bar: // @bar
  adrp x8, .LCPI1_0
  mov v0.d[1], v1.d[0]
  ldr d1, [x8, :lo12:.LCPI1_0]
  tbl v0.8b, { v0.16b }, v1.8b
  ret
```

The reason is that `isZIPMask` does not recognise the pattern when the operands are flipped. This PR fixes `isZIPMask` so that both `foo` and `bar` get compiled as expected:

```
foo: // @foo
  zip1 v0.8b, v0.8b, v1.8b
  ret
bar: // @bar
  zip1 v0.8b, v1.8b, v0.8b
  ret
```

I intend to open a similar follow-up PR for `isTRNMask`, which seems to have the same problem. I noticed this while working on #137447, though the change does not by itself fix that issue.
1 parent e3a28c0 commit a1ca690

File tree

10 files changed

+207
-220
lines changed

10 files changed

+207
-220
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14805,9 +14805,11 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
1480514805
}
1480614806

1480714807
unsigned WhichResult;
14808-
if (isZIPMask(ShuffleMask, NumElts, WhichResult)) {
14808+
unsigned OperandOrder;
14809+
if (isZIPMask(ShuffleMask, NumElts, WhichResult, OperandOrder)) {
1480914810
unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
14810-
return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
14811+
return DAG.getNode(Opc, DL, V1.getValueType(), OperandOrder == 0 ? V1 : V2,
14812+
OperandOrder == 0 ? V2 : V1);
1481114813
}
1481214814
if (isUZPMask(ShuffleMask, NumElts, WhichResult)) {
1481314815
unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
@@ -16529,7 +16531,7 @@ bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
1652916531
isSingletonEXTMask(M, VT, DummyUnsigned) ||
1653016532
isTRNMask(M, NumElts, DummyUnsigned) ||
1653116533
isUZPMask(M, NumElts, DummyUnsigned) ||
16532-
isZIPMask(M, NumElts, DummyUnsigned) ||
16534+
isZIPMask(M, NumElts, DummyUnsigned, DummyUnsigned) ||
1653316535
isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
1653416536
isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
1653516537
isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
@@ -31576,10 +31578,15 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
3157631578
}
3157731579

3157831580
unsigned WhichResult;
31579-
if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
31581+
unsigned OperandOrder;
31582+
if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult,
31583+
OperandOrder) &&
3158031584
WhichResult == 0)
3158131585
return convertFromScalableVector(
31582-
DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
31586+
DAG, VT,
31587+
DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT,
31588+
OperandOrder == 0 ? Op1 : Op2,
31589+
OperandOrder == 0 ? Op2 : Op1));
3158331590

3158431591
if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
3158531592
unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
@@ -31624,10 +31631,14 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
3162431631
return convertFromScalableVector(DAG, VT, Op);
3162531632
}
3162631633

31627-
if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
31634+
if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult,
31635+
OperandOrder) &&
3162831636
WhichResult != 0)
3162931637
return convertFromScalableVector(
31630-
DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
31638+
DAG, VT,
31639+
DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT,
31640+
OperandOrder == 0 ? Op1 : Op2,
31641+
OperandOrder == 0 ? Op2 : Op1));
3163131642

3163231643
if (isUZPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
3163331644
unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;

llvm/lib/Target/AArch64/AArch64PerfectShuffle.h

Lines changed: 39 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -6622,35 +6622,52 @@ inline unsigned getPerfectShuffleCost(llvm::ArrayRef<int> M) {
66226622
}
66236623

66246624
/// Return true for zip1 or zip2 masks of the form:
6625-
/// <0, 8, 1, 9, 2, 10, 3, 11> or
6626-
/// <4, 12, 5, 13, 6, 14, 7, 15>
6625+
/// <0, 8, 1, 9, 2, 10, 3, 11> (WhichResultOut = 0, OperandOrderOut = 0) or
6626+
/// <4, 12, 5, 13, 6, 14, 7, 15> (WhichResultOut = 1, OperandOrderOut = 0) or
6627+
/// <8, 0, 9, 1, 10, 2, 11, 3> (WhichResultOut = 0, OperandOrderOut = 1) or
6628+
/// <12, 4, 13, 5, 14, 6, 15, 7> (WhichResultOut = 1, OperandOrderOut = 1)
66276629
inline bool isZIPMask(ArrayRef<int> M, unsigned NumElts,
6628-
unsigned &WhichResultOut) {
6630+
unsigned &WhichResultOut, unsigned &OperandOrderOut) {
66296631
if (NumElts % 2 != 0)
66306632
return false;
6631-
// Check the first non-undef element for which half to use.
6632-
unsigned WhichResult = 2;
6633-
for (unsigned i = 0; i != NumElts / 2; i++) {
6634-
if (M[i * 2] >= 0) {
6635-
WhichResult = ((unsigned)M[i * 2] == i ? 0 : 1);
6636-
break;
6637-
} else if (M[i * 2 + 1] >= 0) {
6638-
WhichResult = ((unsigned)M[i * 2 + 1] == NumElts + i ? 0 : 1);
6639-
break;
6640-
}
6641-
}
6642-
if (WhichResult == 2)
6643-
return false;
66446633

6634+
// "Variant" refers to the distinction between zip1 and zip2, while
6635+
// "Order" refers to the sequence of input registers (matching vs flipped).
6636+
bool Variant0Order0 = true; // WhichResultOut = 0, OperandOrderOut = 0
6637+
bool Variant1Order0 = true; // WhichResultOut = 1, OperandOrderOut = 0
6638+
bool Variant0Order1 = true; // WhichResultOut = 0, OperandOrderOut = 1
6639+
bool Variant1Order1 = true; // WhichResultOut = 1, OperandOrderOut = 1
66456640
// Check all elements match.
6646-
unsigned Idx = WhichResult * NumElts / 2;
66476641
for (unsigned i = 0; i != NumElts; i += 2) {
6648-
if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
6649-
(M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
6650-
return false;
6651-
Idx += 1;
6642+
if (M[i] >= 0) {
6643+
unsigned EvenElt = (unsigned)M[i];
6644+
if (EvenElt != i / 2)
6645+
Variant0Order0 = false;
6646+
if (EvenElt != NumElts / 2 + i / 2)
6647+
Variant1Order0 = false;
6648+
if (EvenElt != NumElts + i / 2)
6649+
Variant0Order1 = false;
6650+
if (EvenElt != NumElts + NumElts / 2 + i / 2)
6651+
Variant1Order1 = false;
6652+
}
6653+
if (M[i + 1] >= 0) {
6654+
unsigned OddElt = (unsigned)M[i + 1];
6655+
if (OddElt != NumElts + i / 2)
6656+
Variant0Order0 = false;
6657+
if (OddElt != NumElts + NumElts / 2 + i / 2)
6658+
Variant1Order0 = false;
6659+
if (OddElt != i / 2)
6660+
Variant0Order1 = false;
6661+
if (OddElt != NumElts / 2 + i / 2)
6662+
Variant1Order1 = false;
6663+
}
66526664
}
6653-
WhichResultOut = WhichResult;
6665+
6666+
if (Variant0Order0 + Variant1Order0 + Variant0Order1 + Variant1Order1 != 1)
6667+
return false;
6668+
6669+
WhichResultOut = (Variant0Order0 || Variant0Order1) ? 0 : 1;
6670+
OperandOrderOut = (Variant0Order0 || Variant1Order0) ? 0 : 1;
66546671
return true;
66556672
}
66566673

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6067,7 +6067,7 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
60676067
if (LT.second.isFixedLengthVector() &&
60686068
LT.second.getVectorNumElements() == Mask.size() &&
60696069
(Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
6070-
(isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
6070+
(isZIPMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
60716071
isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
60726072
isREVMask(Mask, LT.second.getScalarSizeInBits(),
60736073
LT.second.getVectorNumElements(), 16) ||

llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -252,14 +252,15 @@ bool matchZip(MachineInstr &MI, MachineRegisterInfo &MRI,
252252
ShuffleVectorPseudo &MatchInfo) {
253253
assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
254254
unsigned WhichResult;
255+
unsigned OperandOrder;
255256
ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
256257
Register Dst = MI.getOperand(0).getReg();
257258
unsigned NumElts = MRI.getType(Dst).getNumElements();
258-
if (!isZIPMask(ShuffleMask, NumElts, WhichResult))
259+
if (!isZIPMask(ShuffleMask, NumElts, WhichResult, OperandOrder))
259260
return false;
260261
unsigned Opc = (WhichResult == 0) ? AArch64::G_ZIP1 : AArch64::G_ZIP2;
261-
Register V1 = MI.getOperand(1).getReg();
262-
Register V2 = MI.getOperand(2).getReg();
262+
Register V1 = MI.getOperand(OperandOrder == 0 ? 1 : 2).getReg();
263+
Register V2 = MI.getOperand(OperandOrder == 0 ? 2 : 1).getReg();
263264
MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2});
264265
return true;
265266
}

llvm/test/CodeGen/AArch64/arm64-zip.ll

Lines changed: 12 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -355,49 +355,25 @@ define <8 x i16> @combine_v8i16_undef(<4 x i16> %0, <4 x i16> %1) {
355355
ret <8 x i16> %3
356356
}
357357

358-
; FIXME: This could be zip1 too, 8,0,9,1... pattern is handled
359358
define <16 x i8> @combine_v8i16_8first(<8 x i8> %0, <8 x i8> %1) {
360-
; CHECK-SD-LABEL: combine_v8i16_8first:
361-
; CHECK-SD: // %bb.0:
362-
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1_q2
363-
; CHECK-SD-NEXT: adrp x8, .LCPI25_0
364-
; CHECK-SD-NEXT: fmov d2, d0
365-
; CHECK-SD-NEXT: ldr q3, [x8, :lo12:.LCPI25_0]
366-
; CHECK-SD-NEXT: tbl.16b v0, { v1, v2 }, v3
367-
; CHECK-SD-NEXT: ret
368-
;
369-
; CHECK-GI-LABEL: combine_v8i16_8first:
370-
; CHECK-GI: // %bb.0:
371-
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q31_q0
372-
; CHECK-GI-NEXT: adrp x8, .LCPI25_0
373-
; CHECK-GI-NEXT: fmov d31, d1
374-
; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI25_0]
375-
; CHECK-GI-NEXT: tbl.16b v0, { v31, v0 }, v2
376-
; CHECK-GI-NEXT: ret
359+
; CHECK-LABEL: combine_v8i16_8first:
360+
; CHECK: // %bb.0:
361+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
362+
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
363+
; CHECK-NEXT: zip1.16b v0, v0, v1
364+
; CHECK-NEXT: ret
377365
%3 = shufflevector <8 x i8> %1, <8 x i8> %0, <16 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3, i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
378366
ret <16 x i8> %3
379367
}
380368

381369

382-
; FIXME: This could be zip1 too, 8,0,9,1... pattern is handled
383370
define <16 x i8> @combine_v8i16_8firstundef(<8 x i8> %0, <8 x i8> %1) {
384-
; CHECK-SD-LABEL: combine_v8i16_8firstundef:
385-
; CHECK-SD: // %bb.0:
386-
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1_q2
387-
; CHECK-SD-NEXT: adrp x8, .LCPI26_0
388-
; CHECK-SD-NEXT: fmov d2, d0
389-
; CHECK-SD-NEXT: ldr q3, [x8, :lo12:.LCPI26_0]
390-
; CHECK-SD-NEXT: tbl.16b v0, { v1, v2 }, v3
391-
; CHECK-SD-NEXT: ret
392-
;
393-
; CHECK-GI-LABEL: combine_v8i16_8firstundef:
394-
; CHECK-GI: // %bb.0:
395-
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q31_q0
396-
; CHECK-GI-NEXT: adrp x8, .LCPI26_0
397-
; CHECK-GI-NEXT: fmov d31, d1
398-
; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI26_0]
399-
; CHECK-GI-NEXT: tbl.16b v0, { v31, v0 }, v2
400-
; CHECK-GI-NEXT: ret
371+
; CHECK-LABEL: combine_v8i16_8firstundef:
372+
; CHECK: // %bb.0:
373+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
374+
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
375+
; CHECK-NEXT: zip1.16b v0, v0, v1
376+
; CHECK-NEXT: ret
401377
%3 = shufflevector <8 x i8> %1, <8 x i8> %0, <16 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3, i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 undef>
402378
ret <16 x i8> %3
403379
}

llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@ define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec
88
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
99
; CHECK-SD-NEXT: dup v2.2s, v0.s[1]
1010
; CHECK-SD-NEXT: mov v1.16b, v2.16b
11+
; CHECK-SD-NEXT: zip1 v2.4h, v0.4h, v2.4h
1112
; CHECK-SD-NEXT: mov v1.h[0], v0.h[1]
12-
; CHECK-SD-NEXT: mov v0.h[1], v2.h[0]
13-
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
13+
; CHECK-SD-NEXT: fmov d0, d2
1414
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1
1515
; CHECK-SD-NEXT: ret
1616
;

llvm/test/CodeGen/AArch64/insert-extend.ll

Lines changed: 36 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -66,57 +66,57 @@ define i32 @large(ptr nocapture noundef readonly %p1, i32 noundef %st1, ptr noca
6666
; CHECK-NEXT: ldr d5, [x11, x9]
6767
; CHECK-NEXT: shll2 v6.4s, v0.8h, #16
6868
; CHECK-NEXT: usubl v2.8h, v2.8b, v3.8b
69+
; CHECK-NEXT: shll2 v7.4s, v1.8h, #16
6970
; CHECK-NEXT: usubl v3.8h, v4.8b, v5.8b
70-
; CHECK-NEXT: shll2 v4.4s, v1.8h, #16
7171
; CHECK-NEXT: saddw v0.4s, v6.4s, v0.4h
72-
; CHECK-NEXT: shll2 v6.4s, v2.8h, #16
72+
; CHECK-NEXT: shll2 v4.4s, v2.8h, #16
73+
; CHECK-NEXT: saddw v1.4s, v7.4s, v1.4h
7374
; CHECK-NEXT: shll2 v5.4s, v3.8h, #16
74-
; CHECK-NEXT: saddw v1.4s, v4.4s, v1.4h
75-
; CHECK-NEXT: rev64 v4.4s, v0.4s
76-
; CHECK-NEXT: saddw v2.4s, v6.4s, v2.4h
75+
; CHECK-NEXT: rev64 v6.4s, v0.4s
76+
; CHECK-NEXT: saddw v2.4s, v4.4s, v2.4h
77+
; CHECK-NEXT: rev64 v7.4s, v1.4s
7778
; CHECK-NEXT: saddw v3.4s, v5.4s, v3.4h
78-
; CHECK-NEXT: rev64 v5.4s, v1.4s
79-
; CHECK-NEXT: rev64 v6.4s, v2.4s
80-
; CHECK-NEXT: sub v4.4s, v0.4s, v4.4s
79+
; CHECK-NEXT: rev64 v4.4s, v2.4s
80+
; CHECK-NEXT: sub v6.4s, v0.4s, v6.4s
8181
; CHECK-NEXT: addp v0.4s, v1.4s, v0.4s
82-
; CHECK-NEXT: rev64 v7.4s, v3.4s
83-
; CHECK-NEXT: sub v5.4s, v1.4s, v5.4s
84-
; CHECK-NEXT: sub v6.4s, v2.4s, v6.4s
82+
; CHECK-NEXT: rev64 v5.4s, v3.4s
83+
; CHECK-NEXT: sub v7.4s, v1.4s, v7.4s
84+
; CHECK-NEXT: sub v4.4s, v2.4s, v4.4s
8585
; CHECK-NEXT: addp v2.4s, v3.4s, v2.4s
86-
; CHECK-NEXT: zip1 v16.4s, v5.4s, v4.4s
87-
; CHECK-NEXT: sub v7.4s, v3.4s, v7.4s
88-
; CHECK-NEXT: trn1 v4.4s, v5.4s, v4.4s
89-
; CHECK-NEXT: zip2 v3.4s, v6.4s, v7.4s
90-
; CHECK-NEXT: mov v6.s[1], v7.s[0]
86+
; CHECK-NEXT: zip1 v16.4s, v7.4s, v6.4s
87+
; CHECK-NEXT: sub v5.4s, v3.4s, v5.4s
88+
; CHECK-NEXT: trn1 v3.4s, v7.4s, v6.4s
89+
; CHECK-NEXT: zip1 v6.4s, v4.4s, v5.4s
90+
; CHECK-NEXT: zip2 v4.4s, v4.4s, v5.4s
91+
; CHECK-NEXT: ext v5.16b, v7.16b, v16.16b, #8
9192
; CHECK-NEXT: ext v7.16b, v2.16b, v2.16b, #8
92-
; CHECK-NEXT: ext v5.16b, v5.16b, v16.16b, #8
93-
; CHECK-NEXT: mov v3.d[1], v4.d[1]
94-
; CHECK-NEXT: uzp1 v1.4s, v7.4s, v0.4s
95-
; CHECK-NEXT: uzp2 v4.4s, v7.4s, v0.4s
93+
; CHECK-NEXT: mov v4.d[1], v3.d[1]
9694
; CHECK-NEXT: mov v6.d[1], v5.d[1]
95+
; CHECK-NEXT: uzp1 v1.4s, v7.4s, v0.4s
96+
; CHECK-NEXT: uzp2 v3.4s, v7.4s, v0.4s
9797
; CHECK-NEXT: addp v0.4s, v2.4s, v0.4s
98-
; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s
98+
; CHECK-NEXT: add v5.4s, v4.4s, v6.4s
99+
; CHECK-NEXT: sub v4.4s, v6.4s, v4.4s
100+
; CHECK-NEXT: sub v1.4s, v1.4s, v3.4s
99101
; CHECK-NEXT: rev64 v7.4s, v0.4s
100-
; CHECK-NEXT: add v5.4s, v3.4s, v6.4s
101-
; CHECK-NEXT: sub v3.4s, v6.4s, v3.4s
102+
; CHECK-NEXT: rev64 v3.4s, v5.4s
103+
; CHECK-NEXT: rev64 v6.4s, v4.4s
102104
; CHECK-NEXT: rev64 v2.4s, v1.4s
103-
; CHECK-NEXT: rev64 v4.4s, v5.4s
104-
; CHECK-NEXT: rev64 v6.4s, v3.4s
105105
; CHECK-NEXT: addp v16.4s, v0.4s, v5.4s
106106
; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s
107-
; CHECK-NEXT: zip1 v21.4s, v16.4s, v16.4s
108-
; CHECK-NEXT: sub v4.4s, v5.4s, v4.4s
109-
; CHECK-NEXT: addp v5.4s, v1.4s, v3.4s
110-
; CHECK-NEXT: sub v3.4s, v3.4s, v6.4s
107+
; CHECK-NEXT: sub v3.4s, v5.4s, v3.4s
108+
; CHECK-NEXT: addp v5.4s, v1.4s, v4.4s
109+
; CHECK-NEXT: sub v4.4s, v4.4s, v6.4s
111110
; CHECK-NEXT: sub v1.4s, v1.4s, v2.4s
112111
; CHECK-NEXT: ext v7.16b, v0.16b, v16.16b, #4
113-
; CHECK-NEXT: ext v2.16b, v16.16b, v4.16b, #4
114-
; CHECK-NEXT: ext v6.16b, v5.16b, v3.16b, #4
115-
; CHECK-NEXT: mov v19.16b, v4.16b
112+
; CHECK-NEXT: zip1 v21.4s, v16.4s, v16.4s
113+
; CHECK-NEXT: ext v2.16b, v16.16b, v3.16b, #4
114+
; CHECK-NEXT: ext v6.16b, v5.16b, v4.16b, #4
115+
; CHECK-NEXT: mov v19.16b, v3.16b
116116
; CHECK-NEXT: ext v17.16b, v1.16b, v5.16b, #8
117-
; CHECK-NEXT: mov v20.16b, v3.16b
118-
; CHECK-NEXT: trn2 v0.4s, v21.4s, v0.4s
117+
; CHECK-NEXT: mov v20.16b, v4.16b
119118
; CHECK-NEXT: ext v7.16b, v7.16b, v7.16b, #4
119+
; CHECK-NEXT: trn2 v0.4s, v21.4s, v0.4s
120120
; CHECK-NEXT: mov v19.s[2], v16.s[3]
121121
; CHECK-NEXT: zip2 v2.4s, v2.4s, v16.4s
122122
; CHECK-NEXT: zip2 v6.4s, v6.4s, v5.4s
@@ -125,8 +125,8 @@ define i32 @large(ptr nocapture noundef readonly %p1, i32 noundef %st1, ptr noca
125125
; CHECK-NEXT: mov v1.s[2], v5.s[1]
126126
; CHECK-NEXT: mov v21.16b, v7.16b
127127
; CHECK-NEXT: sub v7.4s, v0.4s, v7.4s
128-
; CHECK-NEXT: ext v2.16b, v4.16b, v2.16b, #12
129-
; CHECK-NEXT: ext v3.16b, v3.16b, v6.16b, #12
128+
; CHECK-NEXT: ext v2.16b, v3.16b, v2.16b, #12
129+
; CHECK-NEXT: ext v3.16b, v4.16b, v6.16b, #12
130130
; CHECK-NEXT: uzp2 v4.4s, v17.4s, v18.4s
131131
; CHECK-NEXT: mov v6.16b, v1.16b
132132
; CHECK-NEXT: mov v17.16b, v19.16b

0 commit comments

Comments
 (0)