diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h index f7beca1b8b77e..c28cbf2bc63c2 100644 --- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h +++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h @@ -6685,15 +6685,30 @@ inline bool isUZPMask(ArrayRef M, unsigned NumElts, /// <0, 8, 2, 10, 4, 12, 6, 14> or /// <1, 9, 3, 11, 5, 13, 7, 15> inline bool isTRNMask(ArrayRef M, unsigned NumElts, - unsigned &WhichResult) { + unsigned &WhichResultOut) { if (NumElts % 2 != 0) return false; - WhichResult = (M[0] == 0 ? 0 : 1); + // Check the first non-undef element for trn1 vs trn2. + unsigned WhichResult = 2; + for (unsigned i = 0; i != NumElts; i += 2) { + if (M[i] >= 0) { + WhichResult = ((unsigned)M[i] == i ? 0 : 1); + break; + } + if (M[i + 1] >= 0) { + WhichResult = ((unsigned)M[i + 1] == i + NumElts ? 0 : 1); + break; + } + } + if (WhichResult == 2) + return false; + for (unsigned i = 0; i < NumElts; i += 2) { if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult)) return false; } + WhichResultOut = WhichResult; return true; } diff --git a/llvm/test/CodeGen/AArch64/arm64-trn.ll b/llvm/test/CodeGen/AArch64/arm64-trn.ll index 073cbab6f0230..fe245d01a7a6d 100644 --- a/llvm/test/CodeGen/AArch64/arm64-trn.ll +++ b/llvm/test/CodeGen/AArch64/arm64-trn.ll @@ -246,7 +246,7 @@ define <4 x float> @vtrnQf(ptr %A, ptr %B) nounwind { ret <4 x float> %tmp5 } -; Undef shuffle indices should not prevent matching to VTRN: +; Undef shuffle indices (even at the start of the shuffle mask) should not prevent matching to VTRN: define <8 x i8> @vtrni8_undef(ptr %A, ptr %B) nounwind { ; CHECKLE-LABEL: vtrni8_undef: @@ -302,3 +302,115 @@ define <8 x i16> @vtrnQi16_undef(ptr %A, ptr %B) nounwind { %tmp5 = add <8 x i16> %tmp3, %tmp4 ret <8 x i16> %tmp5 } + +define <8 x i16> @vtrnQi16_undef_01(ptr %A, ptr %B) nounwind { +; CHECKLE-LABEL: vtrnQi16_undef_01: +; CHECKLE: // %bb.0: +; CHECKLE-NEXT: ldr q0, [x0] +; CHECKLE-NEXT: ldr q1, [x1] +; CHECKLE-NEXT: trn1 v2.8h, v0.8h, v1.8h +; CHECKLE-NEXT: trn2 v0.8h, v0.8h, v1.8h +; CHECKLE-NEXT: add v0.8h, v2.8h, v0.8h +; CHECKLE-NEXT: ret +; +; CHECKBE-LABEL: vtrnQi16_undef_01: +; CHECKBE: // %bb.0: +; CHECKBE-NEXT: ld1 { v0.8h }, [x0] +; CHECKBE-NEXT: ld1 { v1.8h }, [x1] +; CHECKBE-NEXT: trn1 v2.8h, v0.8h, v1.8h +; CHECKBE-NEXT: trn2 v0.8h, v0.8h, v1.8h +; CHECKBE-NEXT: add v0.8h, v2.8h, v0.8h +; CHECKBE-NEXT: rev64 v0.8h, v0.8h +; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECKBE-NEXT: ret + %tmp1 = load <8 x i16>, ptr %A + %tmp2 = load <8 x i16>, ptr %B + %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> + %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> + %tmp5 = add <8 x i16> %tmp3, %tmp4 + ret <8 x i16> %tmp5 +} + +define <8 x i16> @vtrnQi16_undef_0(ptr %A, ptr %B) nounwind { +; CHECKLE-LABEL: vtrnQi16_undef_0: +; CHECKLE: // %bb.0: +; CHECKLE-NEXT: ldr q0, [x0] +; CHECKLE-NEXT: ldr q1, [x1] +; CHECKLE-NEXT: trn1 v2.8h, v0.8h, v1.8h +; CHECKLE-NEXT: trn2 v0.8h, v0.8h, v1.8h +; CHECKLE-NEXT: add v0.8h, v2.8h, v0.8h +; CHECKLE-NEXT: ret +; +; CHECKBE-LABEL: vtrnQi16_undef_0: +; CHECKBE: // %bb.0: +; CHECKBE-NEXT: ld1 { v0.8h }, [x0] +; CHECKBE-NEXT: ld1 { v1.8h }, [x1] +; CHECKBE-NEXT: trn1 v2.8h, v0.8h, v1.8h +; CHECKBE-NEXT: trn2 v0.8h, v0.8h, v1.8h +; CHECKBE-NEXT: add v0.8h, v2.8h, v0.8h +; CHECKBE-NEXT: rev64 v0.8h, v0.8h +; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECKBE-NEXT: ret + %tmp1 = load <8 x i16>, ptr %A + %tmp2 = load <8 x i16>, ptr %B + %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> + %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> + %tmp5 = add <8 x i16> %tmp3, %tmp4 + ret <8 x i16> %tmp5 +} + +define <4 x i32> @vtrnQi32_undef_1(ptr %A, ptr %B) nounwind { +; CHECKLE-LABEL: vtrnQi32_undef_1: +; CHECKLE: // %bb.0: +; CHECKLE-NEXT: ldr q0, [x0] +; CHECKLE-NEXT: ldr q1, [x1] +; CHECKLE-NEXT: trn1 v2.4s, v0.4s, v1.4s +; CHECKLE-NEXT: trn2 v0.4s, v0.4s, v1.4s +; CHECKLE-NEXT: add v0.4s, v2.4s, v0.4s +; CHECKLE-NEXT: ret +; +; CHECKBE-LABEL: vtrnQi32_undef_1: +; CHECKBE: // %bb.0: +; CHECKBE-NEXT: ld1 { v0.4s }, [x0] +; CHECKBE-NEXT: ld1 { v1.4s }, [x1] +; CHECKBE-NEXT: trn1 v2.4s, v0.4s, v1.4s +; CHECKBE-NEXT: trn2 v0.4s, v0.4s, v1.4s +; CHECKBE-NEXT: add v0.4s, v2.4s, v0.4s +; CHECKBE-NEXT: rev64 v0.4s, v0.4s +; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECKBE-NEXT: ret + %tmp1 = load <4 x i32>, ptr %A + %tmp2 = load <4 x i32>, ptr %B + %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> + %tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> + %tmp5 = add <4 x i32> %tmp3, %tmp4 + ret <4 x i32> %tmp5 +} + +define <16 x i8> @vtrnQi8_undef_012(ptr %A, ptr %B) nounwind { +; CHECKLE-LABEL: vtrnQi8_undef_012: +; CHECKLE: // %bb.0: +; CHECKLE-NEXT: ldr q0, [x0] +; CHECKLE-NEXT: ldr q1, [x1] +; CHECKLE-NEXT: trn1 v2.16b, v0.16b, v1.16b +; CHECKLE-NEXT: trn2 v0.16b, v0.16b, v1.16b +; CHECKLE-NEXT: add v0.16b, v2.16b, v0.16b +; CHECKLE-NEXT: ret +; +; CHECKBE-LABEL: vtrnQi8_undef_012: +; CHECKBE: // %bb.0: +; CHECKBE-NEXT: ld1 { v0.16b }, [x0] +; CHECKBE-NEXT: ld1 { v1.16b }, [x1] +; CHECKBE-NEXT: trn1 v2.16b, v0.16b, v1.16b +; CHECKBE-NEXT: trn2 v0.16b, v0.16b, v1.16b +; CHECKBE-NEXT: add v0.16b, v2.16b, v0.16b +; CHECKBE-NEXT: rev64 v0.16b, v0.16b +; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECKBE-NEXT: ret + %tmp1 = load <16 x i8>, ptr %A + %tmp2 = load <16 x i8>, ptr %B + %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> + %tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> + %tmp5 = add <16 x i8> %tmp3, %tmp4 + ret <16 x i8> %tmp5 +} diff --git a/llvm/test/CodeGen/AArch64/insert-extend.ll b/llvm/test/CodeGen/AArch64/insert-extend.ll index 851fb0d03e8aa..e128abf4d7376 100644 --- a/llvm/test/CodeGen/AArch64/insert-extend.ll +++ b/llvm/test/CodeGen/AArch64/insert-extend.ll @@ -85,24 +85,24 @@ define i32 @large(ptr nocapture noundef readonly %p1, i32 noundef %st1, ptr noca ; CHECK-NEXT: addp v2.4s, v3.4s, v2.4s ; CHECK-NEXT: zip1 v16.4s, v5.4s, v4.4s ; CHECK-NEXT: sub v7.4s, v3.4s, v7.4s +; CHECK-NEXT: trn1 v4.4s, v5.4s, v4.4s ; CHECK-NEXT: zip2 v3.4s, v6.4s, v7.4s ; CHECK-NEXT: mov v6.s[1], v7.s[0] -; CHECK-NEXT: ext v7.16b, v5.16b, v16.16b, #8 -; CHECK-NEXT: mov v5.s[3], v4.s[2] -; CHECK-NEXT: ext v4.16b, v2.16b, v2.16b, #8 -; CHECK-NEXT: mov v6.d[1], v7.d[1] -; CHECK-NEXT: mov v3.d[1], v5.d[1] -; CHECK-NEXT: uzp1 v1.4s, v4.4s, v0.4s -; CHECK-NEXT: uzp2 v4.4s, v4.4s, v0.4s +; CHECK-NEXT: ext v7.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: ext v5.16b, v5.16b, v16.16b, #8 +; CHECK-NEXT: mov v3.d[1], v4.d[1] +; CHECK-NEXT: uzp1 v1.4s, v7.4s, v0.4s +; CHECK-NEXT: uzp2 v4.4s, v7.4s, v0.4s +; CHECK-NEXT: mov v6.d[1], v5.d[1] ; CHECK-NEXT: addp v0.4s, v2.4s, v0.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s +; CHECK-NEXT: rev64 v7.4s, v0.4s ; CHECK-NEXT: add v5.4s, v3.4s, v6.4s ; CHECK-NEXT: sub v3.4s, v6.4s, v3.4s -; CHECK-NEXT: rev64 v7.4s, v0.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s +; CHECK-NEXT: rev64 v2.4s, v1.4s ; CHECK-NEXT: rev64 v4.4s, v5.4s ; CHECK-NEXT: rev64 v6.4s, v3.4s ; CHECK-NEXT: addp v16.4s, v0.4s, v5.4s -; CHECK-NEXT: rev64 v2.4s, v1.4s ; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s ; CHECK-NEXT: zip1 v21.4s, v16.4s, v16.4s ; CHECK-NEXT: sub v4.4s, v5.4s, v4.4s diff --git a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll index 325ab444205bf..354edc4ff7ab4 100644 --- a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll @@ -4,126 +4,126 @@ define i32 @v1(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocapture noundef readonly %p2, i32 noundef %i2) { ; CHECK-LABEL: v1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3 ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: sxtw x8, w1 +; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3 ; CHECK-NEXT: sxtw x9, w3 ; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x2] +; CHECK-NEXT: ldr d5, [x2] ; CHECK-NEXT: add x10, x0, x8 ; CHECK-NEXT: add x11, x2, x9 -; CHECK-NEXT: ldr d2, [x10] -; CHECK-NEXT: add x10, x10, x8 -; CHECK-NEXT: ldr d3, [x11] -; CHECK-NEXT: add x11, x11, x9 -; CHECK-NEXT: ldr d4, [x10] -; CHECK-NEXT: ldr d6, [x10, x8] -; CHECK-NEXT: ldr d5, [x11] -; CHECK-NEXT: ldr d7, [x11, x9] -; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b -; CHECK-NEXT: usubl v1.8h, v2.8b, v3.8b -; CHECK-NEXT: usubl v2.8h, v4.8b, v5.8b +; CHECK-NEXT: add x12, x10, x8 +; CHECK-NEXT: ldr d6, [x10] +; CHECK-NEXT: ldr d7, [x11] +; CHECK-NEXT: ldr d1, [x12, x8] +; CHECK-NEXT: add x8, x11, x9 +; CHECK-NEXT: ldr d2, [x12] +; CHECK-NEXT: ldr d3, [x8, x9] +; CHECK-NEXT: ldr d4, [x8] +; CHECK-NEXT: usubl v0.8h, v0.8b, v5.8b +; CHECK-NEXT: usubl v2.8h, v2.8b, v4.8b +; CHECK-NEXT: usubl v1.8h, v1.8b, v3.8b ; CHECK-NEXT: usubl v3.8h, v6.8b, v7.8b -; CHECK-NEXT: shll2 v4.4s, v0.8h, #16 -; CHECK-NEXT: shll2 v5.4s, v1.8h, #16 -; CHECK-NEXT: shll2 v6.4s, v3.8h, #16 -; CHECK-NEXT: shll2 v7.4s, v2.8h, #16 -; CHECK-NEXT: saddw v0.4s, v4.4s, v0.4h -; CHECK-NEXT: saddw v1.4s, v5.4s, v1.4h -; CHECK-NEXT: saddw v3.4s, v6.4s, v3.4h -; CHECK-NEXT: saddw v2.4s, v7.4s, v2.4h -; CHECK-NEXT: zip1 v4.4s, v1.4s, v0.4s -; CHECK-NEXT: zip2 v6.4s, v1.4s, v0.4s -; CHECK-NEXT: uzp2 v5.4s, v3.4s, v2.4s -; CHECK-NEXT: mov v7.16b, v2.16b -; CHECK-NEXT: ext v17.16b, v3.16b, v3.16b, #12 -; CHECK-NEXT: zip2 v18.4s, v3.4s, v2.4s -; CHECK-NEXT: ext v16.16b, v1.16b, v4.16b, #8 -; CHECK-NEXT: mov v1.s[3], v0.s[2] -; CHECK-NEXT: mov v7.s[1], v3.s[0] -; CHECK-NEXT: uzp2 v0.4s, v5.4s, v3.4s -; CHECK-NEXT: zip2 v5.4s, v2.4s, v3.4s -; CHECK-NEXT: mov v3.s[0], v2.s[1] +; CHECK-NEXT: shll2 v6.4s, v0.8h, #16 +; CHECK-NEXT: shll2 v4.4s, v1.8h, #16 +; CHECK-NEXT: shll2 v5.4s, v2.8h, #16 +; CHECK-NEXT: shll2 v7.4s, v3.8h, #16 +; CHECK-NEXT: saddw v0.4s, v6.4s, v0.4h +; CHECK-NEXT: saddw v1.4s, v4.4s, v1.4h +; CHECK-NEXT: saddw v2.4s, v5.4s, v2.4h +; CHECK-NEXT: saddw v3.4s, v7.4s, v3.4h +; CHECK-NEXT: uzp2 v4.4s, v1.4s, v2.4s +; CHECK-NEXT: zip1 v5.4s, v3.4s, v0.4s +; CHECK-NEXT: mov v6.16b, v2.16b +; CHECK-NEXT: trn1 v7.4s, v3.4s, v0.4s +; CHECK-NEXT: zip2 v0.4s, v3.4s, v0.4s +; CHECK-NEXT: ext v17.16b, v1.16b, v1.16b, #12 +; CHECK-NEXT: zip2 v18.4s, v1.4s, v2.4s +; CHECK-NEXT: zip2 v16.4s, v2.4s, v1.4s +; CHECK-NEXT: mov v6.s[1], v1.s[0] +; CHECK-NEXT: uzp2 v4.4s, v4.4s, v1.4s +; CHECK-NEXT: ext v3.16b, v3.16b, v5.16b, #8 +; CHECK-NEXT: mov v1.s[0], v2.s[1] ; CHECK-NEXT: ext v2.16b, v2.16b, v17.16b, #12 -; CHECK-NEXT: mov v18.d[1], v1.d[1] -; CHECK-NEXT: mov v7.d[1], v16.d[1] -; CHECK-NEXT: mov v0.d[1], v6.d[1] -; CHECK-NEXT: mov v3.d[1], v4.d[1] -; CHECK-NEXT: mov v5.d[1], v1.d[1] -; CHECK-NEXT: mov v2.d[1], v6.d[1] -; CHECK-NEXT: add v0.4s, v0.4s, v18.4s -; CHECK-NEXT: add v1.4s, v3.4s, v7.4s -; CHECK-NEXT: sub v3.4s, v7.4s, v3.4s -; CHECK-NEXT: sub v2.4s, v5.4s, v2.4s +; CHECK-NEXT: mov v18.d[1], v7.d[1] +; CHECK-NEXT: mov v16.d[1], v7.d[1] +; CHECK-NEXT: mov v4.d[1], v0.d[1] +; CHECK-NEXT: mov v6.d[1], v3.d[1] +; CHECK-NEXT: mov v1.d[1], v5.d[1] +; CHECK-NEXT: mov v2.d[1], v0.d[1] +; CHECK-NEXT: add v0.4s, v4.4s, v18.4s +; CHECK-NEXT: add v3.4s, v1.4s, v6.4s +; CHECK-NEXT: sub v1.4s, v6.4s, v1.4s +; CHECK-NEXT: sub v2.4s, v16.4s, v2.4s ; CHECK-NEXT: rev64 v4.4s, v0.4s -; CHECK-NEXT: rev64 v6.4s, v1.4s -; CHECK-NEXT: sub v5.4s, v3.4s, v2.4s -; CHECK-NEXT: add v2.4s, v2.4s, v3.4s +; CHECK-NEXT: rev64 v5.4s, v3.4s +; CHECK-NEXT: sub v6.4s, v1.4s, v2.4s +; CHECK-NEXT: add v1.4s, v2.4s, v1.4s ; CHECK-NEXT: mov v4.d[1], v0.d[1] -; CHECK-NEXT: mov v6.d[1], v1.d[1] -; CHECK-NEXT: rev64 v3.4s, v5.4s -; CHECK-NEXT: rev64 v7.4s, v2.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s -; CHECK-NEXT: add v0.4s, v0.4s, v6.4s -; CHECK-NEXT: sub v3.4s, v5.4s, v3.4s -; CHECK-NEXT: addp v4.4s, v1.4s, v5.4s -; CHECK-NEXT: sub v5.4s, v2.4s, v7.4s -; CHECK-NEXT: addp v2.4s, v0.4s, v2.4s -; CHECK-NEXT: rev64 v6.4s, v0.4s +; CHECK-NEXT: mov v5.d[1], v3.d[1] +; CHECK-NEXT: rev64 v2.4s, v6.4s ; CHECK-NEXT: rev64 v7.4s, v1.4s -; CHECK-NEXT: ext v16.16b, v4.16b, v3.16b, #4 -; CHECK-NEXT: ext v17.16b, v2.16b, v5.16b, #4 +; CHECK-NEXT: sub v3.4s, v3.4s, v4.4s +; CHECK-NEXT: add v0.4s, v0.4s, v5.4s +; CHECK-NEXT: sub v2.4s, v6.4s, v2.4s +; CHECK-NEXT: sub v5.4s, v1.4s, v7.4s +; CHECK-NEXT: addp v4.4s, v3.4s, v6.4s +; CHECK-NEXT: addp v1.4s, v0.4s, v1.4s +; CHECK-NEXT: rev64 v6.4s, v0.4s +; CHECK-NEXT: rev64 v7.4s, v3.4s +; CHECK-NEXT: ext v16.16b, v4.16b, v2.16b, #4 +; CHECK-NEXT: ext v17.16b, v1.16b, v5.16b, #4 ; CHECK-NEXT: sub v0.4s, v0.4s, v6.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v7.4s -; CHECK-NEXT: mov v7.16b, v3.16b +; CHECK-NEXT: sub v3.4s, v3.4s, v7.4s +; CHECK-NEXT: mov v7.16b, v2.16b ; CHECK-NEXT: zip2 v6.4s, v16.4s, v4.4s ; CHECK-NEXT: mov v16.16b, v5.16b -; CHECK-NEXT: zip2 v17.4s, v17.4s, v2.4s -; CHECK-NEXT: ext v18.16b, v0.16b, v2.16b, #4 +; CHECK-NEXT: zip2 v17.4s, v17.4s, v1.4s +; CHECK-NEXT: ext v18.16b, v0.16b, v1.16b, #4 ; CHECK-NEXT: mov v7.s[2], v4.s[3] -; CHECK-NEXT: mov v21.16b, v1.16b -; CHECK-NEXT: mov v16.s[2], v2.s[3] +; CHECK-NEXT: mov v21.16b, v3.16b +; CHECK-NEXT: mov v16.s[2], v1.s[3] ; CHECK-NEXT: ext v5.16b, v5.16b, v17.16b, #12 -; CHECK-NEXT: zip1 v17.4s, v2.4s, v2.4s -; CHECK-NEXT: ext v3.16b, v3.16b, v6.16b, #12 +; CHECK-NEXT: zip1 v17.4s, v1.4s, v1.4s +; CHECK-NEXT: ext v2.16b, v2.16b, v6.16b, #12 ; CHECK-NEXT: ext v18.16b, v18.16b, v18.16b, #4 ; CHECK-NEXT: mov v19.16b, v7.16b -; CHECK-NEXT: ext v6.16b, v1.16b, v4.16b, #8 +; CHECK-NEXT: ext v6.16b, v3.16b, v4.16b, #8 ; CHECK-NEXT: mov v21.s[2], v4.s[1] ; CHECK-NEXT: mov v20.16b, v16.16b ; CHECK-NEXT: mov v19.s[1], v4.s[2] ; CHECK-NEXT: trn2 v0.4s, v17.4s, v0.4s ; CHECK-NEXT: sub v16.4s, v16.4s, v5.4s ; CHECK-NEXT: mov v17.16b, v18.16b -; CHECK-NEXT: ext v1.16b, v6.16b, v1.16b, #4 -; CHECK-NEXT: sub v7.4s, v7.4s, v3.4s -; CHECK-NEXT: mov v20.s[1], v2.s[2] -; CHECK-NEXT: mov v17.s[0], v2.s[1] -; CHECK-NEXT: mov v2.16b, v21.16b -; CHECK-NEXT: add v3.4s, v19.4s, v3.4s -; CHECK-NEXT: uzp2 v1.4s, v6.4s, v1.4s +; CHECK-NEXT: ext v3.16b, v6.16b, v3.16b, #4 +; CHECK-NEXT: sub v7.4s, v7.4s, v2.4s +; CHECK-NEXT: mov v20.s[1], v1.s[2] +; CHECK-NEXT: mov v17.s[0], v1.s[1] +; CHECK-NEXT: mov v1.16b, v21.16b +; CHECK-NEXT: add v2.4s, v19.4s, v2.4s +; CHECK-NEXT: uzp2 v3.4s, v6.4s, v3.4s ; CHECK-NEXT: add v5.4s, v20.4s, v5.4s -; CHECK-NEXT: mov v2.s[1], v4.s[0] +; CHECK-NEXT: mov v1.s[1], v4.s[0] ; CHECK-NEXT: sub v4.4s, v0.4s, v18.4s -; CHECK-NEXT: mov v3.d[1], v7.d[1] +; CHECK-NEXT: mov v2.d[1], v7.d[1] ; CHECK-NEXT: add v0.4s, v0.4s, v17.4s ; CHECK-NEXT: mov v5.d[1], v16.d[1] -; CHECK-NEXT: sub v6.4s, v21.4s, v1.4s -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: sub v6.4s, v21.4s, v3.4s +; CHECK-NEXT: add v1.4s, v1.4s, v3.4s ; CHECK-NEXT: mov v0.d[1], v4.d[1] -; CHECK-NEXT: cmlt v4.8h, v3.8h, #0 -; CHECK-NEXT: cmlt v2.8h, v5.8h, #0 +; CHECK-NEXT: cmlt v4.8h, v2.8h, #0 +; CHECK-NEXT: cmlt v3.8h, v5.8h, #0 ; CHECK-NEXT: mov v1.d[1], v6.d[1] -; CHECK-NEXT: add v3.4s, v4.4s, v3.4s +; CHECK-NEXT: add v2.4s, v4.4s, v2.4s ; CHECK-NEXT: cmlt v6.8h, v0.8h, #0 -; CHECK-NEXT: add v5.4s, v2.4s, v5.4s -; CHECK-NEXT: eor v3.16b, v3.16b, v4.16b +; CHECK-NEXT: add v5.4s, v3.4s, v5.4s +; CHECK-NEXT: eor v2.16b, v2.16b, v4.16b ; CHECK-NEXT: cmlt v7.8h, v1.8h, #0 ; CHECK-NEXT: add v0.4s, v6.4s, v0.4s -; CHECK-NEXT: eor v2.16b, v5.16b, v2.16b +; CHECK-NEXT: eor v3.16b, v5.16b, v3.16b ; CHECK-NEXT: add v1.4s, v7.4s, v1.4s ; CHECK-NEXT: eor v0.16b, v0.16b, v6.16b -; CHECK-NEXT: add v2.4s, v2.4s, v3.4s +; CHECK-NEXT: add v2.4s, v3.4s, v2.4s ; CHECK-NEXT: eor v1.16b, v1.16b, v7.16b ; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s @@ -227,114 +227,112 @@ entry: define i32 @v2(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocapture noundef readonly %p2, i32 noundef %i2) { ; CHECK-LABEL: v2: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3 ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: sxtw x8, w1 -; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3 ; CHECK-NEXT: sxtw x9, w3 -; CHECK-NEXT: ldr d4, [x0] -; CHECK-NEXT: ldr d5, [x2] +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x2] ; CHECK-NEXT: add x10, x0, x8 ; CHECK-NEXT: add x11, x2, x9 -; CHECK-NEXT: add x12, x10, x8 +; CHECK-NEXT: ldr d2, [x10] +; CHECK-NEXT: add x10, x10, x8 +; CHECK-NEXT: ldr d3, [x11] +; CHECK-NEXT: add x11, x11, x9 +; CHECK-NEXT: ldr d4, [x10, x8] ; CHECK-NEXT: ldr d6, [x10] +; CHECK-NEXT: ldr d5, [x11, x9] ; CHECK-NEXT: ldr d7, [x11] -; CHECK-NEXT: ldr d0, [x12, x8] -; CHECK-NEXT: add x8, x11, x9 -; CHECK-NEXT: ldr d1, [x12] -; CHECK-NEXT: ldr d2, [x8, x9] -; CHECK-NEXT: ldr d3, [x8] -; CHECK-NEXT: usubl v1.8h, v1.8b, v3.8b -; CHECK-NEXT: usubl v0.8h, v0.8b, v2.8b -; CHECK-NEXT: usubl v3.8h, v6.8b, v7.8b +; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: usubl v1.8h, v2.8b, v3.8b ; CHECK-NEXT: usubl v2.8h, v4.8b, v5.8b +; CHECK-NEXT: usubl v3.8h, v6.8b, v7.8b ; CHECK-NEXT: shll2 v4.4s, v0.8h, #16 ; CHECK-NEXT: shll2 v5.4s, v1.8h, #16 -; CHECK-NEXT: shll2 v7.4s, v3.8h, #16 ; CHECK-NEXT: shll2 v6.4s, v2.8h, #16 +; CHECK-NEXT: shll2 v7.4s, v3.8h, #16 ; CHECK-NEXT: saddw v0.4s, v4.4s, v0.4h ; CHECK-NEXT: saddw v1.4s, v5.4s, v1.4h -; CHECK-NEXT: saddw v3.4s, v7.4s, v3.4h ; CHECK-NEXT: saddw v2.4s, v6.4s, v2.4h -; CHECK-NEXT: uzp2 v4.4s, v0.4s, v1.4s -; CHECK-NEXT: mov v7.16b, v3.16b -; CHECK-NEXT: mov v17.16b, v1.16b -; CHECK-NEXT: zip1 v5.4s, v3.4s, v2.4s -; CHECK-NEXT: zip2 v6.4s, v3.4s, v2.4s -; CHECK-NEXT: zip2 v16.4s, v0.4s, v1.4s -; CHECK-NEXT: ext v18.16b, v0.16b, v0.16b, #12 -; CHECK-NEXT: mov v7.s[3], v2.s[2] -; CHECK-NEXT: mov v17.s[1], v0.s[0] -; CHECK-NEXT: uzp2 v2.4s, v4.4s, v0.4s -; CHECK-NEXT: mov v4.16b, v0.16b +; CHECK-NEXT: saddw v3.4s, v7.4s, v3.4h +; CHECK-NEXT: zip1 v4.4s, v1.4s, v0.4s +; CHECK-NEXT: trn1 v18.4s, v1.4s, v0.4s ; CHECK-NEXT: zip2 v0.4s, v1.4s, v0.4s -; CHECK-NEXT: ext v3.16b, v3.16b, v5.16b, #8 -; CHECK-NEXT: mov v4.s[0], v1.s[1] +; CHECK-NEXT: uzp2 v5.4s, v2.4s, v3.4s +; CHECK-NEXT: mov v6.16b, v2.16b +; CHECK-NEXT: mov v16.16b, v3.16b +; CHECK-NEXT: zip2 v7.4s, v2.4s, v3.4s +; CHECK-NEXT: mov v6.s[0], v3.s[1] +; CHECK-NEXT: ext v17.16b, v1.16b, v4.16b, #8 +; CHECK-NEXT: mov v16.s[1], v2.s[0] +; CHECK-NEXT: uzp2 v1.4s, v5.4s, v2.4s +; CHECK-NEXT: ext v5.16b, v2.16b, v2.16b, #12 +; CHECK-NEXT: zip2 v2.4s, v3.4s, v2.4s +; CHECK-NEXT: mov v7.d[1], v18.d[1] +; CHECK-NEXT: mov v6.d[1], v4.d[1] +; CHECK-NEXT: mov v16.d[1], v17.d[1] +; CHECK-NEXT: mov v1.d[1], v0.d[1] +; CHECK-NEXT: ext v3.16b, v3.16b, v5.16b, #12 +; CHECK-NEXT: mov v2.d[1], v18.d[1] +; CHECK-NEXT: add v4.4s, v6.4s, v16.4s +; CHECK-NEXT: add v1.4s, v1.4s, v7.4s +; CHECK-NEXT: mov v3.d[1], v0.d[1] +; CHECK-NEXT: rev64 v5.4s, v4.4s +; CHECK-NEXT: rev64 v0.4s, v1.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v3.4s +; CHECK-NEXT: sub v3.4s, v16.4s, v6.4s +; CHECK-NEXT: mov v5.d[1], v4.d[1] +; CHECK-NEXT: mov v0.d[1], v1.d[1] +; CHECK-NEXT: add v6.4s, v2.4s, v3.4s +; CHECK-NEXT: sub v2.4s, v3.4s, v2.4s +; CHECK-NEXT: add v1.4s, v1.4s, v5.4s +; CHECK-NEXT: sub v0.4s, v4.4s, v0.4s +; CHECK-NEXT: zip1 v3.4s, v1.4s, v6.4s +; CHECK-NEXT: uzp2 v4.4s, v1.4s, v6.4s +; CHECK-NEXT: zip2 v16.4s, v1.4s, v6.4s +; CHECK-NEXT: zip1 v5.4s, v0.4s, v2.4s +; CHECK-NEXT: trn1 v7.4s, v0.4s, v2.4s +; CHECK-NEXT: zip2 v2.4s, v0.4s, v2.4s +; CHECK-NEXT: trn2 v3.4s, v1.4s, v3.4s +; CHECK-NEXT: uzp2 v4.4s, v4.4s, v1.4s +; CHECK-NEXT: mov v1.s[1], v6.s[1] +; CHECK-NEXT: ext v0.16b, v0.16b, v5.16b, #8 ; CHECK-NEXT: mov v16.d[1], v7.d[1] -; CHECK-NEXT: ext v1.16b, v1.16b, v18.16b, #12 -; CHECK-NEXT: mov v2.d[1], v6.d[1] -; CHECK-NEXT: mov v0.d[1], v7.d[1] -; CHECK-NEXT: mov v17.d[1], v3.d[1] -; CHECK-NEXT: mov v4.d[1], v5.d[1] -; CHECK-NEXT: mov v1.d[1], v6.d[1] -; CHECK-NEXT: add v2.4s, v2.4s, v16.4s -; CHECK-NEXT: add v3.4s, v4.4s, v17.4s -; CHECK-NEXT: rev64 v5.4s, v2.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s -; CHECK-NEXT: sub v1.4s, v17.4s, v4.4s -; CHECK-NEXT: rev64 v6.4s, v3.4s -; CHECK-NEXT: mov v5.d[1], v2.d[1] -; CHECK-NEXT: sub v4.4s, v1.4s, v0.4s -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: mov v6.d[1], v3.d[1] -; CHECK-NEXT: sub v3.4s, v3.4s, v5.4s -; CHECK-NEXT: add v1.4s, v2.4s, v6.4s -; CHECK-NEXT: zip1 v2.4s, v3.4s, v4.4s -; CHECK-NEXT: zip2 v7.4s, v3.4s, v4.4s -; CHECK-NEXT: zip1 v5.4s, v1.4s, v0.4s -; CHECK-NEXT: uzp2 v6.4s, v1.4s, v0.4s -; CHECK-NEXT: mov v18.16b, v1.16b -; CHECK-NEXT: ext v16.16b, v3.16b, v2.16b, #8 -; CHECK-NEXT: zip2 v17.4s, v1.4s, v0.4s -; CHECK-NEXT: mov v3.s[3], v4.s[2] -; CHECK-NEXT: mov v18.s[1], v0.s[1] -; CHECK-NEXT: trn2 v4.4s, v1.4s, v5.4s -; CHECK-NEXT: uzp2 v1.4s, v6.4s, v1.4s -; CHECK-NEXT: mov v17.d[1], v3.d[1] -; CHECK-NEXT: mov v18.d[1], v2.d[1] -; CHECK-NEXT: mov v4.d[1], v16.d[1] -; CHECK-NEXT: mov v1.d[1], v7.d[1] -; CHECK-NEXT: add v0.4s, v17.4s, v1.4s -; CHECK-NEXT: add v2.4s, v18.4s, v4.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v17.4s -; CHECK-NEXT: sub v3.4s, v4.4s, v18.4s -; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #4 +; CHECK-NEXT: mov v4.d[1], v2.d[1] +; CHECK-NEXT: mov v1.d[1], v5.d[1] +; CHECK-NEXT: mov v3.d[1], v0.d[1] +; CHECK-NEXT: add v0.4s, v16.4s, v4.4s +; CHECK-NEXT: sub v4.4s, v4.4s, v16.4s +; CHECK-NEXT: add v2.4s, v1.4s, v3.4s +; CHECK-NEXT: sub v1.4s, v3.4s, v1.4s +; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #4 +; CHECK-NEXT: zip2 v6.4s, v0.4s, v4.4s +; CHECK-NEXT: zip2 v7.4s, v4.4s, v0.4s ; CHECK-NEXT: ext v5.16b, v2.16b, v2.16b, #4 -; CHECK-NEXT: zip2 v6.4s, v0.4s, v1.4s -; CHECK-NEXT: zip2 v7.4s, v1.4s, v0.4s -; CHECK-NEXT: zip2 v16.4s, v3.4s, v2.4s -; CHECK-NEXT: zip2 v17.4s, v2.4s, v3.4s -; CHECK-NEXT: zip1 v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ext v18.16b, v4.16b, v1.16b, #8 -; CHECK-NEXT: ext v19.16b, v5.16b, v3.16b, #8 -; CHECK-NEXT: zip1 v1.4s, v2.4s, v3.4s +; CHECK-NEXT: zip2 v16.4s, v1.4s, v2.4s +; CHECK-NEXT: zip2 v17.4s, v2.4s, v1.4s +; CHECK-NEXT: zip1 v0.4s, v0.4s, v4.4s +; CHECK-NEXT: ext v18.16b, v3.16b, v4.16b, #8 +; CHECK-NEXT: ext v19.16b, v5.16b, v1.16b, #8 +; CHECK-NEXT: zip1 v1.4s, v2.4s, v1.4s ; CHECK-NEXT: add v2.4s, v16.4s, v7.4s -; CHECK-NEXT: sub v3.4s, v6.4s, v17.4s -; CHECK-NEXT: ext v4.16b, v18.16b, v4.16b, #4 +; CHECK-NEXT: sub v4.4s, v6.4s, v17.4s +; CHECK-NEXT: ext v3.16b, v18.16b, v3.16b, #4 +; CHECK-NEXT: cmlt v6.8h, v2.8h, #0 ; CHECK-NEXT: ext v5.16b, v19.16b, v5.16b, #4 ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s -; CHECK-NEXT: cmlt v1.8h, v3.8h, #0 -; CHECK-NEXT: cmlt v6.8h, v2.8h, #0 -; CHECK-NEXT: add v4.4s, v5.4s, v4.4s -; CHECK-NEXT: cmlt v5.8h, v0.8h, #0 +; CHECK-NEXT: cmlt v1.8h, v4.8h, #0 ; CHECK-NEXT: add v2.4s, v6.4s, v2.4s -; CHECK-NEXT: add v3.4s, v1.4s, v3.4s -; CHECK-NEXT: cmlt v7.8h, v4.8h, #0 -; CHECK-NEXT: add v0.4s, v5.4s, v0.4s +; CHECK-NEXT: add v4.4s, v1.4s, v4.4s +; CHECK-NEXT: add v3.4s, v5.4s, v3.4s +; CHECK-NEXT: cmlt v5.8h, v0.8h, #0 ; CHECK-NEXT: eor v2.16b, v2.16b, v6.16b -; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b -; CHECK-NEXT: add v3.4s, v7.4s, v4.4s -; CHECK-NEXT: eor v0.16b, v0.16b, v5.16b +; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b +; CHECK-NEXT: cmlt v7.8h, v3.8h, #0 +; CHECK-NEXT: add v0.4s, v5.4s, v0.4s ; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: add v3.4s, v7.4s, v3.4s +; CHECK-NEXT: eor v0.16b, v0.16b, v5.16b ; CHECK-NEXT: eor v2.16b, v3.16b, v7.16b ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: add v0.4s, v2.4s, v0.4s @@ -478,73 +476,73 @@ define i32 @v3(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur ; CHECK-NEXT: addp v0.4s, v1.4s, v0.4s ; CHECK-NEXT: rev64 v7.4s, v3.4s ; CHECK-NEXT: sub v5.4s, v1.4s, v5.4s -; CHECK-NEXT: sub v4.4s, v2.4s, v4.4s -; CHECK-NEXT: addp v2.4s, v2.4s, v3.4s -; CHECK-NEXT: ext v1.16b, v5.16b, v6.16b, #4 -; CHECK-NEXT: sub v7.4s, v3.4s, v7.4s -; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: addp v1.4s, v2.4s, v3.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s +; CHECK-NEXT: ext v16.16b, v5.16b, v6.16b, #4 +; CHECK-NEXT: sub v3.4s, v3.4s, v7.4s +; CHECK-NEXT: uzp2 v4.4s, v1.4s, v0.4s +; CHECK-NEXT: uzp1 v7.4s, v1.4s, v0.4s ; CHECK-NEXT: mov v6.s[3], v5.s[2] -; CHECK-NEXT: zip2 v16.4s, v4.4s, v7.4s -; CHECK-NEXT: zip1 v4.4s, v4.4s, v7.4s -; CHECK-NEXT: ext v1.16b, v1.16b, v5.16b, #4 -; CHECK-NEXT: uzp2 v5.4s, v2.4s, v0.4s -; CHECK-NEXT: uzp1 v0.4s, v2.4s, v0.4s -; CHECK-NEXT: uzp1 v7.4s, v2.4s, v3.4s -; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s -; CHECK-NEXT: mov v16.d[1], v6.d[1] -; CHECK-NEXT: mov v4.d[1], v1.d[1] -; CHECK-NEXT: rev64 v1.4s, v5.4s -; CHECK-NEXT: rev64 v0.4s, v0.4s -; CHECK-NEXT: sub v2.4s, v7.4s, v2.4s -; CHECK-NEXT: sub v3.4s, v4.4s, v16.4s -; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: add v1.4s, v16.4s, v4.4s -; CHECK-NEXT: zip1 v4.4s, v2.4s, v3.4s -; CHECK-NEXT: zip1 v5.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp2 v6.4s, v0.4s, v1.4s -; CHECK-NEXT: zip2 v7.4s, v0.4s, v1.4s +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: zip2 v17.4s, v2.4s, v3.4s -; CHECK-NEXT: ext v16.16b, v2.16b, v4.16b, #8 -; CHECK-NEXT: trn2 v5.4s, v0.4s, v5.4s -; CHECK-NEXT: uzp2 v6.4s, v6.4s, v0.4s -; CHECK-NEXT: mov v2.s[3], v3.s[2] -; CHECK-NEXT: mov v0.s[1], v1.s[1] -; CHECK-NEXT: mov v5.d[1], v16.d[1] -; CHECK-NEXT: mov v6.d[1], v17.d[1] -; CHECK-NEXT: mov v7.d[1], v2.d[1] -; CHECK-NEXT: mov v0.d[1], v4.d[1] -; CHECK-NEXT: add v1.4s, v6.4s, v7.4s -; CHECK-NEXT: add v2.4s, v5.4s, v0.4s -; CHECK-NEXT: sub v3.4s, v7.4s, v6.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v5.4s -; CHECK-NEXT: ext v4.16b, v1.16b, v1.16b, #4 +; CHECK-NEXT: zip1 v2.4s, v2.4s, v3.4s +; CHECK-NEXT: ext v3.16b, v16.16b, v5.16b, #4 +; CHECK-NEXT: rev64 v4.4s, v4.4s +; CHECK-NEXT: rev64 v5.4s, v7.4s +; CHECK-NEXT: mov v17.d[1], v6.d[1] +; CHECK-NEXT: mov v2.d[1], v3.d[1] +; CHECK-NEXT: uzp1 v3.4s, v1.4s, v0.4s +; CHECK-NEXT: uzp2 v0.4s, v1.4s, v0.4s +; CHECK-NEXT: add v1.4s, v4.4s, v5.4s +; CHECK-NEXT: add v4.4s, v17.4s, v2.4s +; CHECK-NEXT: sub v0.4s, v3.4s, v0.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v17.4s +; CHECK-NEXT: zip1 v3.4s, v1.4s, v4.4s +; CHECK-NEXT: uzp2 v5.4s, v1.4s, v4.4s +; CHECK-NEXT: zip1 v6.4s, v0.4s, v2.4s +; CHECK-NEXT: trn1 v7.4s, v0.4s, v2.4s +; CHECK-NEXT: zip2 v16.4s, v1.4s, v4.4s +; CHECK-NEXT: zip2 v2.4s, v0.4s, v2.4s +; CHECK-NEXT: trn2 v3.4s, v1.4s, v3.4s +; CHECK-NEXT: uzp2 v5.4s, v5.4s, v1.4s +; CHECK-NEXT: ext v0.16b, v0.16b, v6.16b, #8 +; CHECK-NEXT: mov v1.s[1], v4.s[1] +; CHECK-NEXT: mov v16.d[1], v7.d[1] +; CHECK-NEXT: mov v3.d[1], v0.d[1] +; CHECK-NEXT: mov v5.d[1], v2.d[1] +; CHECK-NEXT: mov v1.d[1], v6.d[1] +; CHECK-NEXT: add v0.4s, v5.4s, v16.4s +; CHECK-NEXT: sub v4.4s, v16.4s, v5.4s +; CHECK-NEXT: add v2.4s, v3.4s, v1.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #4 +; CHECK-NEXT: zip2 v6.4s, v0.4s, v4.4s +; CHECK-NEXT: zip2 v7.4s, v4.4s, v0.4s ; CHECK-NEXT: ext v5.16b, v2.16b, v2.16b, #4 -; CHECK-NEXT: zip2 v6.4s, v1.4s, v3.4s -; CHECK-NEXT: zip2 v7.4s, v3.4s, v1.4s -; CHECK-NEXT: zip2 v16.4s, v0.4s, v2.4s -; CHECK-NEXT: zip2 v17.4s, v2.4s, v0.4s -; CHECK-NEXT: zip1 v1.4s, v1.4s, v3.4s -; CHECK-NEXT: ext v18.16b, v4.16b, v3.16b, #8 -; CHECK-NEXT: ext v19.16b, v5.16b, v0.16b, #8 -; CHECK-NEXT: zip1 v0.4s, v2.4s, v0.4s +; CHECK-NEXT: zip2 v16.4s, v1.4s, v2.4s +; CHECK-NEXT: zip2 v17.4s, v2.4s, v1.4s +; CHECK-NEXT: zip1 v0.4s, v0.4s, v4.4s +; CHECK-NEXT: ext v18.16b, v3.16b, v4.16b, #8 +; CHECK-NEXT: ext v19.16b, v5.16b, v1.16b, #8 +; CHECK-NEXT: zip1 v1.4s, v2.4s, v1.4s ; CHECK-NEXT: add v2.4s, v16.4s, v7.4s -; CHECK-NEXT: sub v3.4s, v6.4s, v17.4s -; CHECK-NEXT: ext v4.16b, v18.16b, v4.16b, #4 -; CHECK-NEXT: ext v5.16b, v19.16b, v5.16b, #4 -; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s -; CHECK-NEXT: cmlt v1.8h, v3.8h, #0 +; CHECK-NEXT: sub v4.4s, v6.4s, v17.4s +; CHECK-NEXT: ext v3.16b, v18.16b, v3.16b, #4 ; CHECK-NEXT: cmlt v6.8h, v2.8h, #0 -; CHECK-NEXT: add v4.4s, v5.4s, v4.4s -; CHECK-NEXT: cmlt v5.8h, v0.8h, #0 +; CHECK-NEXT: ext v5.16b, v19.16b, v5.16b, #4 +; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s +; CHECK-NEXT: cmlt v1.8h, v4.8h, #0 ; CHECK-NEXT: add v2.4s, v6.4s, v2.4s -; CHECK-NEXT: add v3.4s, v1.4s, v3.4s -; CHECK-NEXT: cmlt v7.8h, v4.8h, #0 -; CHECK-NEXT: add v0.4s, v5.4s, v0.4s +; CHECK-NEXT: add v4.4s, v1.4s, v4.4s +; CHECK-NEXT: add v3.4s, v5.4s, v3.4s +; CHECK-NEXT: cmlt v5.8h, v0.8h, #0 ; CHECK-NEXT: eor v2.16b, v2.16b, v6.16b -; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b -; CHECK-NEXT: add v3.4s, v7.4s, v4.4s -; CHECK-NEXT: eor v0.16b, v0.16b, v5.16b +; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b +; CHECK-NEXT: cmlt v7.8h, v3.8h, #0 +; CHECK-NEXT: add v0.4s, v5.4s, v0.4s ; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: add v3.4s, v7.4s, v3.4s +; CHECK-NEXT: eor v0.16b, v0.16b, v5.16b ; CHECK-NEXT: eor v2.16b, v3.16b, v7.16b ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: add v0.4s, v2.4s, v0.4s diff --git a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll index 3685e9cf85bd6..2f490c8f3f20f 100644 --- a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll +++ b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll @@ -369,16 +369,16 @@ define void @transpose_s16_8x8_simpler2(ptr nocapture noundef %a) { ; CHECK: .Lfunc_begin8: ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ldp q0, q2, [x0] -; CHECK-NEXT: ldp q3, q4, [x0, #64] -; CHECK-NEXT: ldp q5, q6, [x0, #32] -; CHECK-NEXT: ldp q7, q16, [x0, #96] -; CHECK-NEXT: mov v0.h[5], v2.h[4] -; CHECK-NEXT: zip1 v2.8h, v3.8h, v4.8h -; CHECK-NEXT: zip1 v3.8h, v5.8h, v6.8h -; CHECK-NEXT: mov v7.h[5], v16.h[4] -; CHECK-NEXT: mov v0.s[1], v2.s[0] -; CHECK-NEXT: uzp1 v1.4s, v3.4s, v7.4s +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x0, #64] +; CHECK-NEXT: ldp q4, q5, [x0, #32] +; CHECK-NEXT: ldp q6, q7, [x0, #96] +; CHECK-NEXT: trn1 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: zip1 v1.8h, v2.8h, v3.8h +; CHECK-NEXT: zip1 v2.8h, v4.8h, v5.8h +; CHECK-NEXT: trn1 v3.8h, v6.8h, v7.8h +; CHECK-NEXT: trn1 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: uzp1 v1.4s, v2.4s, v3.4s ; CHECK-NEXT: zip2 v2.4s, v0.4s, v1.4s ; CHECK-NEXT: st2 { v0.2s, v1.2s }, [x0] ; CHECK-NEXT: str q2, [x0, #64]