diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index dc96b249c4e40..65a5c2157ec49 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -3846,7 +3846,7 @@ let Predicates = [HasSVE2] in { let Predicates = [HasSVE2orSME] in { // SVE2 vector splice (constructive) - defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">; + defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice", AArch64splice>; } // End HasSVE2orSME let Predicates = [HasSVE2] in { diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 02ee0fe924457..ea6c826382871 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -7245,11 +7245,33 @@ class sve2_int_perm_splice_cons sz8_64, string asm, let hasSideEffects = 0; } -multiclass sve2_int_perm_splice_cons { +multiclass sve2_int_perm_splice_cons { def _B : sve2_int_perm_splice_cons<0b00, asm, ZPR8, ZZ_b>; def _H : sve2_int_perm_splice_cons<0b01, asm, ZPR16, ZZ_h>; def _S : sve2_int_perm_splice_cons<0b10, asm, ZPR32, ZZ_s>; def _D : sve2_int_perm_splice_cons<0b11, asm, ZPR64, ZZ_d>; + + let AddedComplexity = 2 in { + foreach VT = [nxv16i8] in + def : Pat<(VT (op nxv16i1:$pred, VT:$zn1, VT:$zn2)), + (!cast(NAME # _B) + nxv16i1:$pred, (REG_SEQUENCE ZPR2, VT:$zn1, zsub0, VT:$zn2, zsub1))>; + + foreach VT = [nxv8i16, nxv8f16, nxv8bf16] in + def : Pat<(VT (op nxv8i1:$pred, VT:$zn1, VT:$zn2)), + (!cast(NAME # _H) + nxv8i1:$pred, (REG_SEQUENCE ZPR2, VT:$zn1, zsub0, VT:$zn2, zsub1))>; + + foreach VT = [nxv4i32, nxv4f16, nxv4f32, nxv4bf16] in + def : Pat<(VT (op nxv4i1:$pred, VT:$zn1, VT:$zn2)), + (!cast(NAME # _S) + nxv4i1:$pred, (REG_SEQUENCE ZPR2, VT:$zn1, zsub0, VT:$zn2, zsub1))>; + + foreach VT = [nxv2i64, nxv2f16, nxv2f32, nxv2f64, nxv2bf16] in + def : Pat<(VT (op nxv2i1:$pred, VT:$zn1, VT:$zn2)), + (!cast(NAME # _D) + nxv2i1:$pred, (REG_SEQUENCE ZPR2, VT:$zn1, zsub0, VT:$zn2, zsub1))>; + } } class sve_int_perm_rev sz8_64, bits<2> opc, string asm, diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll index c1810c678ea52..6e2ecfca9e963 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s -; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s +; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2 +; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,SME ; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -61,10 +61,10 @@ define <8 x i8> @concat_v8i8(<4 x i8> %op1, <4 x i8> %op2) { define <16 x i8> @concat_v16i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-LABEL: concat_v16i8: ; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1 ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 -; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: splice z0.b, p0, { z0.b, z1.b } ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; @@ -172,10 +172,10 @@ define <4 x i16> @concat_v4i16(<2 x i16> %op1, <2 x i16> %op2) { define <8 x i16> @concat_v8i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-LABEL: concat_v8i16: ; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1 ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 -; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: splice z0.h, p0, { z0.h, z1.h } ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; @@ -270,10 +270,10 @@ define <2 x i32> @concat_v2i32(<1 x i32> %op1, <1 x i32> %op2) { define <4 x i32> @concat_v4i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-LABEL: concat_v4i32: ; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1 ; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 -; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: splice z0.s, p0, { z0.s, z1.s } ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; @@ -340,10 +340,10 @@ define void @concat_v16i32(ptr %a, ptr %b, ptr %c) { define <2 x i64> @concat_v2i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-LABEL: concat_v2i64: ; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1 ; CHECK-NEXT: ptrue p0.d, vl1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 -; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: splice z0.d, p0, { z0.d, z1.d } ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; @@ -406,17 +406,33 @@ define void @concat_v8i64(ptr %a, ptr %b, ptr %c) { ; define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2) { -; CHECK-LABEL: concat_v4f16: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: mov z2.h, z1.h[1] -; CHECK-NEXT: mov z3.h, z0.h[1] -; CHECK-NEXT: zip1 z1.h, z1.h, z2.h -; CHECK-NEXT: zip1 z0.h, z0.h, z3.h -; CHECK-NEXT: zip1 z0.s, z0.s, z1.s -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 -; CHECK-NEXT: ret +; SVE2-LABEL: concat_v4f16: +; SVE2: // %bb.0: +; SVE2-NEXT: cnth x8 +; SVE2-NEXT: adrp x9, .LCPI15_0 +; SVE2-NEXT: adrp x10, .LCPI15_1 +; SVE2-NEXT: mov z2.h, w8 +; SVE2-NEXT: ldr q3, [x9, :lo12:.LCPI15_0] +; SVE2-NEXT: ldr q4, [x10, :lo12:.LCPI15_1] +; SVE2-NEXT: ptrue p0.h, vl8 +; SVE2-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: mad z2.h, p0/m, z3.h, z4.h +; SVE2-NEXT: tbl z0.h, { z0.h, z1.h }, z2.h +; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2-NEXT: ret +; +; SME-LABEL: concat_v4f16: +; SME: // %bb.0: +; SME-NEXT: // kill: def $d1 killed $d1 def $z1 +; SME-NEXT: // kill: def $d0 killed $d0 def $z0 +; SME-NEXT: mov z2.h, z1.h[1] +; SME-NEXT: mov z3.h, z0.h[1] +; SME-NEXT: zip1 z1.h, z1.h, z2.h +; SME-NEXT: zip1 z0.h, z0.h, z3.h +; SME-NEXT: zip1 z0.s, z0.s, z1.s +; SME-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SME-NEXT: ret ; ; NONEON-NOSVE-LABEL: concat_v4f16: ; NONEON-NOSVE: // %bb.0: @@ -436,10 +452,10 @@ define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2) { define <8 x half> @concat_v8f16(<4 x half> %op1, <4 x half> %op2) { ; CHECK-LABEL: concat_v8f16: ; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1 ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 -; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: splice z0.h, p0, { z0.h, z1.h } ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; @@ -534,10 +550,10 @@ define <2 x float> @concat_v2f32(<1 x float> %op1, <1 x float> %op2) { define <4 x float> @concat_v4f32(<2 x float> %op1, <2 x float> %op2) { ; CHECK-LABEL: concat_v4f32: ; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1 ; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 -; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: splice z0.s, p0, { z0.s, z1.s } ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; @@ -604,10 +620,10 @@ define void @concat_v16f32(ptr %a, ptr %b, ptr %c) { define <2 x double> @concat_v2f64(<1 x double> %op1, <1 x double> %op2) { ; CHECK-LABEL: concat_v2f64: ; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1 ; CHECK-NEXT: ptrue p0.d, vl1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 -; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: splice z0.d, p0, { z0.d, z1.d } ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll index f1771a753826c..2282e74af5d00 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE -; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2 -; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,SVE2 +; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s --check-prefixes=SVE +; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=SVE2 +; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=SVE2 ; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" @@ -842,16 +842,16 @@ define void @test_copysign_v4f32_v4f64(ptr %ap, ptr %bp) { ; ; SVE2-LABEL: test_copysign_v4f32_v4f64: ; SVE2: // %bb.0: -; SVE2-NEXT: ldp q0, q1, [x1] +; SVE2-NEXT: ldp q1, q0, [x1] ; SVE2-NEXT: ptrue p0.d -; SVE2-NEXT: ldr q2, [x0] -; SVE2-NEXT: fcvt z1.s, p0/m, z1.d ; SVE2-NEXT: fcvt z0.s, p0/m, z0.d +; SVE2-NEXT: fcvt z1.s, p0/m, z1.d ; SVE2-NEXT: ptrue p0.s, vl2 -; SVE2-NEXT: uzp1 z1.s, z1.s, z1.s -; SVE2-NEXT: uzp1 z0.s, z0.s, z0.s -; SVE2-NEXT: splice z0.s, p0, z0.s, z1.s +; SVE2-NEXT: uzp1 z3.s, z0.s, z0.s +; SVE2-NEXT: uzp1 z2.s, z1.s, z1.s ; SVE2-NEXT: mov z1.s, #0x7fffffff +; SVE2-NEXT: splice z0.s, p0, { z2.s, z3.s } +; SVE2-NEXT: ldr q2, [x0] ; SVE2-NEXT: bsl z2.d, z2.d, z0.d, z1.d ; SVE2-NEXT: str q2, [x0] ; SVE2-NEXT: ret @@ -1237,16 +1237,16 @@ define void @test_copysign_v8f16_v8f32(ptr %ap, ptr %bp) { ; ; SVE2-LABEL: test_copysign_v8f16_v8f32: ; SVE2: // %bb.0: -; SVE2-NEXT: ldp q0, q1, [x1] +; SVE2-NEXT: ldp q1, q0, [x1] ; SVE2-NEXT: ptrue p0.s -; SVE2-NEXT: ldr q2, [x0] -; SVE2-NEXT: fcvt z1.h, p0/m, z1.s ; SVE2-NEXT: fcvt z0.h, p0/m, z0.s +; SVE2-NEXT: fcvt z1.h, p0/m, z1.s ; SVE2-NEXT: ptrue p0.h, vl4 -; SVE2-NEXT: uzp1 z1.h, z1.h, z1.h -; SVE2-NEXT: uzp1 z0.h, z0.h, z0.h -; SVE2-NEXT: splice z0.h, p0, z0.h, z1.h +; SVE2-NEXT: uzp1 z3.h, z0.h, z0.h +; SVE2-NEXT: uzp1 z2.h, z1.h, z1.h ; SVE2-NEXT: mov z1.h, #32767 // =0x7fff +; SVE2-NEXT: splice z0.h, p0, { z2.h, z3.h } +; SVE2-NEXT: ldr q2, [x0] ; SVE2-NEXT: bsl z2.d, z2.d, z0.d, z1.d ; SVE2-NEXT: str q2, [x0] ; SVE2-NEXT: ret @@ -1349,5 +1349,3 @@ declare <8 x float> @llvm.copysign.v8f32(<8 x float> %a, <8 x float> %b) #0 declare <2 x double> @llvm.copysign.v2f64(<2 x double> %a, <2 x double> %b) #0 declare <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %b) #0 -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll index 516772b8ca664..1fdcd4f826870 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll @@ -1,7 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE -; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2 -; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,SVE2 +; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK +; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK ; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" @@ -26,19 +25,6 @@ define <4 x i8> @sdiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; -; NEON-NOSVE-LABEL: sdiv_v4i8: -; NEON-NOSVE: // %bb.0: -; NEON-NOSVE-NEXT: shl v0.4h, v0.4h, #8 -; NEON-NOSVE-NEXT: shl v1.4h, v1.4h, #8 -; NEON-NOSVE-NEXT: ptrue p0.s, vl4 -; NEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #8 -; NEON-NOSVE-NEXT: sshr v1.4h, v1.4h, #8 -; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s -; NEON-NOSVE-NEXT: xtn v0.4h, v0.4s -; NEON-NOSVE-NEXT: ret -; ; NONEON-NOSVE-LABEL: sdiv_v4i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: sub sp, sp, #32 @@ -85,27 +71,12 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h -; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b +; CHECK-NEXT: uzp1 z2.h, z0.h, z0.h +; CHECK-NEXT: splice z0.h, p0, { z1.h, z2.h } +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; -; NEON-NOSVE-LABEL: sdiv_v8i8: -; NEON-NOSVE: // %bb.0: -; NEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0 -; NEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0 -; NEON-NOSVE-NEXT: ptrue p0.s, vl4 -; NEON-NOSVE-NEXT: sshll2 v2.4s, v1.8h, #0 -; NEON-NOSVE-NEXT: sshll2 v3.4s, v0.8h, #0 -; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NEON-NOSVE-NEXT: sdivr z2.s, p0/m, z2.s, z3.s -; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s -; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; NEON-NOSVE-NEXT: xtn v0.8b, v0.8h -; NEON-NOSVE-NEXT: ret -; ; NONEON-NOSVE-LABEL: sdiv_v8i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: sub sp, sp, #32 @@ -177,45 +148,21 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z5.s -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h +; CHECK-NEXT: uzp1 z5.h, z2.h, z2.h ; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: uzp1 z1.h, z4.h, z4.h ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h -; CHECK-NEXT: splice z1.h, p0, z1.h, z2.h -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: splice z3.h, p0, z3.h, z0.h -; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b +; CHECK-NEXT: uzp1 z1.h, z3.h, z3.h +; CHECK-NEXT: uzp1 z2.h, z0.h, z0.h +; CHECK-NEXT: splice z0.h, p0, { z4.h, z5.h } +; CHECK-NEXT: splice z1.h, p0, { z1.h, z2.h } ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z1.b, z3.b, z3.b -; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b +; CHECK-NEXT: uzp1 z2.b, z0.b, z0.b +; CHECK-NEXT: uzp1 z3.b, z1.b, z1.b +; CHECK-NEXT: splice z0.b, p0, { z2.b, z3.b } ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; -; NEON-NOSVE-LABEL: sdiv_v16i8: -; NEON-NOSVE: // %bb.0: -; NEON-NOSVE-NEXT: sshll2 v2.8h, v1.16b, #0 -; NEON-NOSVE-NEXT: sshll2 v3.8h, v0.16b, #0 -; NEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0 -; NEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0 -; NEON-NOSVE-NEXT: ptrue p0.s, vl4 -; NEON-NOSVE-NEXT: sshll2 v4.4s, v2.8h, #0 -; NEON-NOSVE-NEXT: sshll2 v5.4s, v3.8h, #0 -; NEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 -; NEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 -; NEON-NOSVE-NEXT: sdivr z4.s, p0/m, z4.s, z5.s -; NEON-NOSVE-NEXT: sshll2 v5.4s, v0.8h, #0 -; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NEON-NOSVE-NEXT: sdivr z2.s, p0/m, z2.s, z3.s -; NEON-NOSVE-NEXT: sshll2 v3.4s, v1.8h, #0 -; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NEON-NOSVE-NEXT: sdivr z3.s, p0/m, z3.s, z5.s -; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s -; NEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v4.8h -; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v3.8h -; NEON-NOSVE-NEXT: uzp1 v0.16b, v0.16b, v1.16b -; NEON-NOSVE-NEXT: ret -; ; NONEON-NOSVE-LABEL: sdiv_v16i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! @@ -319,7 +266,6 @@ define void @sdiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: sunpklo z4.h, z2.b ; CHECK-NEXT: sunpklo z2.s, z3.h ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: sunpklo z5.s, z4.h ; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 ; CHECK-NEXT: sunpklo z3.s, z3.h @@ -328,7 +274,6 @@ define void @sdiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q5, [x0] ; CHECK-NEXT: sunpklo z16.h, z5.b ; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 -; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h ; CHECK-NEXT: sunpklo z5.h, z5.b ; CHECK-NEXT: sunpklo z18.s, z16.h ; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8 @@ -337,81 +282,36 @@ define void @sdiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: sunpklo z18.s, z5.h ; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 ; CHECK-NEXT: sunpklo z5.s, z5.h -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z16.s ; CHECK-NEXT: sunpklo z16.s, z6.h ; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 ; CHECK-NEXT: sunpklo z6.s, z6.h +; CHECK-NEXT: uzp1 z20.h, z17.h, z17.h ; CHECK-NEXT: sdivr z16.s, p0/m, z16.s, z18.s +; CHECK-NEXT: uzp1 z18.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z19.h, z1.h, z1.h +; CHECK-NEXT: uzp1 z21.h, z7.h, z7.h ; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z6.s -; CHECK-NEXT: uzp1 z6.h, z7.h, z7.h -; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h +; CHECK-NEXT: uzp1 z0.h, z16.h, z16.h ; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z4.s -; CHECK-NEXT: uzp1 z4.h, z17.h, z17.h ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h -; CHECK-NEXT: splice z4.h, p0, z4.h, z6.h -; CHECK-NEXT: splice z7.h, p0, z7.h, z5.h -; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: uzp1 z1.b, z4.b, z4.b -; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h -; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h -; CHECK-NEXT: uzp1 z3.b, z7.b, z7.b +; CHECK-NEXT: uzp1 z1.h, z5.h, z5.h +; CHECK-NEXT: uzp1 z4.h, z2.h, z2.h +; CHECK-NEXT: splice z2.h, p0, { z20.h, z21.h } +; CHECK-NEXT: splice z0.h, p0, { z0.h, z1.h } +; CHECK-NEXT: uzp1 z5.h, z3.h, z3.h +; CHECK-NEXT: splice z3.h, p0, { z18.h, z19.h } +; CHECK-NEXT: splice z1.h, p0, { z4.h, z5.h } +; CHECK-NEXT: uzp1 z4.b, z2.b, z2.b ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: splice z1.b, p0, z1.b, z3.b -; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b -; CHECK-NEXT: splice z0.b, p0, z0.b, z2.b -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: uzp1 z2.b, z3.b, z3.b +; CHECK-NEXT: uzp1 z5.b, z0.b, z0.b +; CHECK-NEXT: uzp1 z3.b, z1.b, z1.b +; CHECK-NEXT: splice z0.b, p0, { z4.b, z5.b } +; CHECK-NEXT: splice z1.b, p0, { z2.b, z3.b } +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; -; NEON-NOSVE-LABEL: sdiv_v32i8: -; NEON-NOSVE: // %bb.0: -; NEON-NOSVE-NEXT: ldp q6, q3, [x1] -; NEON-NOSVE-NEXT: ptrue p0.s, vl4 -; NEON-NOSVE-NEXT: ldr q2, [x0, #16] -; NEON-NOSVE-NEXT: sshll2 v1.8h, v3.16b, #0 -; NEON-NOSVE-NEXT: sshll2 v4.8h, v2.16b, #0 -; NEON-NOSVE-NEXT: sshll v3.8h, v3.8b, #0 -; NEON-NOSVE-NEXT: sshll v2.8h, v2.8b, #0 -; NEON-NOSVE-NEXT: sshll2 v7.8h, v6.16b, #0 -; NEON-NOSVE-NEXT: sshll v6.8h, v6.8b, #0 -; NEON-NOSVE-NEXT: sshll2 v0.4s, v1.8h, #0 -; NEON-NOSVE-NEXT: sshll2 v5.4s, v4.8h, #0 -; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NEON-NOSVE-NEXT: sshll v4.4s, v4.4h, #0 -; NEON-NOSVE-NEXT: sshll2 v17.4s, v7.8h, #0 -; NEON-NOSVE-NEXT: sshll v7.4s, v7.4h, #0 -; NEON-NOSVE-NEXT: sdivr z0.s, p0/m, z0.s, z5.s -; NEON-NOSVE-NEXT: sshll2 v5.4s, v2.8h, #0 -; NEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 -; NEON-NOSVE-NEXT: sdivr z1.s, p0/m, z1.s, z4.s -; NEON-NOSVE-NEXT: sshll2 v4.4s, v3.8h, #0 -; NEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 -; NEON-NOSVE-NEXT: sdivr z4.s, p0/m, z4.s, z5.s -; NEON-NOSVE-NEXT: ldr q5, [x0] -; NEON-NOSVE-NEXT: sshll2 v16.8h, v5.16b, #0 -; NEON-NOSVE-NEXT: sshll v5.8h, v5.8b, #0 -; NEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h -; NEON-NOSVE-NEXT: sshll2 v18.4s, v16.8h, #0 -; NEON-NOSVE-NEXT: sshll v16.4s, v16.4h, #0 -; NEON-NOSVE-NEXT: sdivr z17.s, p0/m, z17.s, z18.s -; NEON-NOSVE-NEXT: sshll2 v18.4s, v5.8h, #0 -; NEON-NOSVE-NEXT: sshll v5.4s, v5.4h, #0 -; NEON-NOSVE-NEXT: sdivr z7.s, p0/m, z7.s, z16.s -; NEON-NOSVE-NEXT: sshll2 v16.4s, v6.8h, #0 -; NEON-NOSVE-NEXT: sshll v6.4s, v6.4h, #0 -; NEON-NOSVE-NEXT: sdivr z16.s, p0/m, z16.s, z18.s -; NEON-NOSVE-NEXT: sdiv z5.s, p0/m, z5.s, z6.s -; NEON-NOSVE-NEXT: sdiv z2.s, p0/m, z2.s, z3.s -; NEON-NOSVE-NEXT: uzp1 v3.8h, v7.8h, v17.8h -; NEON-NOSVE-NEXT: uzp1 v5.8h, v5.8h, v16.8h -; NEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v4.8h -; NEON-NOSVE-NEXT: uzp1 v2.16b, v5.16b, v3.16b -; NEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b -; NEON-NOSVE-NEXT: stp q2, q0, [x0] -; NEON-NOSVE-NEXT: ret -; ; NONEON-NOSVE-LABEL: sdiv_v32i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: sub sp, sp, #96 @@ -571,17 +471,6 @@ define <2 x i16> @sdiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; -; NEON-NOSVE-LABEL: sdiv_v2i16: -; NEON-NOSVE: // %bb.0: -; NEON-NOSVE-NEXT: shl v1.2s, v1.2s, #16 -; NEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16 -; NEON-NOSVE-NEXT: ptrue p0.s, vl2 -; NEON-NOSVE-NEXT: sshr v1.2s, v1.2s, #16 -; NEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16 -; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s -; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $z0 -; NEON-NOSVE-NEXT: ret -; ; NONEON-NOSVE-LABEL: sdiv_v2i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: sub sp, sp, #32 @@ -614,15 +503,6 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; -; NEON-NOSVE-LABEL: sdiv_v4i16: -; NEON-NOSVE: // %bb.0: -; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NEON-NOSVE-NEXT: ptrue p0.s, vl4 -; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s -; NEON-NOSVE-NEXT: xtn v0.4h, v0.4s -; NEON-NOSVE-NEXT: ret -; ; NONEON-NOSVE-LABEL: sdiv_v4i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: sub sp, sp, #32 @@ -664,26 +544,14 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: sdivr z1.s, p0/m, z1.s, z0.s +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z0.h, z2.h, z2.h -; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h -; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h +; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z2.h, z0.h, z0.h +; CHECK-NEXT: splice z0.h, p0, { z1.h, z2.h } ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; -; NEON-NOSVE-LABEL: sdiv_v8i16: -; NEON-NOSVE: // %bb.0: -; NEON-NOSVE-NEXT: sshll2 v2.4s, v1.8h, #0 -; NEON-NOSVE-NEXT: sshll2 v3.4s, v0.8h, #0 -; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NEON-NOSVE-NEXT: ptrue p0.s, vl4 -; NEON-NOSVE-NEXT: sdivr z2.s, p0/m, z2.s, z3.s -; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s -; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; NEON-NOSVE-NEXT: ret -; ; NONEON-NOSVE-LABEL: sdiv_v8i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! @@ -748,41 +616,18 @@ define void @sdiv_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 ; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: sdivr z5.s, p0/m, z5.s, z6.s -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z4.s +; CHECK-NEXT: uzp1 z4.h, z5.h, z5.h ; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: uzp1 z1.h, z5.h, z5.h +; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h -; CHECK-NEXT: splice z1.h, p0, z1.h, z3.h -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: splice z2.h, p0, z2.h, z0.h -; CHECK-NEXT: stp q1, q2, [x0] +; CHECK-NEXT: uzp1 z5.h, z3.h, z3.h +; CHECK-NEXT: uzp1 z2.h, z0.h, z0.h +; CHECK-NEXT: splice z0.h, p0, { z4.h, z5.h } +; CHECK-NEXT: splice z1.h, p0, { z1.h, z2.h } +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; -; NEON-NOSVE-LABEL: sdiv_v16i16: -; NEON-NOSVE: // %bb.0: -; NEON-NOSVE-NEXT: ldp q4, q1, [x1] -; NEON-NOSVE-NEXT: ptrue p0.s, vl4 -; NEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NEON-NOSVE-NEXT: sshll2 v2.4s, v1.8h, #0 -; NEON-NOSVE-NEXT: sshll2 v3.4s, v0.8h, #0 -; NEON-NOSVE-NEXT: sshll2 v5.4s, v4.8h, #0 -; NEON-NOSVE-NEXT: sshll v4.4s, v4.4h, #0 -; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NEON-NOSVE-NEXT: sdivr z2.s, p0/m, z2.s, z3.s -; NEON-NOSVE-NEXT: ldr q3, [x0] -; NEON-NOSVE-NEXT: sshll2 v6.4s, v3.8h, #0 -; NEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 -; NEON-NOSVE-NEXT: sdivr z5.s, p0/m, z5.s, z6.s -; NEON-NOSVE-NEXT: sdiv z3.s, p0/m, z3.s, z4.s -; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s -; NEON-NOSVE-NEXT: uzp1 v1.8h, v3.8h, v5.8h -; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; NEON-NOSVE-NEXT: stp q1, q0, [x0] -; NEON-NOSVE-NEXT: ret -; ; NONEON-NOSVE-LABEL: sdiv_v16i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: sub sp, sp, #96 @@ -876,15 +721,6 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; -; NEON-NOSVE-LABEL: sdiv_v2i32: -; NEON-NOSVE: // %bb.0: -; NEON-NOSVE-NEXT: ptrue p0.s, vl2 -; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $z0 -; NEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $z1 -; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s -; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $z0 -; NEON-NOSVE-NEXT: ret -; ; NONEON-NOSVE-LABEL: sdiv_v2i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: sub sp, sp, #32 @@ -913,15 +749,6 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; -; NEON-NOSVE-LABEL: sdiv_v4i32: -; NEON-NOSVE: // %bb.0: -; NEON-NOSVE-NEXT: ptrue p0.s, vl4 -; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 def $z0 -; NEON-NOSVE-NEXT: // kill: def $q1 killed $q1 def $z1 -; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s -; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 killed $z0 -; NEON-NOSVE-NEXT: ret -; ; NONEON-NOSVE-LABEL: sdiv_v4i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! @@ -957,17 +784,6 @@ define void @sdiv_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; -; NEON-NOSVE-LABEL: sdiv_v8i32: -; NEON-NOSVE: // %bb.0: -; NEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NEON-NOSVE-NEXT: ptrue p0.s, vl4 -; NEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NEON-NOSVE-NEXT: sdivr z0.s, p0/m, z0.s, z1.s -; NEON-NOSVE-NEXT: movprfx z1, z2 -; NEON-NOSVE-NEXT: sdiv z1.s, p0/m, z1.s, z3.s -; NEON-NOSVE-NEXT: stp q0, q1, [x0] -; NEON-NOSVE-NEXT: ret -; ; NONEON-NOSVE-LABEL: sdiv_v8i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: sub sp, sp, #96 @@ -1021,15 +837,6 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; -; NEON-NOSVE-LABEL: sdiv_v1i64: -; NEON-NOSVE: // %bb.0: -; NEON-NOSVE-NEXT: ptrue p0.d, vl1 -; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $z0 -; NEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $z1 -; NEON-NOSVE-NEXT: sdiv z0.d, p0/m, z0.d, z1.d -; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $z0 -; NEON-NOSVE-NEXT: ret -; ; NONEON-NOSVE-LABEL: sdiv_v1i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: sub sp, sp, #16 @@ -1055,15 +862,6 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; -; NEON-NOSVE-LABEL: sdiv_v2i64: -; NEON-NOSVE: // %bb.0: -; NEON-NOSVE-NEXT: ptrue p0.d, vl2 -; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 def $z0 -; NEON-NOSVE-NEXT: // kill: def $q1 killed $q1 def $z1 -; NEON-NOSVE-NEXT: sdiv z0.d, p0/m, z0.d, z1.d -; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 killed $z0 -; NEON-NOSVE-NEXT: ret -; ; NONEON-NOSVE-LABEL: sdiv_v2i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! @@ -1093,17 +891,6 @@ define void @sdiv_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; -; NEON-NOSVE-LABEL: sdiv_v4i64: -; NEON-NOSVE: // %bb.0: -; NEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NEON-NOSVE-NEXT: ptrue p0.d, vl2 -; NEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NEON-NOSVE-NEXT: sdivr z0.d, p0/m, z0.d, z1.d -; NEON-NOSVE-NEXT: movprfx z1, z2 -; NEON-NOSVE-NEXT: sdiv z1.d, p0/m, z1.d, z3.d -; NEON-NOSVE-NEXT: stp q0, q1, [x0] -; NEON-NOSVE-NEXT: ret -; ; NONEON-NOSVE-LABEL: sdiv_v4i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: sub sp, sp, #96 @@ -1135,9 +922,7 @@ define void @sdiv_v4i64(ptr %a, ptr %b) { ret void } -; ; UDIV -; define <4 x i8> @udiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-LABEL: udiv_v4i8: @@ -1154,17 +939,6 @@ define <4 x i8> @udiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; -; NEON-NOSVE-LABEL: udiv_v4i8: -; NEON-NOSVE: // %bb.0: -; NEON-NOSVE-NEXT: bic v0.4h, #255, lsl #8 -; NEON-NOSVE-NEXT: bic v1.4h, #255, lsl #8 -; NEON-NOSVE-NEXT: ptrue p0.s, vl4 -; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s -; NEON-NOSVE-NEXT: xtn v0.4h, v0.4s -; NEON-NOSVE-NEXT: ret -; ; NONEON-NOSVE-LABEL: udiv_v4i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: sub sp, sp, #32 @@ -1211,27 +985,12 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h -; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b +; CHECK-NEXT: uzp1 z2.h, z0.h, z0.h +; CHECK-NEXT: splice z0.h, p0, { z1.h, z2.h } +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; -; NEON-NOSVE-LABEL: udiv_v8i8: -; NEON-NOSVE: // %bb.0: -; NEON-NOSVE-NEXT: ushll v1.8h, v1.8b, #0 -; NEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 -; NEON-NOSVE-NEXT: ptrue p0.s, vl4 -; NEON-NOSVE-NEXT: ushll2 v2.4s, v1.8h, #0 -; NEON-NOSVE-NEXT: ushll2 v3.4s, v0.8h, #0 -; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NEON-NOSVE-NEXT: udivr z2.s, p0/m, z2.s, z3.s -; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s -; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; NEON-NOSVE-NEXT: xtn v0.8b, v0.8h -; NEON-NOSVE-NEXT: ret -; ; NONEON-NOSVE-LABEL: udiv_v8i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: sub sp, sp, #32 @@ -1303,45 +1062,21 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 ; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z5.s -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h +; CHECK-NEXT: uzp1 z5.h, z2.h, z2.h ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: uzp1 z1.h, z4.h, z4.h ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h -; CHECK-NEXT: splice z1.h, p0, z1.h, z2.h -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: splice z3.h, p0, z3.h, z0.h -; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b +; CHECK-NEXT: uzp1 z1.h, z3.h, z3.h +; CHECK-NEXT: uzp1 z2.h, z0.h, z0.h +; CHECK-NEXT: splice z0.h, p0, { z4.h, z5.h } +; CHECK-NEXT: splice z1.h, p0, { z1.h, z2.h } ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z1.b, z3.b, z3.b -; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b +; CHECK-NEXT: uzp1 z2.b, z0.b, z0.b +; CHECK-NEXT: uzp1 z3.b, z1.b, z1.b +; CHECK-NEXT: splice z0.b, p0, { z2.b, z3.b } ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; -; NEON-NOSVE-LABEL: udiv_v16i8: -; NEON-NOSVE: // %bb.0: -; NEON-NOSVE-NEXT: ushll2 v2.8h, v1.16b, #0 -; NEON-NOSVE-NEXT: ushll2 v3.8h, v0.16b, #0 -; NEON-NOSVE-NEXT: ushll v1.8h, v1.8b, #0 -; NEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 -; NEON-NOSVE-NEXT: ptrue p0.s, vl4 -; NEON-NOSVE-NEXT: ushll2 v4.4s, v2.8h, #0 -; NEON-NOSVE-NEXT: ushll2 v5.4s, v3.8h, #0 -; NEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 -; NEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 -; NEON-NOSVE-NEXT: udivr z4.s, p0/m, z4.s, z5.s -; NEON-NOSVE-NEXT: ushll2 v5.4s, v0.8h, #0 -; NEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NEON-NOSVE-NEXT: udivr z2.s, p0/m, z2.s, z3.s -; NEON-NOSVE-NEXT: ushll2 v3.4s, v1.8h, #0 -; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NEON-NOSVE-NEXT: udivr z3.s, p0/m, z3.s, z5.s -; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s -; NEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v4.8h -; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v3.8h -; NEON-NOSVE-NEXT: uzp1 v0.16b, v0.16b, v1.16b -; NEON-NOSVE-NEXT: ret -; ; NONEON-NOSVE-LABEL: udiv_v16i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! @@ -1445,7 +1180,6 @@ define void @udiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: uunpklo z4.h, z2.b ; CHECK-NEXT: uunpklo z2.s, z3.h ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: uunpklo z5.s, z4.h ; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 ; CHECK-NEXT: uunpklo z3.s, z3.h @@ -1454,7 +1188,6 @@ define void @udiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q5, [x0] ; CHECK-NEXT: uunpklo z16.h, z5.b ; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 -; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h ; CHECK-NEXT: uunpklo z5.h, z5.b ; CHECK-NEXT: uunpklo z18.s, z16.h ; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8 @@ -1463,81 +1196,36 @@ define void @udiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: uunpklo z18.s, z5.h ; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 ; CHECK-NEXT: uunpklo z5.s, z5.h -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z16.s ; CHECK-NEXT: uunpklo z16.s, z6.h ; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 ; CHECK-NEXT: uunpklo z6.s, z6.h +; CHECK-NEXT: uzp1 z20.h, z17.h, z17.h ; CHECK-NEXT: udivr z16.s, p0/m, z16.s, z18.s +; CHECK-NEXT: uzp1 z18.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z19.h, z1.h, z1.h +; CHECK-NEXT: uzp1 z21.h, z7.h, z7.h ; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z6.s -; CHECK-NEXT: uzp1 z6.h, z7.h, z7.h -; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h +; CHECK-NEXT: uzp1 z0.h, z16.h, z16.h ; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z4.s -; CHECK-NEXT: uzp1 z4.h, z17.h, z17.h ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h -; CHECK-NEXT: splice z4.h, p0, z4.h, z6.h -; CHECK-NEXT: splice z7.h, p0, z7.h, z5.h -; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: uzp1 z1.b, z4.b, z4.b -; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h -; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h -; CHECK-NEXT: uzp1 z3.b, z7.b, z7.b +; CHECK-NEXT: uzp1 z1.h, z5.h, z5.h +; CHECK-NEXT: uzp1 z4.h, z2.h, z2.h +; CHECK-NEXT: splice z2.h, p0, { z20.h, z21.h } +; CHECK-NEXT: splice z0.h, p0, { z0.h, z1.h } +; CHECK-NEXT: uzp1 z5.h, z3.h, z3.h +; CHECK-NEXT: splice z3.h, p0, { z18.h, z19.h } +; CHECK-NEXT: splice z1.h, p0, { z4.h, z5.h } +; CHECK-NEXT: uzp1 z4.b, z2.b, z2.b ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: splice z1.b, p0, z1.b, z3.b -; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b -; CHECK-NEXT: splice z0.b, p0, z0.b, z2.b -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: uzp1 z2.b, z3.b, z3.b +; CHECK-NEXT: uzp1 z5.b, z0.b, z0.b +; CHECK-NEXT: uzp1 z3.b, z1.b, z1.b +; CHECK-NEXT: splice z0.b, p0, { z4.b, z5.b } +; CHECK-NEXT: splice z1.b, p0, { z2.b, z3.b } +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; -; NEON-NOSVE-LABEL: udiv_v32i8: -; NEON-NOSVE: // %bb.0: -; NEON-NOSVE-NEXT: ldp q6, q3, [x1] -; NEON-NOSVE-NEXT: ptrue p0.s, vl4 -; NEON-NOSVE-NEXT: ldr q2, [x0, #16] -; NEON-NOSVE-NEXT: ushll2 v1.8h, v3.16b, #0 -; NEON-NOSVE-NEXT: ushll2 v4.8h, v2.16b, #0 -; NEON-NOSVE-NEXT: ushll v3.8h, v3.8b, #0 -; NEON-NOSVE-NEXT: ushll v2.8h, v2.8b, #0 -; NEON-NOSVE-NEXT: ushll2 v7.8h, v6.16b, #0 -; NEON-NOSVE-NEXT: ushll v6.8h, v6.8b, #0 -; NEON-NOSVE-NEXT: ushll2 v0.4s, v1.8h, #0 -; NEON-NOSVE-NEXT: ushll2 v5.4s, v4.8h, #0 -; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NEON-NOSVE-NEXT: ushll v4.4s, v4.4h, #0 -; NEON-NOSVE-NEXT: ushll2 v17.4s, v7.8h, #0 -; NEON-NOSVE-NEXT: ushll v7.4s, v7.4h, #0 -; NEON-NOSVE-NEXT: udivr z0.s, p0/m, z0.s, z5.s -; NEON-NOSVE-NEXT: ushll2 v5.4s, v2.8h, #0 -; NEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 -; NEON-NOSVE-NEXT: udivr z1.s, p0/m, z1.s, z4.s -; NEON-NOSVE-NEXT: ushll2 v4.4s, v3.8h, #0 -; NEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 -; NEON-NOSVE-NEXT: udivr z4.s, p0/m, z4.s, z5.s -; NEON-NOSVE-NEXT: ldr q5, [x0] -; NEON-NOSVE-NEXT: ushll2 v16.8h, v5.16b, #0 -; NEON-NOSVE-NEXT: ushll v5.8h, v5.8b, #0 -; NEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h -; NEON-NOSVE-NEXT: ushll2 v18.4s, v16.8h, #0 -; NEON-NOSVE-NEXT: ushll v16.4s, v16.4h, #0 -; NEON-NOSVE-NEXT: udivr z17.s, p0/m, z17.s, z18.s -; NEON-NOSVE-NEXT: ushll2 v18.4s, v5.8h, #0 -; NEON-NOSVE-NEXT: ushll v5.4s, v5.4h, #0 -; NEON-NOSVE-NEXT: udivr z7.s, p0/m, z7.s, z16.s -; NEON-NOSVE-NEXT: ushll2 v16.4s, v6.8h, #0 -; NEON-NOSVE-NEXT: ushll v6.4s, v6.4h, #0 -; NEON-NOSVE-NEXT: udivr z16.s, p0/m, z16.s, z18.s -; NEON-NOSVE-NEXT: udiv z5.s, p0/m, z5.s, z6.s -; NEON-NOSVE-NEXT: udiv z2.s, p0/m, z2.s, z3.s -; NEON-NOSVE-NEXT: uzp1 v3.8h, v7.8h, v17.8h -; NEON-NOSVE-NEXT: uzp1 v5.8h, v5.8h, v16.8h -; NEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v4.8h -; NEON-NOSVE-NEXT: uzp1 v2.16b, v5.16b, v3.16b -; NEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b -; NEON-NOSVE-NEXT: stp q2, q0, [x0] -; NEON-NOSVE-NEXT: ret -; ; NONEON-NOSVE-LABEL: udiv_v32i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: sub sp, sp, #96 @@ -1697,16 +1385,6 @@ define <2 x i16> @udiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; -; NEON-NOSVE-LABEL: udiv_v2i16: -; NEON-NOSVE: // %bb.0: -; NEON-NOSVE-NEXT: movi d2, #0x00ffff0000ffff -; NEON-NOSVE-NEXT: ptrue p0.s, vl2 -; NEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b -; NEON-NOSVE-NEXT: and v0.8b, v0.8b, v2.8b -; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s -; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $z0 -; NEON-NOSVE-NEXT: ret -; ; NONEON-NOSVE-LABEL: udiv_v2i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: sub sp, sp, #32 @@ -1739,15 +1417,6 @@ define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; -; NEON-NOSVE-LABEL: udiv_v4i16: -; NEON-NOSVE: // %bb.0: -; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NEON-NOSVE-NEXT: ptrue p0.s, vl4 -; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s -; NEON-NOSVE-NEXT: xtn v0.4h, v0.4s -; NEON-NOSVE-NEXT: ret -; ; NONEON-NOSVE-LABEL: udiv_v4i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: sub sp, sp, #32 @@ -1789,26 +1458,14 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: udivr z1.s, p0/m, z1.s, z0.s +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z0.h, z2.h, z2.h -; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h -; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h +; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z2.h, z0.h, z0.h +; CHECK-NEXT: splice z0.h, p0, { z1.h, z2.h } ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; -; NEON-NOSVE-LABEL: udiv_v8i16: -; NEON-NOSVE: // %bb.0: -; NEON-NOSVE-NEXT: ushll2 v2.4s, v1.8h, #0 -; NEON-NOSVE-NEXT: ushll2 v3.4s, v0.8h, #0 -; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NEON-NOSVE-NEXT: ptrue p0.s, vl4 -; NEON-NOSVE-NEXT: udivr z2.s, p0/m, z2.s, z3.s -; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s -; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; NEON-NOSVE-NEXT: ret -; ; NONEON-NOSVE-LABEL: udiv_v8i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! @@ -1873,41 +1530,18 @@ define void @udiv_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 ; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: udivr z5.s, p0/m, z5.s, z6.s -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z4.s +; CHECK-NEXT: uzp1 z4.h, z5.h, z5.h ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: uzp1 z1.h, z5.h, z5.h +; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h -; CHECK-NEXT: splice z1.h, p0, z1.h, z3.h -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: splice z2.h, p0, z2.h, z0.h -; CHECK-NEXT: stp q1, q2, [x0] +; CHECK-NEXT: uzp1 z5.h, z3.h, z3.h +; CHECK-NEXT: uzp1 z2.h, z0.h, z0.h +; CHECK-NEXT: splice z0.h, p0, { z4.h, z5.h } +; CHECK-NEXT: splice z1.h, p0, { z1.h, z2.h } +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; -; NEON-NOSVE-LABEL: udiv_v16i16: -; NEON-NOSVE: // %bb.0: -; NEON-NOSVE-NEXT: ldp q4, q1, [x1] -; NEON-NOSVE-NEXT: ptrue p0.s, vl4 -; NEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NEON-NOSVE-NEXT: ushll2 v2.4s, v1.8h, #0 -; NEON-NOSVE-NEXT: ushll2 v3.4s, v0.8h, #0 -; NEON-NOSVE-NEXT: ushll2 v5.4s, v4.8h, #0 -; NEON-NOSVE-NEXT: ushll v4.4s, v4.4h, #0 -; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NEON-NOSVE-NEXT: udivr z2.s, p0/m, z2.s, z3.s -; NEON-NOSVE-NEXT: ldr q3, [x0] -; NEON-NOSVE-NEXT: ushll2 v6.4s, v3.8h, #0 -; NEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 -; NEON-NOSVE-NEXT: udivr z5.s, p0/m, z5.s, z6.s -; NEON-NOSVE-NEXT: udiv z3.s, p0/m, z3.s, z4.s -; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s -; NEON-NOSVE-NEXT: uzp1 v1.8h, v3.8h, v5.8h -; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; NEON-NOSVE-NEXT: stp q1, q0, [x0] -; NEON-NOSVE-NEXT: ret -; ; NONEON-NOSVE-LABEL: udiv_v16i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: sub sp, sp, #96 @@ -2001,15 +1635,6 @@ define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; -; NEON-NOSVE-LABEL: udiv_v2i32: -; NEON-NOSVE: // %bb.0: -; NEON-NOSVE-NEXT: ptrue p0.s, vl2 -; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $z0 -; NEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $z1 -; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s -; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $z0 -; NEON-NOSVE-NEXT: ret -; ; NONEON-NOSVE-LABEL: udiv_v2i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: sub sp, sp, #32 @@ -2038,15 +1663,6 @@ define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; -; NEON-NOSVE-LABEL: udiv_v4i32: -; NEON-NOSVE: // %bb.0: -; NEON-NOSVE-NEXT: ptrue p0.s, vl4 -; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 def $z0 -; NEON-NOSVE-NEXT: // kill: def $q1 killed $q1 def $z1 -; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s -; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 killed $z0 -; NEON-NOSVE-NEXT: ret -; ; NONEON-NOSVE-LABEL: udiv_v4i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! @@ -2082,17 +1698,6 @@ define void @udiv_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; -; NEON-NOSVE-LABEL: udiv_v8i32: -; NEON-NOSVE: // %bb.0: -; NEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NEON-NOSVE-NEXT: ptrue p0.s, vl4 -; NEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NEON-NOSVE-NEXT: udivr z0.s, p0/m, z0.s, z1.s -; NEON-NOSVE-NEXT: movprfx z1, z2 -; NEON-NOSVE-NEXT: udiv z1.s, p0/m, z1.s, z3.s -; NEON-NOSVE-NEXT: stp q0, q1, [x0] -; NEON-NOSVE-NEXT: ret -; ; NONEON-NOSVE-LABEL: udiv_v8i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: sub sp, sp, #96 @@ -2146,15 +1751,6 @@ define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; -; NEON-NOSVE-LABEL: udiv_v1i64: -; NEON-NOSVE: // %bb.0: -; NEON-NOSVE-NEXT: ptrue p0.d, vl1 -; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $z0 -; NEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $z1 -; NEON-NOSVE-NEXT: udiv z0.d, p0/m, z0.d, z1.d -; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $z0 -; NEON-NOSVE-NEXT: ret -; ; NONEON-NOSVE-LABEL: udiv_v1i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: sub sp, sp, #16 @@ -2180,15 +1776,6 @@ define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; -; NEON-NOSVE-LABEL: udiv_v2i64: -; NEON-NOSVE: // %bb.0: -; NEON-NOSVE-NEXT: ptrue p0.d, vl2 -; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 def $z0 -; NEON-NOSVE-NEXT: // kill: def $q1 killed $q1 def $z1 -; NEON-NOSVE-NEXT: udiv z0.d, p0/m, z0.d, z1.d -; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 killed $z0 -; NEON-NOSVE-NEXT: ret -; ; NONEON-NOSVE-LABEL: udiv_v2i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! @@ -2218,17 +1805,6 @@ define void @udiv_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; -; NEON-NOSVE-LABEL: udiv_v4i64: -; NEON-NOSVE: // %bb.0: -; NEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NEON-NOSVE-NEXT: ptrue p0.d, vl2 -; NEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NEON-NOSVE-NEXT: udivr z0.d, p0/m, z0.d, z1.d -; NEON-NOSVE-NEXT: movprfx z1, z2 -; NEON-NOSVE-NEXT: udiv z1.d, p0/m, z1.d, z3.d -; NEON-NOSVE-NEXT: stp q0, q1, [x0] -; NEON-NOSVE-NEXT: ret -; ; NONEON-NOSVE-LABEL: udiv_v4i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: sub sp, sp, #96 @@ -2261,64 +1837,22 @@ define void @udiv_v4i64(ptr %a, ptr %b) { } define void @udiv_constantsplat_v8i32(ptr %a) { -; SVE-LABEL: udiv_constantsplat_v8i32: -; SVE: // %bb.0: -; SVE-NEXT: mov w8, #8969 // =0x2309 -; SVE-NEXT: ldp q1, q2, [x0] -; SVE-NEXT: movk w8, #22765, lsl #16 -; SVE-NEXT: ptrue p0.s, vl4 -; SVE-NEXT: mov z0.s, w8 -; SVE-NEXT: movprfx z3, z1 -; SVE-NEXT: umulh z3.s, p0/m, z3.s, z0.s -; SVE-NEXT: umulh z0.s, p0/m, z0.s, z2.s -; SVE-NEXT: sub z1.s, z1.s, z3.s -; SVE-NEXT: sub z2.s, z2.s, z0.s -; SVE-NEXT: lsr z1.s, z1.s, #1 -; SVE-NEXT: lsr z2.s, z2.s, #1 -; SVE-NEXT: add z1.s, z1.s, z3.s -; SVE-NEXT: add z0.s, z2.s, z0.s -; SVE-NEXT: lsr z1.s, z1.s, #6 -; SVE-NEXT: lsr z0.s, z0.s, #6 -; SVE-NEXT: stp q1, q0, [x0] -; SVE-NEXT: ret -; -; SVE2-LABEL: udiv_constantsplat_v8i32: -; SVE2: // %bb.0: -; SVE2-NEXT: mov w8, #8969 // =0x2309 -; SVE2-NEXT: ldp q1, q2, [x0] -; SVE2-NEXT: movk w8, #22765, lsl #16 -; SVE2-NEXT: mov z0.s, w8 -; SVE2-NEXT: umulh z3.s, z1.s, z0.s -; SVE2-NEXT: umulh z0.s, z2.s, z0.s -; SVE2-NEXT: sub z1.s, z1.s, z3.s -; SVE2-NEXT: sub z2.s, z2.s, z0.s -; SVE2-NEXT: usra z3.s, z1.s, #1 -; SVE2-NEXT: usra z0.s, z2.s, #1 -; SVE2-NEXT: lsr z1.s, z3.s, #6 -; SVE2-NEXT: lsr z0.s, z0.s, #6 -; SVE2-NEXT: stp q1, q0, [x0] -; SVE2-NEXT: ret -; -; NEON-NOSVE-LABEL: udiv_constantsplat_v8i32: -; NEON-NOSVE: // %bb.0: -; NEON-NOSVE-NEXT: mov w8, #8969 // =0x2309 -; NEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NEON-NOSVE-NEXT: movk w8, #22765, lsl #16 -; NEON-NOSVE-NEXT: dup v0.4s, w8 -; NEON-NOSVE-NEXT: umull2 v3.2d, v1.4s, v0.4s -; NEON-NOSVE-NEXT: umull v4.2d, v1.2s, v0.2s -; NEON-NOSVE-NEXT: umull2 v5.2d, v2.4s, v0.4s -; NEON-NOSVE-NEXT: umull v0.2d, v2.2s, v0.2s -; NEON-NOSVE-NEXT: uzp2 v3.4s, v4.4s, v3.4s -; NEON-NOSVE-NEXT: uzp2 v0.4s, v0.4s, v5.4s -; NEON-NOSVE-NEXT: sub v1.4s, v1.4s, v3.4s -; NEON-NOSVE-NEXT: sub v2.4s, v2.4s, v0.4s -; NEON-NOSVE-NEXT: usra v3.4s, v1.4s, #1 -; NEON-NOSVE-NEXT: usra v0.4s, v2.4s, #1 -; NEON-NOSVE-NEXT: ushr v1.4s, v3.4s, #6 -; NEON-NOSVE-NEXT: ushr v0.4s, v0.4s, #6 -; NEON-NOSVE-NEXT: stp q1, q0, [x0] -; NEON-NOSVE-NEXT: ret +; CHECK-LABEL: udiv_constantsplat_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #8969 // =0x2309 +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: movk w8, #22765, lsl #16 +; CHECK-NEXT: mov z0.s, w8 +; CHECK-NEXT: umulh z3.s, z1.s, z0.s +; CHECK-NEXT: umulh z0.s, z2.s, z0.s +; CHECK-NEXT: sub z1.s, z1.s, z3.s +; CHECK-NEXT: sub z2.s, z2.s, z0.s +; CHECK-NEXT: usra z3.s, z1.s, #1 +; CHECK-NEXT: usra z0.s, z2.s, #1 +; CHECK-NEXT: lsr z1.s, z3.s, #6 +; CHECK-NEXT: lsr z0.s, z0.s, #6 +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: udiv_constantsplat_v8i32: ; NONEON-NOSVE: // %bb.0: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll index b4641172f8b06..9497ec88e57b4 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s ; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE @@ -76,10 +76,10 @@ define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z3.h, z4.h, z4.h -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h -; CHECK-NEXT: splice z3.h, p0, z3.h, z2.h +; CHECK-NEXT: uzp1 z4.h, z2.h, z2.h +; CHECK-NEXT: splice z2.h, p0, { z3.h, z4.h } ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z2.b, z3.b, z3.b +; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b ; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret @@ -160,23 +160,23 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: mov z3.d, z1.d ; CHECK-NEXT: sunpklo z5.s, z5.h ; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 -; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h ; CHECK-NEXT: sunpklo z3.h, z3.b ; CHECK-NEXT: sunpklo z6.s, z3.h ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 ; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z5.s +; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: splice z4.h, p0, z4.h, z2.h -; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h -; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b -; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h -; CHECK-NEXT: splice z5.h, p0, z5.h, z3.h +; CHECK-NEXT: uzp1 z5.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h +; CHECK-NEXT: splice z2.h, p0, { z4.h, z5.h } +; CHECK-NEXT: uzp1 z4.b, z2.b, z2.b +; CHECK-NEXT: uzp1 z7.h, z3.h, z3.h +; CHECK-NEXT: splice z3.h, p0, { z6.h, z7.h } ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z3.b, z5.b, z5.b -; CHECK-NEXT: splice z2.b, p0, z2.b, z3.b +; CHECK-NEXT: uzp1 z5.b, z3.b, z3.b +; CHECK-NEXT: splice z2.b, p0, { z4.b, z5.b } ; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -300,14 +300,12 @@ define void @srem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: sunpklo z4.s, z16.h ; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 ; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8 -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: sunpklo z7.s, z7.h ; CHECK-NEXT: movprfx z6, z4 ; CHECK-NEXT: sdiv z6.s, p0/m, z6.s, z3.s ; CHECK-NEXT: ldr q3, [x0] ; CHECK-NEXT: ldr q4, [x1] ; CHECK-NEXT: sunpklo z16.s, z16.h -; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h ; CHECK-NEXT: sunpklo z17.h, z4.b ; CHECK-NEXT: sunpklo z18.h, z3.b ; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z16.s @@ -317,11 +315,9 @@ define void @srem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8 ; CHECK-NEXT: sunpklo z17.s, z17.h ; CHECK-NEXT: sunpklo z18.s, z18.h -; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h ; CHECK-NEXT: sdivr z19.s, p0/m, z19.s, z20.s ; CHECK-NEXT: mov z20.d, z3.d ; CHECK-NEXT: ext z20.b, z20.b, z3.b, #8 -; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h ; CHECK-NEXT: sunpklo z20.h, z20.b ; CHECK-NEXT: sunpklo z22.s, z20.h ; CHECK-NEXT: ext z20.b, z20.b, z20.b, #8 @@ -329,32 +325,36 @@ define void @srem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: mov z18.d, z4.d ; CHECK-NEXT: sunpklo z20.s, z20.h ; CHECK-NEXT: ext z18.b, z18.b, z4.b, #8 -; CHECK-NEXT: uzp1 z16.h, z19.h, z19.h ; CHECK-NEXT: sunpklo z18.h, z18.b ; CHECK-NEXT: sunpklo z21.s, z18.h ; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8 ; CHECK-NEXT: sunpklo z18.s, z18.h ; CHECK-NEXT: sdivr z21.s, p0/m, z21.s, z22.s -; CHECK-NEXT: uzp1 z17.h, z17.h, z17.h +; CHECK-NEXT: uzp1 z22.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z23.h, z5.h, z5.h +; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h +; CHECK-NEXT: uzp1 z6.h, z7.h, z7.h ; CHECK-NEXT: sdivr z18.s, p0/m, z18.s, z20.s +; CHECK-NEXT: uzp1 z19.h, z19.h, z19.h ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: splice z16.h, p0, z16.h, z17.h -; CHECK-NEXT: splice z2.h, p0, z2.h, z5.h -; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h -; CHECK-NEXT: uzp1 z19.h, z21.h, z21.h -; CHECK-NEXT: uzp1 z5.b, z16.b, z16.b -; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b -; CHECK-NEXT: uzp1 z6.b, z6.b, z6.b -; CHECK-NEXT: uzp1 z18.h, z18.h, z18.h -; CHECK-NEXT: splice z19.h, p0, z19.h, z18.h +; CHECK-NEXT: uzp1 z20.h, z17.h, z17.h +; CHECK-NEXT: splice z7.h, p0, { z22.h, z23.h } +; CHECK-NEXT: splice z5.h, p0, { z5.h, z6.h } +; CHECK-NEXT: uzp1 z16.h, z21.h, z21.h +; CHECK-NEXT: splice z2.h, p0, { z19.h, z20.h } +; CHECK-NEXT: uzp1 z6.b, z7.b, z7.b +; CHECK-NEXT: uzp1 z7.b, z5.b, z5.b +; CHECK-NEXT: uzp1 z17.h, z18.h, z18.h +; CHECK-NEXT: splice z16.h, p0, { z16.h, z17.h } +; CHECK-NEXT: uzp1 z17.b, z2.b, z2.b ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: splice z2.b, p0, z2.b, z6.b -; CHECK-NEXT: uzp1 z7.b, z19.b, z19.b -; CHECK-NEXT: splice z5.b, p0, z5.b, z7.b +; CHECK-NEXT: splice z5.b, p0, { z6.b, z7.b } +; CHECK-NEXT: uzp1 z18.b, z16.b, z16.b +; CHECK-NEXT: splice z2.b, p0, { z17.b, z18.b } ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b -; CHECK-NEXT: mls z3.b, p0/m, z5.b, z4.b -; CHECK-NEXT: stp q3, q0, [x0] +; CHECK-NEXT: mls z0.b, p0/m, z5.b, z1.b +; CHECK-NEXT: msb z2.b, p0/m, z4.b, z3.b +; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: srem_v32i8: @@ -600,9 +600,9 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h -; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h -; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h +; CHECK-NEXT: uzp1 z4.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z5.h, z3.h, z3.h +; CHECK-NEXT: splice z2.h, p0, { z4.h, z5.h } ; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -680,23 +680,23 @@ define void @srem_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: sdivr z5.s, p0/m, z5.s, z6.s ; CHECK-NEXT: mov z6.d, z4.d ; CHECK-NEXT: ext z6.b, z6.b, z4.b, #8 -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: sunpklo z6.s, z6.h ; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s ; CHECK-NEXT: mov z7.d, z1.d ; CHECK-NEXT: ext z7.b, z7.b, z1.b, #8 -; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h ; CHECK-NEXT: sunpklo z7.s, z7.h ; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z16.s +; CHECK-NEXT: uzp1 z16.h, z5.h, z5.h ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h -; CHECK-NEXT: splice z5.h, p0, z5.h, z6.h -; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h -; CHECK-NEXT: splice z2.h, p0, z2.h, z7.h +; CHECK-NEXT: uzp1 z17.h, z6.h, z6.h +; CHECK-NEXT: uzp1 z5.h, z2.h, z2.h +; CHECK-NEXT: splice z2.h, p0, { z16.h, z17.h } +; CHECK-NEXT: uzp1 z6.h, z7.h, z7.h +; CHECK-NEXT: splice z5.h, p0, { z5.h, z6.h } ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: mls z3.h, p0/m, z5.h, z4.h -; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h -; CHECK-NEXT: stp q3, q0, [x0] +; CHECK-NEXT: msb z2.h, p0/m, z4.h, z3.h +; CHECK-NEXT: mls z0.h, p0/m, z5.h, z1.h +; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: srem_v16i16: @@ -1126,10 +1126,10 @@ define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z3.h, z4.h, z4.h -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h -; CHECK-NEXT: splice z3.h, p0, z3.h, z2.h +; CHECK-NEXT: uzp1 z4.h, z2.h, z2.h +; CHECK-NEXT: splice z2.h, p0, { z3.h, z4.h } ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z2.b, z3.b, z3.b +; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b ; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret @@ -1210,23 +1210,23 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: mov z3.d, z1.d ; CHECK-NEXT: uunpklo z5.s, z5.h ; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 -; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h ; CHECK-NEXT: uunpklo z3.h, z3.b ; CHECK-NEXT: uunpklo z6.s, z3.h ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 ; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z5.s +; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: splice z4.h, p0, z4.h, z2.h -; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h -; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b -; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h -; CHECK-NEXT: splice z5.h, p0, z5.h, z3.h +; CHECK-NEXT: uzp1 z5.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h +; CHECK-NEXT: splice z2.h, p0, { z4.h, z5.h } +; CHECK-NEXT: uzp1 z4.b, z2.b, z2.b +; CHECK-NEXT: uzp1 z7.h, z3.h, z3.h +; CHECK-NEXT: splice z3.h, p0, { z6.h, z7.h } ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z3.b, z5.b, z5.b -; CHECK-NEXT: splice z2.b, p0, z2.b, z3.b +; CHECK-NEXT: uzp1 z5.b, z3.b, z3.b +; CHECK-NEXT: splice z2.b, p0, { z4.b, z5.b } ; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -1350,14 +1350,12 @@ define void @urem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: uunpklo z4.s, z16.h ; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 ; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8 -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: uunpklo z7.s, z7.h ; CHECK-NEXT: movprfx z6, z4 ; CHECK-NEXT: udiv z6.s, p0/m, z6.s, z3.s ; CHECK-NEXT: ldr q3, [x0] ; CHECK-NEXT: ldr q4, [x1] ; CHECK-NEXT: uunpklo z16.s, z16.h -; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h ; CHECK-NEXT: uunpklo z17.h, z4.b ; CHECK-NEXT: uunpklo z18.h, z3.b ; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z16.s @@ -1367,11 +1365,9 @@ define void @urem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8 ; CHECK-NEXT: uunpklo z17.s, z17.h ; CHECK-NEXT: uunpklo z18.s, z18.h -; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h ; CHECK-NEXT: udivr z19.s, p0/m, z19.s, z20.s ; CHECK-NEXT: mov z20.d, z3.d ; CHECK-NEXT: ext z20.b, z20.b, z3.b, #8 -; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h ; CHECK-NEXT: uunpklo z20.h, z20.b ; CHECK-NEXT: uunpklo z22.s, z20.h ; CHECK-NEXT: ext z20.b, z20.b, z20.b, #8 @@ -1379,32 +1375,36 @@ define void @urem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: mov z18.d, z4.d ; CHECK-NEXT: uunpklo z20.s, z20.h ; CHECK-NEXT: ext z18.b, z18.b, z4.b, #8 -; CHECK-NEXT: uzp1 z16.h, z19.h, z19.h ; CHECK-NEXT: uunpklo z18.h, z18.b ; CHECK-NEXT: uunpklo z21.s, z18.h ; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8 ; CHECK-NEXT: uunpklo z18.s, z18.h ; CHECK-NEXT: udivr z21.s, p0/m, z21.s, z22.s -; CHECK-NEXT: uzp1 z17.h, z17.h, z17.h +; CHECK-NEXT: uzp1 z22.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z23.h, z5.h, z5.h +; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h +; CHECK-NEXT: uzp1 z6.h, z7.h, z7.h ; CHECK-NEXT: udivr z18.s, p0/m, z18.s, z20.s +; CHECK-NEXT: uzp1 z19.h, z19.h, z19.h ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: splice z16.h, p0, z16.h, z17.h -; CHECK-NEXT: splice z2.h, p0, z2.h, z5.h -; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h -; CHECK-NEXT: uzp1 z19.h, z21.h, z21.h -; CHECK-NEXT: uzp1 z5.b, z16.b, z16.b -; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b -; CHECK-NEXT: uzp1 z6.b, z6.b, z6.b -; CHECK-NEXT: uzp1 z18.h, z18.h, z18.h -; CHECK-NEXT: splice z19.h, p0, z19.h, z18.h +; CHECK-NEXT: uzp1 z20.h, z17.h, z17.h +; CHECK-NEXT: splice z7.h, p0, { z22.h, z23.h } +; CHECK-NEXT: splice z5.h, p0, { z5.h, z6.h } +; CHECK-NEXT: uzp1 z16.h, z21.h, z21.h +; CHECK-NEXT: splice z2.h, p0, { z19.h, z20.h } +; CHECK-NEXT: uzp1 z6.b, z7.b, z7.b +; CHECK-NEXT: uzp1 z7.b, z5.b, z5.b +; CHECK-NEXT: uzp1 z17.h, z18.h, z18.h +; CHECK-NEXT: splice z16.h, p0, { z16.h, z17.h } +; CHECK-NEXT: uzp1 z17.b, z2.b, z2.b ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: splice z2.b, p0, z2.b, z6.b -; CHECK-NEXT: uzp1 z7.b, z19.b, z19.b -; CHECK-NEXT: splice z5.b, p0, z5.b, z7.b +; CHECK-NEXT: splice z5.b, p0, { z6.b, z7.b } +; CHECK-NEXT: uzp1 z18.b, z16.b, z16.b +; CHECK-NEXT: splice z2.b, p0, { z17.b, z18.b } ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b -; CHECK-NEXT: mls z3.b, p0/m, z5.b, z4.b -; CHECK-NEXT: stp q3, q0, [x0] +; CHECK-NEXT: mls z0.b, p0/m, z5.b, z1.b +; CHECK-NEXT: msb z2.b, p0/m, z4.b, z3.b +; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: urem_v32i8: @@ -1650,9 +1650,9 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h -; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h -; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h +; CHECK-NEXT: uzp1 z4.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z5.h, z3.h, z3.h +; CHECK-NEXT: splice z2.h, p0, { z4.h, z5.h } ; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -1730,23 +1730,23 @@ define void @urem_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: udivr z5.s, p0/m, z5.s, z6.s ; CHECK-NEXT: mov z6.d, z4.d ; CHECK-NEXT: ext z6.b, z6.b, z4.b, #8 -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: uunpklo z6.s, z6.h ; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s ; CHECK-NEXT: mov z7.d, z1.d ; CHECK-NEXT: ext z7.b, z7.b, z1.b, #8 -; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h ; CHECK-NEXT: uunpklo z7.s, z7.h ; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z16.s +; CHECK-NEXT: uzp1 z16.h, z5.h, z5.h ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h -; CHECK-NEXT: splice z5.h, p0, z5.h, z6.h -; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h -; CHECK-NEXT: splice z2.h, p0, z2.h, z7.h +; CHECK-NEXT: uzp1 z17.h, z6.h, z6.h +; CHECK-NEXT: uzp1 z5.h, z2.h, z2.h +; CHECK-NEXT: splice z2.h, p0, { z16.h, z17.h } +; CHECK-NEXT: uzp1 z6.h, z7.h, z7.h +; CHECK-NEXT: splice z5.h, p0, { z5.h, z6.h } ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: mls z3.h, p0/m, z5.h, z4.h -; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h -; CHECK-NEXT: stp q3, q0, [x0] +; CHECK-NEXT: msb z2.h, p0/m, z4.h, z3.h +; CHECK-NEXT: mls z0.h, p0/m, z5.h, z1.h +; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: urem_v16i16: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll index 5235423c00d9a..e07036f2a1acf 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s ; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE @@ -11,28 +11,28 @@ define i1 @ptest_v16i1(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0, #32] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x0] +; CHECK-NEXT: ldp q3, q2, [x0] ; CHECK-NEXT: fcmne p1.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: fcmne p2.s, p0/z, z1.s, #0.0 -; CHECK-NEXT: fcmne p3.s, p0/z, z3.s, #0.0 -; CHECK-NEXT: fcmne p0.s, p0/z, z2.s, #0.0 +; CHECK-NEXT: fcmne p3.s, p0/z, z2.s, #0.0 +; CHECK-NEXT: fcmne p0.s, p0/z, z3.s, #0.0 ; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z2.s, p3/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h -; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h -; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h -; CHECK-NEXT: splice z3.h, p0, z3.h, z2.h +; CHECK-NEXT: uzp1 z5.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z4.h, z1.h, z1.h +; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z0.h, z3.h, z3.h +; CHECK-NEXT: splice z2.h, p0, { z4.h, z5.h } +; CHECK-NEXT: splice z0.h, p0, { z0.h, z1.h } ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b -; CHECK-NEXT: uzp1 z1.b, z3.b, z3.b -; CHECK-NEXT: splice z1.b, p0, z1.b, z0.b +; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b +; CHECK-NEXT: uzp1 z1.b, z0.b, z0.b +; CHECK-NEXT: splice z0.b, p0, { z1.b, z2.b } ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: umaxv b0, p0, z1.b +; CHECK-NEXT: umaxv b0, p0, z0.b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret @@ -120,49 +120,49 @@ define i1 @ptest_v16i1(ptr %a, ptr %b) { define i1 @ptest_or_v16i1(ptr %a, ptr %b) { ; CHECK-LABEL: ptest_or_v16i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0, #32] +; CHECK-NEXT: ldp q0, q1, [x0, #32] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: ldp q4, q5, [x1, #32] -; CHECK-NEXT: fcmne p1.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: fcmne p2.s, p0/z, z1.s, #0.0 -; CHECK-NEXT: ldp q0, q1, [x1] +; CHECK-NEXT: ldp q2, q3, [x1, #32] +; CHECK-NEXT: ldp q4, q5, [x0] +; CHECK-NEXT: fcmne p1.s, p0/z, z1.s, #0.0 +; CHECK-NEXT: ldp q1, q6, [x1] ; CHECK-NEXT: fcmne p3.s, p0/z, z3.s, #0.0 -; CHECK-NEXT: fcmne p4.s, p0/z, z2.s, #0.0 -; CHECK-NEXT: fcmne p5.s, p0/z, z5.s, #0.0 -; CHECK-NEXT: fcmne p6.s, p0/z, z4.s, #0.0 -; CHECK-NEXT: fcmne p7.s, p0/z, z1.s, #0.0 -; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: fcmne p2.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: fcmne p5.s, p0/z, z2.s, #0.0 +; CHECK-NEXT: fcmne p4.s, p0/z, z5.s, #0.0 +; CHECK-NEXT: fcmne p7.s, p0/z, z4.s, #0.0 +; CHECK-NEXT: fcmne p6.s, p0/z, z6.s, #0.0 +; CHECK-NEXT: fcmne p0.s, p0/z, z1.s, #0.0 ; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z2.s, p3/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z3.s, p4/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z4.s, p5/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z5.s, p6/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: mov z3.s, p4/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z6.s, p7/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z5.s, p6/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z7.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h -; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h -; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h -; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h +; CHECK-NEXT: uzp1 z17.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z19.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z16.h, z1.h, z1.h ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h -; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h -; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h -; CHECK-NEXT: splice z3.h, p0, z3.h, z2.h -; CHECK-NEXT: splice z5.h, p0, z5.h, z4.h -; CHECK-NEXT: splice z7.h, p0, z7.h, z6.h +; CHECK-NEXT: uzp1 z1.h, z3.h, z3.h +; CHECK-NEXT: uzp1 z18.h, z4.h, z4.h +; CHECK-NEXT: uzp1 z3.h, z5.h, z5.h +; CHECK-NEXT: uzp1 z0.h, z6.h, z6.h +; CHECK-NEXT: uzp1 z2.h, z7.h, z7.h +; CHECK-NEXT: splice z4.h, p0, { z16.h, z17.h } +; CHECK-NEXT: splice z5.h, p0, { z18.h, z19.h } +; CHECK-NEXT: splice z0.h, p0, { z0.h, z1.h } +; CHECK-NEXT: splice z1.h, p0, { z2.h, z3.h } ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b -; CHECK-NEXT: uzp1 z1.b, z3.b, z3.b -; CHECK-NEXT: uzp1 z2.b, z5.b, z5.b -; CHECK-NEXT: uzp1 z3.b, z7.b, z7.b -; CHECK-NEXT: splice z1.b, p0, z1.b, z0.b -; CHECK-NEXT: splice z3.b, p0, z3.b, z2.b +; CHECK-NEXT: uzp1 z3.b, z4.b, z4.b +; CHECK-NEXT: uzp1 z5.b, z5.b, z5.b +; CHECK-NEXT: uzp1 z2.b, z0.b, z0.b +; CHECK-NEXT: uzp1 z4.b, z1.b, z1.b +; CHECK-NEXT: splice z0.b, p0, { z2.b, z3.b } +; CHECK-NEXT: splice z1.b, p0, { z4.b, z5.b } ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: orr z0.d, z1.d, z3.d +; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: umaxv b0, p0, z0.b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 @@ -329,49 +329,49 @@ declare i1 @llvm.vector.reduce.or.i1.v16i1(<16 x i1>) define i1 @ptest_and_v16i1(ptr %a, ptr %b) { ; CHECK-LABEL: ptest_and_v16i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0, #32] +; CHECK-NEXT: ldp q0, q1, [x0, #32] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: ldp q4, q5, [x1, #32] -; CHECK-NEXT: fcmne p1.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: fcmne p2.s, p0/z, z1.s, #0.0 -; CHECK-NEXT: ldp q0, q1, [x1] +; CHECK-NEXT: ldp q2, q3, [x1, #32] +; CHECK-NEXT: ldp q4, q5, [x0] +; CHECK-NEXT: fcmne p1.s, p0/z, z1.s, #0.0 +; CHECK-NEXT: ldp q1, q6, [x1] ; CHECK-NEXT: fcmne p3.s, p0/z, z3.s, #0.0 -; CHECK-NEXT: fcmne p4.s, p0/z, z2.s, #0.0 -; CHECK-NEXT: fcmne p5.s, p0/z, z5.s, #0.0 -; CHECK-NEXT: fcmne p6.s, p0/z, z4.s, #0.0 -; CHECK-NEXT: fcmne p7.s, p0/z, z1.s, #0.0 -; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: fcmne p2.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: fcmne p5.s, p0/z, z2.s, #0.0 +; CHECK-NEXT: fcmne p4.s, p0/z, z5.s, #0.0 +; CHECK-NEXT: fcmne p7.s, p0/z, z4.s, #0.0 +; CHECK-NEXT: fcmne p6.s, p0/z, z6.s, #0.0 +; CHECK-NEXT: fcmne p0.s, p0/z, z1.s, #0.0 ; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z2.s, p3/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z3.s, p4/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z4.s, p5/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z5.s, p6/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: mov z3.s, p4/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z6.s, p7/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z5.s, p6/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z7.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h -; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h -; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h -; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h +; CHECK-NEXT: uzp1 z17.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z19.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z16.h, z1.h, z1.h ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h -; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h -; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h -; CHECK-NEXT: splice z3.h, p0, z3.h, z2.h -; CHECK-NEXT: splice z5.h, p0, z5.h, z4.h -; CHECK-NEXT: splice z7.h, p0, z7.h, z6.h +; CHECK-NEXT: uzp1 z1.h, z3.h, z3.h +; CHECK-NEXT: uzp1 z18.h, z4.h, z4.h +; CHECK-NEXT: uzp1 z3.h, z5.h, z5.h +; CHECK-NEXT: uzp1 z0.h, z6.h, z6.h +; CHECK-NEXT: uzp1 z2.h, z7.h, z7.h +; CHECK-NEXT: splice z4.h, p0, { z16.h, z17.h } +; CHECK-NEXT: splice z5.h, p0, { z18.h, z19.h } +; CHECK-NEXT: splice z0.h, p0, { z0.h, z1.h } +; CHECK-NEXT: splice z1.h, p0, { z2.h, z3.h } ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b -; CHECK-NEXT: uzp1 z1.b, z3.b, z3.b -; CHECK-NEXT: uzp1 z2.b, z5.b, z5.b -; CHECK-NEXT: uzp1 z3.b, z7.b, z7.b -; CHECK-NEXT: splice z1.b, p0, z1.b, z0.b -; CHECK-NEXT: splice z3.b, p0, z3.b, z2.b +; CHECK-NEXT: uzp1 z3.b, z4.b, z4.b +; CHECK-NEXT: uzp1 z5.b, z5.b, z5.b +; CHECK-NEXT: uzp1 z2.b, z0.b, z0.b +; CHECK-NEXT: uzp1 z4.b, z1.b, z1.b +; CHECK-NEXT: splice z0.b, p0, { z2.b, z3.b } +; CHECK-NEXT: splice z1.b, p0, { z4.b, z5.b } ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: and z0.d, z1.d, z3.d +; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: uminv b0, p0, z0.b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll index c0aa162b19b77..13fcd94ea8a26 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s ; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE @@ -129,11 +129,11 @@ define void @store_trunc_v2i64i8(ptr %ap, ptr %dest) { define void @store_trunc_v2i256i64(ptr %ap, ptr %dest) { ; CHECK-LABEL: store_trunc_v2i256i64: ; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0, #32] ; CHECK-NEXT: ptrue p0.d, vl1 -; CHECK-NEXT: ldr d0, [x0, #32] -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: splice z1.d, p0, z1.d, z0.d -; CHECK-NEXT: str q1, [x1] +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: splice z0.d, p0, { z0.d, z1.d } +; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: store_trunc_v2i256i64: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll index 77aaeeadcfc2f..9d241f6f927e1 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s ; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE @@ -13,11 +13,11 @@ target triple = "aarch64-unknown-linux-gnu" define <16 x i8> @trunc_v16i16_v16i8(ptr %in) nounwind { ; CHECK-LABEL: trunc_v16i16_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b -; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b +; CHECK-NEXT: uzp1 z3.b, z0.b, z0.b +; CHECK-NEXT: uzp1 z2.b, z1.b, z1.b +; CHECK-NEXT: splice z0.b, p0, { z2.b, z3.b } ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; @@ -69,18 +69,18 @@ define <16 x i8> @trunc_v16i16_v16i8(ptr %in) nounwind { define void @trunc_v32i16_v32i8(ptr %in, ptr %out) nounwind { ; CHECK-LABEL: trunc_v32i16_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ldp q1, q0, [x0, #32] ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b -; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: uzp1 z3.b, z3.b, z3.b -; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b -; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b -; CHECK-NEXT: splice z2.b, p0, z2.b, z3.b -; CHECK-NEXT: add z0.b, z0.b, z0.b +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: uzp1 z5.b, z0.b, z0.b +; CHECK-NEXT: uzp1 z4.b, z1.b, z1.b +; CHECK-NEXT: uzp1 z1.b, z2.b, z2.b +; CHECK-NEXT: uzp1 z0.b, z3.b, z3.b +; CHECK-NEXT: splice z2.b, p0, { z4.b, z5.b } +; CHECK-NEXT: splice z0.b, p0, { z0.b, z1.b } ; CHECK-NEXT: add z1.b, z2.b, z2.b -; CHECK-NEXT: stp q1, q0, [x1] +; CHECK-NEXT: add z0.b, z0.b, z0.b +; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: trunc_v32i16_v32i8: @@ -216,27 +216,27 @@ define void @trunc_v32i16_v32i8(ptr %in, ptr %out) nounwind { define void @trunc_v64i16_v64i8(ptr %in, ptr %out) nounwind { ; CHECK-LABEL: trunc_v64i16_v64i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #64] +; CHECK-NEXT: ldp q1, q0, [x0, #64] ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: ldp q4, q5, [x0, #96] -; CHECK-NEXT: ldp q6, q7, [x0, #32] -; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b -; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: uzp1 z3.b, z3.b, z3.b -; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b -; CHECK-NEXT: uzp1 z5.b, z5.b, z5.b -; CHECK-NEXT: uzp1 z4.b, z4.b, z4.b -; CHECK-NEXT: uzp1 z7.b, z7.b, z7.b -; CHECK-NEXT: uzp1 z6.b, z6.b, z6.b -; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b -; CHECK-NEXT: splice z2.b, p0, z2.b, z3.b -; CHECK-NEXT: splice z4.b, p0, z4.b, z5.b -; CHECK-NEXT: splice z6.b, p0, z6.b, z7.b +; CHECK-NEXT: ldp q2, q3, [x0, #96] +; CHECK-NEXT: ldp q4, q5, [x0] +; CHECK-NEXT: uzp1 z7.b, z0.b, z0.b +; CHECK-NEXT: uzp1 z6.b, z1.b, z1.b +; CHECK-NEXT: ldp q1, q0, [x0, #32] +; CHECK-NEXT: uzp1 z17.b, z3.b, z3.b +; CHECK-NEXT: uzp1 z16.b, z2.b, z2.b +; CHECK-NEXT: uzp1 z3.b, z5.b, z5.b +; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b +; CHECK-NEXT: uzp1 z5.b, z0.b, z0.b +; CHECK-NEXT: splice z0.b, p0, { z6.b, z7.b } +; CHECK-NEXT: uzp1 z4.b, z1.b, z1.b +; CHECK-NEXT: splice z1.b, p0, { z16.b, z17.b } +; CHECK-NEXT: splice z2.b, p0, { z2.b, z3.b } +; CHECK-NEXT: splice z3.b, p0, { z4.b, z5.b } ; CHECK-NEXT: add z0.b, z0.b, z0.b +; CHECK-NEXT: add z1.b, z1.b, z1.b ; CHECK-NEXT: add z2.b, z2.b, z2.b -; CHECK-NEXT: add z1.b, z4.b, z4.b -; CHECK-NEXT: add z3.b, z6.b, z6.b +; CHECK-NEXT: add z3.b, z3.b, z3.b ; CHECK-NEXT: stp q0, q1, [x1, #32] ; CHECK-NEXT: stp q2, q3, [x1] ; CHECK-NEXT: ret @@ -527,49 +527,49 @@ define void @trunc_v128i16_v128i8(ptr %in, ptr %out) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q2, q3, [x0, #192] ; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: ldp q4, q5, [x0] ; CHECK-NEXT: ldp q6, q7, [x0, #64] -; CHECK-NEXT: ldp q16, q17, [x0, #224] -; CHECK-NEXT: uzp1 z3.b, z3.b, z3.b -; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b -; CHECK-NEXT: ldp q20, q21, [x0, #160] -; CHECK-NEXT: uzp1 z7.b, z7.b, z7.b +; CHECK-NEXT: uzp1 z17.b, z3.b, z3.b +; CHECK-NEXT: ldp q3, q18, [x0, #224] +; CHECK-NEXT: uzp1 z16.b, z2.b, z2.b +; CHECK-NEXT: ldp q2, q19, [x0, #128] ; CHECK-NEXT: ldp q0, q1, [x0, #32] -; CHECK-NEXT: uzp1 z17.b, z17.b, z17.b -; CHECK-NEXT: ldp q4, q5, [x0, #96] -; CHECK-NEXT: uzp1 z16.b, z16.b, z16.b -; CHECK-NEXT: ldp q18, q19, [x0, #128] -; CHECK-NEXT: splice z2.b, p0, z2.b, z3.b -; CHECK-NEXT: uzp1 z3.b, z21.b, z21.b -; CHECK-NEXT: uzp1 z20.b, z20.b, z20.b -; CHECK-NEXT: uzp1 z6.b, z6.b, z6.b -; CHECK-NEXT: ldp q21, q22, [x0] -; CHECK-NEXT: splice z16.b, p0, z16.b, z17.b +; CHECK-NEXT: uzp1 z21.b, z18.b, z18.b +; CHECK-NEXT: ldp q18, q22, [x0, #160] +; CHECK-NEXT: uzp1 z20.b, z3.b, z3.b +; CHECK-NEXT: uzp1 z24.b, z19.b, z19.b +; CHECK-NEXT: ldp q3, q19, [x0, #96] +; CHECK-NEXT: uzp1 z23.b, z2.b, z2.b +; CHECK-NEXT: uzp1 z26.b, z22.b, z22.b +; CHECK-NEXT: splice z2.b, p0, { z16.b, z17.b } +; CHECK-NEXT: uzp1 z17.b, z7.b, z7.b +; CHECK-NEXT: uzp1 z25.b, z18.b, z18.b +; CHECK-NEXT: splice z7.b, p0, { z20.b, z21.b } +; CHECK-NEXT: uzp1 z21.b, z5.b, z5.b ; CHECK-NEXT: uzp1 z19.b, z19.b, z19.b -; CHECK-NEXT: uzp1 z18.b, z18.b, z18.b -; CHECK-NEXT: uzp1 z4.b, z4.b, z4.b -; CHECK-NEXT: splice z20.b, p0, z20.b, z3.b -; CHECK-NEXT: uzp1 z3.b, z5.b, z5.b -; CHECK-NEXT: splice z6.b, p0, z6.b, z7.b -; CHECK-NEXT: uzp1 z5.b, z22.b, z22.b -; CHECK-NEXT: uzp1 z7.b, z21.b, z21.b -; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b -; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: splice z18.b, p0, z18.b, z19.b -; CHECK-NEXT: add z2.b, z2.b, z2.b -; CHECK-NEXT: splice z4.b, p0, z4.b, z3.b -; CHECK-NEXT: add z3.b, z16.b, z16.b -; CHECK-NEXT: splice z7.b, p0, z7.b, z5.b -; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b -; CHECK-NEXT: add z1.b, z20.b, z20.b -; CHECK-NEXT: add z5.b, z18.b, z18.b -; CHECK-NEXT: stp q2, q3, [x1, #96] -; CHECK-NEXT: add z2.b, z6.b, z6.b +; CHECK-NEXT: uzp1 z20.b, z4.b, z4.b +; CHECK-NEXT: uzp1 z5.b, z1.b, z1.b +; CHECK-NEXT: uzp1 z16.b, z6.b, z6.b +; CHECK-NEXT: splice z6.b, p0, { z23.b, z24.b } +; CHECK-NEXT: uzp1 z18.b, z3.b, z3.b +; CHECK-NEXT: splice z3.b, p0, { z25.b, z26.b } +; CHECK-NEXT: uzp1 z4.b, z0.b, z0.b +; CHECK-NEXT: add z0.b, z2.b, z2.b +; CHECK-NEXT: add z7.b, z7.b, z7.b +; CHECK-NEXT: splice z1.b, p0, { z16.b, z17.b } +; CHECK-NEXT: splice z2.b, p0, { z18.b, z19.b } +; CHECK-NEXT: splice z16.b, p0, { z20.b, z21.b } +; CHECK-NEXT: splice z4.b, p0, { z4.b, z5.b } +; CHECK-NEXT: add z6.b, z6.b, z6.b +; CHECK-NEXT: add z3.b, z3.b, z3.b +; CHECK-NEXT: stp q0, q7, [x1, #96] +; CHECK-NEXT: add z0.b, z1.b, z1.b +; CHECK-NEXT: add z1.b, z2.b, z2.b +; CHECK-NEXT: add z2.b, z16.b, z16.b +; CHECK-NEXT: stp q6, q3, [x1, #64] ; CHECK-NEXT: add z3.b, z4.b, z4.b -; CHECK-NEXT: add z4.b, z7.b, z7.b -; CHECK-NEXT: add z0.b, z0.b, z0.b -; CHECK-NEXT: stp q5, q1, [x1, #64] -; CHECK-NEXT: stp q2, q3, [x1, #32] -; CHECK-NEXT: stp q4, q0, [x1] +; CHECK-NEXT: stp q0, q1, [x1, #32] +; CHECK-NEXT: stp q2, q3, [x1] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: trunc_v128i16_v128i8: @@ -1181,11 +1181,11 @@ define void @trunc_v128i16_v128i8(ptr %in, ptr %out) nounwind { define <8 x i8> @trunc_v8i32_v8i8(ptr %in) nounwind { ; CHECK-LABEL: trunc_v8i32_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h +; CHECK-NEXT: uzp1 z3.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z2.h, z1.h, z1.h +; CHECK-NEXT: splice z0.h, p0, { z2.h, z3.h } ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret @@ -1219,17 +1219,17 @@ define <16 x i8> @trunc_v16i32_v16i8(ptr %in) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0, #32] ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h -; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h -; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h -; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: uzp1 z5.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z4.h, z1.h, z1.h +; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z0.h, z3.h, z3.h +; CHECK-NEXT: splice z2.h, p0, { z4.h, z5.h } +; CHECK-NEXT: splice z0.h, p0, { z0.h, z1.h } ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b -; CHECK-NEXT: uzp1 z0.b, z2.b, z2.b -; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b +; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b +; CHECK-NEXT: uzp1 z1.b, z0.b, z0.b +; CHECK-NEXT: splice z0.b, p0, { z1.b, z2.b } ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; @@ -1277,32 +1277,32 @@ define <16 x i8> @trunc_v16i32_v16i8(ptr %in) nounwind { define void @trunc_v32i32_v32i8(ptr %in, ptr %out) nounwind { ; CHECK-LABEL: trunc_v32i32_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ldp q0, q1, [x0, #96] ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: ldp q2, q3, [x0, #96] +; CHECK-NEXT: ldp q2, q3, [x0, #32] ; CHECK-NEXT: ldp q4, q5, [x0, #64] ; CHECK-NEXT: ldp q6, q7, [x0] -; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h -; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h -; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h -; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h -; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h -; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h -; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: splice z4.h, p0, z4.h, z5.h -; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h +; CHECK-NEXT: uzp1 z17.h, z1.h, z1.h +; CHECK-NEXT: uzp1 z16.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z1.h, z3.h, z3.h +; CHECK-NEXT: uzp1 z19.h, z5.h, z5.h +; CHECK-NEXT: uzp1 z0.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z3.h, z7.h, z7.h +; CHECK-NEXT: uzp1 z18.h, z4.h, z4.h +; CHECK-NEXT: uzp1 z2.h, z6.h, z6.h +; CHECK-NEXT: splice z4.h, p0, { z16.h, z17.h } +; CHECK-NEXT: splice z0.h, p0, { z0.h, z1.h } +; CHECK-NEXT: splice z5.h, p0, { z18.h, z19.h } +; CHECK-NEXT: splice z1.h, p0, { z2.h, z3.h } ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z1.b, z2.b, z2.b -; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b -; CHECK-NEXT: uzp1 z3.b, z6.b, z6.b -; CHECK-NEXT: splice z2.b, p0, z2.b, z1.b -; CHECK-NEXT: splice z3.b, p0, z3.b, z0.b -; CHECK-NEXT: add z0.b, z2.b, z2.b -; CHECK-NEXT: add z1.b, z3.b, z3.b +; CHECK-NEXT: uzp1 z3.b, z4.b, z4.b +; CHECK-NEXT: uzp1 z7.b, z0.b, z0.b +; CHECK-NEXT: uzp1 z2.b, z5.b, z5.b +; CHECK-NEXT: uzp1 z6.b, z1.b, z1.b +; CHECK-NEXT: splice z0.b, p0, { z2.b, z3.b } +; CHECK-NEXT: splice z1.b, p0, { z6.b, z7.b } +; CHECK-NEXT: add z0.b, z0.b, z0.b +; CHECK-NEXT: add z1.b, z1.b, z1.b ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret ; @@ -1429,56 +1429,56 @@ define void @trunc_v64i32_v64i8(ptr %in, ptr %out) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q2, q3, [x0, #160] ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: ldp q4, q5, [x0, #128] +; CHECK-NEXT: ldp q4, q5, [x0, #96] +; CHECK-NEXT: ldp q6, q7, [x0] +; CHECK-NEXT: uzp1 z17.h, z3.h, z3.h +; CHECK-NEXT: ldp q3, q18, [x0, #128] +; CHECK-NEXT: uzp1 z16.h, z2.h, z2.h +; CHECK-NEXT: ldp q2, q19, [x0, #192] ; CHECK-NEXT: ldp q0, q1, [x0, #64] -; CHECK-NEXT: ldp q6, q7, [x0, #96] -; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h -; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h -; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h -; CHECK-NEXT: ldp q16, q17, [x0] -; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h -; CHECK-NEXT: ldp q18, q19, [x0, #192] -; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h -; CHECK-NEXT: ldp q20, q21, [x0, #224] -; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h -; CHECK-NEXT: ldp q22, q23, [x0, #32] -; CHECK-NEXT: splice z4.h, p0, z4.h, z5.h -; CHECK-NEXT: uzp1 z19.h, z19.h, z19.h -; CHECK-NEXT: uzp1 z18.h, z18.h, z18.h -; CHECK-NEXT: uzp1 z17.h, z17.h, z17.h -; CHECK-NEXT: uzp1 z3.h, z21.h, z21.h -; CHECK-NEXT: uzp1 z5.h, z20.h, z20.h -; CHECK-NEXT: uzp1 z16.h, z16.h, z16.h -; CHECK-NEXT: uzp1 z20.h, z23.h, z23.h -; CHECK-NEXT: uzp1 z21.h, z22.h, z22.h -; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: splice z18.h, p0, z18.h, z19.h -; CHECK-NEXT: splice z5.h, p0, z5.h, z3.h -; CHECK-NEXT: splice z16.h, p0, z16.h, z17.h -; CHECK-NEXT: splice z21.h, p0, z21.h, z20.h -; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h -; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: uzp1 z1.b, z2.b, z2.b -; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b +; CHECK-NEXT: uzp1 z21.h, z18.h, z18.h +; CHECK-NEXT: ldp q18, q22, [x0, #224] +; CHECK-NEXT: uzp1 z20.h, z3.h, z3.h +; CHECK-NEXT: ldp q3, q23, [x0, #32] +; CHECK-NEXT: splice z16.h, p0, { z16.h, z17.h } +; CHECK-NEXT: uzp1 z27.h, z19.h, z19.h +; CHECK-NEXT: uzp1 z25.h, z22.h, z22.h +; CHECK-NEXT: uzp1 z26.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z24.h, z18.h, z18.h +; CHECK-NEXT: uzp1 z18.h, z23.h, z23.h +; CHECK-NEXT: uzp1 z23.h, z5.h, z5.h +; CHECK-NEXT: uzp1 z17.h, z3.h, z3.h +; CHECK-NEXT: uzp1 z3.h, z7.h, z7.h +; CHECK-NEXT: uzp1 z22.h, z4.h, z4.h +; CHECK-NEXT: uzp1 z2.h, z6.h, z6.h +; CHECK-NEXT: uzp1 z5.h, z1.h, z1.h +; CHECK-NEXT: splice z1.h, p0, { z20.h, z21.h } +; CHECK-NEXT: splice z6.h, p0, { z24.h, z25.h } +; CHECK-NEXT: uzp1 z4.h, z0.h, z0.h +; CHECK-NEXT: splice z0.h, p0, { z26.h, z27.h } +; CHECK-NEXT: splice z7.h, p0, { z17.h, z18.h } +; CHECK-NEXT: uzp1 z17.b, z16.b, z16.b +; CHECK-NEXT: splice z2.h, p0, { z2.h, z3.h } +; CHECK-NEXT: splice z3.h, p0, { z22.h, z23.h } +; CHECK-NEXT: splice z4.h, p0, { z4.h, z5.h } +; CHECK-NEXT: uzp1 z16.b, z1.b, z1.b ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z4.b, z18.b, z18.b -; CHECK-NEXT: uzp1 z3.b, z5.b, z5.b -; CHECK-NEXT: uzp1 z7.b, z16.b, z16.b -; CHECK-NEXT: uzp1 z5.b, z21.b, z21.b -; CHECK-NEXT: splice z2.b, p0, z2.b, z1.b -; CHECK-NEXT: uzp1 z1.b, z6.b, z6.b -; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: splice z4.b, p0, z4.b, z3.b -; CHECK-NEXT: splice z7.b, p0, z7.b, z5.b -; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b -; CHECK-NEXT: add z1.b, z2.b, z2.b -; CHECK-NEXT: add z2.b, z4.b, z4.b -; CHECK-NEXT: add z3.b, z7.b, z7.b +; CHECK-NEXT: uzp1 z6.b, z6.b, z6.b +; CHECK-NEXT: uzp1 z5.b, z0.b, z0.b +; CHECK-NEXT: uzp1 z1.b, z7.b, z7.b +; CHECK-NEXT: uzp1 z0.b, z2.b, z2.b +; CHECK-NEXT: uzp1 z3.b, z3.b, z3.b +; CHECK-NEXT: splice z7.b, p0, { z16.b, z17.b } +; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b +; CHECK-NEXT: splice z4.b, p0, { z5.b, z6.b } +; CHECK-NEXT: splice z0.b, p0, { z0.b, z1.b } +; CHECK-NEXT: splice z1.b, p0, { z2.b, z3.b } +; CHECK-NEXT: add z2.b, z7.b, z7.b +; CHECK-NEXT: add z3.b, z4.b, z4.b ; CHECK-NEXT: add z0.b, z0.b, z0.b -; CHECK-NEXT: stp q1, q2, [x1, #32] -; CHECK-NEXT: stp q3, q0, [x1] +; CHECK-NEXT: add z1.b, z1.b, z1.b +; CHECK-NEXT: stp q2, q3, [x1, #32] +; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: trunc_v64i32_v64i8: @@ -1765,11 +1765,11 @@ define void @trunc_v64i32_v64i8(ptr %in, ptr %out) nounwind { define <8 x i16> @trunc_v8i32_v8i16(ptr %in) nounwind { ; CHECK-LABEL: trunc_v8i32_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h +; CHECK-NEXT: uzp1 z3.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z2.h, z1.h, z1.h +; CHECK-NEXT: splice z0.h, p0, { z2.h, z3.h } ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; @@ -1801,18 +1801,18 @@ define <8 x i16> @trunc_v8i32_v8i16(ptr %in) nounwind { define void @trunc_v16i32_v16i16(ptr %in, ptr %out) nounwind { ; CHECK-LABEL: trunc_v16i32_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ldp q1, q0, [x0, #32] ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h -; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h -; CHECK-NEXT: add z0.h, z0.h, z0.h +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: uzp1 z5.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z4.h, z1.h, z1.h +; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z0.h, z3.h, z3.h +; CHECK-NEXT: splice z2.h, p0, { z4.h, z5.h } +; CHECK-NEXT: splice z0.h, p0, { z0.h, z1.h } ; CHECK-NEXT: add z1.h, z2.h, z2.h -; CHECK-NEXT: stp q1, q0, [x1] +; CHECK-NEXT: add z0.h, z0.h, z0.h +; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: trunc_v16i32_v16i16: @@ -1877,27 +1877,27 @@ define void @trunc_v16i32_v16i16(ptr %in, ptr %out) nounwind { define void @trunc_v32i32_v32i16(ptr %in, ptr %out) nounwind { ; CHECK-LABEL: trunc_v32i32_v32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #64] +; CHECK-NEXT: ldp q1, q0, [x0, #64] ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: ldp q4, q5, [x0, #96] -; CHECK-NEXT: ldp q6, q7, [x0, #32] -; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h -; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h -; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h -; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h -; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h -; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h -; CHECK-NEXT: splice z4.h, p0, z4.h, z5.h -; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h +; CHECK-NEXT: ldp q2, q3, [x0, #96] +; CHECK-NEXT: ldp q4, q5, [x0] +; CHECK-NEXT: uzp1 z7.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z6.h, z1.h, z1.h +; CHECK-NEXT: ldp q1, q0, [x0, #32] +; CHECK-NEXT: uzp1 z17.h, z3.h, z3.h +; CHECK-NEXT: uzp1 z16.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z3.h, z5.h, z5.h +; CHECK-NEXT: uzp1 z2.h, z4.h, z4.h +; CHECK-NEXT: uzp1 z5.h, z0.h, z0.h +; CHECK-NEXT: splice z0.h, p0, { z6.h, z7.h } +; CHECK-NEXT: uzp1 z4.h, z1.h, z1.h +; CHECK-NEXT: splice z1.h, p0, { z16.h, z17.h } +; CHECK-NEXT: splice z2.h, p0, { z2.h, z3.h } +; CHECK-NEXT: splice z3.h, p0, { z4.h, z5.h } ; CHECK-NEXT: add z0.h, z0.h, z0.h +; CHECK-NEXT: add z1.h, z1.h, z1.h ; CHECK-NEXT: add z2.h, z2.h, z2.h -; CHECK-NEXT: add z1.h, z4.h, z4.h -; CHECK-NEXT: add z3.h, z6.h, z6.h +; CHECK-NEXT: add z3.h, z3.h, z3.h ; CHECK-NEXT: stp q0, q1, [x1, #32] ; CHECK-NEXT: stp q2, q3, [x1] ; CHECK-NEXT: ret @@ -2027,49 +2027,49 @@ define void @trunc_v64i32_v64i16(ptr %in, ptr %out) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q2, q3, [x0, #192] ; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: ldp q4, q5, [x0] ; CHECK-NEXT: ldp q6, q7, [x0, #64] -; CHECK-NEXT: ldp q16, q17, [x0, #224] -; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h -; CHECK-NEXT: ldp q20, q21, [x0, #160] -; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h +; CHECK-NEXT: uzp1 z17.h, z3.h, z3.h +; CHECK-NEXT: ldp q3, q18, [x0, #224] +; CHECK-NEXT: uzp1 z16.h, z2.h, z2.h +; CHECK-NEXT: ldp q2, q19, [x0, #128] ; CHECK-NEXT: ldp q0, q1, [x0, #32] -; CHECK-NEXT: uzp1 z17.h, z17.h, z17.h -; CHECK-NEXT: ldp q4, q5, [x0, #96] -; CHECK-NEXT: uzp1 z16.h, z16.h, z16.h -; CHECK-NEXT: ldp q18, q19, [x0, #128] -; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h -; CHECK-NEXT: uzp1 z3.h, z21.h, z21.h -; CHECK-NEXT: uzp1 z20.h, z20.h, z20.h -; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h -; CHECK-NEXT: ldp q21, q22, [x0] -; CHECK-NEXT: splice z16.h, p0, z16.h, z17.h +; CHECK-NEXT: uzp1 z21.h, z18.h, z18.h +; CHECK-NEXT: ldp q18, q22, [x0, #160] +; CHECK-NEXT: uzp1 z20.h, z3.h, z3.h +; CHECK-NEXT: uzp1 z24.h, z19.h, z19.h +; CHECK-NEXT: ldp q3, q19, [x0, #96] +; CHECK-NEXT: uzp1 z23.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z26.h, z22.h, z22.h +; CHECK-NEXT: splice z2.h, p0, { z16.h, z17.h } +; CHECK-NEXT: uzp1 z17.h, z7.h, z7.h +; CHECK-NEXT: uzp1 z25.h, z18.h, z18.h +; CHECK-NEXT: splice z7.h, p0, { z20.h, z21.h } +; CHECK-NEXT: uzp1 z21.h, z5.h, z5.h ; CHECK-NEXT: uzp1 z19.h, z19.h, z19.h -; CHECK-NEXT: uzp1 z18.h, z18.h, z18.h -; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h -; CHECK-NEXT: splice z20.h, p0, z20.h, z3.h -; CHECK-NEXT: uzp1 z3.h, z5.h, z5.h -; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h -; CHECK-NEXT: uzp1 z5.h, z22.h, z22.h -; CHECK-NEXT: uzp1 z7.h, z21.h, z21.h -; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: splice z18.h, p0, z18.h, z19.h -; CHECK-NEXT: add z2.h, z2.h, z2.h -; CHECK-NEXT: splice z4.h, p0, z4.h, z3.h -; CHECK-NEXT: add z3.h, z16.h, z16.h -; CHECK-NEXT: splice z7.h, p0, z7.h, z5.h -; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: add z1.h, z20.h, z20.h -; CHECK-NEXT: add z5.h, z18.h, z18.h -; CHECK-NEXT: stp q2, q3, [x1, #96] -; CHECK-NEXT: add z2.h, z6.h, z6.h +; CHECK-NEXT: uzp1 z20.h, z4.h, z4.h +; CHECK-NEXT: uzp1 z5.h, z1.h, z1.h +; CHECK-NEXT: uzp1 z16.h, z6.h, z6.h +; CHECK-NEXT: splice z6.h, p0, { z23.h, z24.h } +; CHECK-NEXT: uzp1 z18.h, z3.h, z3.h +; CHECK-NEXT: splice z3.h, p0, { z25.h, z26.h } +; CHECK-NEXT: uzp1 z4.h, z0.h, z0.h +; CHECK-NEXT: add z0.h, z2.h, z2.h +; CHECK-NEXT: add z7.h, z7.h, z7.h +; CHECK-NEXT: splice z1.h, p0, { z16.h, z17.h } +; CHECK-NEXT: splice z2.h, p0, { z18.h, z19.h } +; CHECK-NEXT: splice z16.h, p0, { z20.h, z21.h } +; CHECK-NEXT: splice z4.h, p0, { z4.h, z5.h } +; CHECK-NEXT: add z6.h, z6.h, z6.h +; CHECK-NEXT: add z3.h, z3.h, z3.h +; CHECK-NEXT: stp q0, q7, [x1, #96] +; CHECK-NEXT: add z0.h, z1.h, z1.h +; CHECK-NEXT: add z1.h, z2.h, z2.h +; CHECK-NEXT: add z2.h, z16.h, z16.h +; CHECK-NEXT: stp q6, q3, [x1, #64] ; CHECK-NEXT: add z3.h, z4.h, z4.h -; CHECK-NEXT: add z4.h, z7.h, z7.h -; CHECK-NEXT: add z0.h, z0.h, z0.h -; CHECK-NEXT: stp q5, q1, [x1, #64] -; CHECK-NEXT: stp q2, q3, [x1, #32] -; CHECK-NEXT: stp q4, q0, [x1] +; CHECK-NEXT: stp q0, q1, [x1, #32] +; CHECK-NEXT: stp q2, q3, [x1] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: trunc_v64i32_v64i16: @@ -2360,11 +2360,11 @@ define void @trunc_v64i32_v64i16(ptr %in, ptr %out) nounwind { define <4 x i8> @trunc_v4i64_v4i8(ptr %in) nounwind { ; CHECK-LABEL: trunc_v4i64_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s +; CHECK-NEXT: uzp1 z3.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z2.s, z1.s, z1.s +; CHECK-NEXT: splice z0.s, p0, { z2.s, z3.s } ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret @@ -2392,18 +2392,18 @@ define <8 x i8> @trunc_v8i64_v8i8(ptr %in) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0, #32] ; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s -; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s -; CHECK-NEXT: splice z1.s, p0, z1.s, z0.s -; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: uzp1 z5.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z4.s, z1.s, z1.s +; CHECK-NEXT: uzp1 z1.s, z2.s, z2.s +; CHECK-NEXT: uzp1 z0.s, z3.s, z3.s +; CHECK-NEXT: splice z2.s, p0, { z4.s, z5.s } +; CHECK-NEXT: splice z0.s, p0, { z0.s, z1.s } ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z0.h, z1.h, z1.h -; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h -; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h -; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z1.h, z0.h, z0.h +; CHECK-NEXT: splice z0.h, p0, { z1.h, z2.h } +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; @@ -2439,34 +2439,34 @@ define <8 x i8> @trunc_v8i64_v8i8(ptr %in) nounwind { define <16 x i8> @trunc_v16i64_v16i8(ptr %in) nounwind { ; CHECK-LABEL: trunc_v16i64_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ldp q0, q1, [x0, #96] ; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: ldp q2, q3, [x0, #96] +; CHECK-NEXT: ldp q2, q3, [x0, #32] ; CHECK-NEXT: ldp q4, q5, [x0, #64] ; CHECK-NEXT: ldp q6, q7, [x0] -; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s -; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s -; CHECK-NEXT: uzp1 z4.s, z4.s, z4.s -; CHECK-NEXT: uzp1 z7.s, z7.s, z7.s -; CHECK-NEXT: uzp1 z6.s, z6.s, z6.s -; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s -; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: splice z4.s, p0, z4.s, z5.s -; CHECK-NEXT: splice z6.s, p0, z6.s, z7.s +; CHECK-NEXT: uzp1 z17.s, z1.s, z1.s +; CHECK-NEXT: uzp1 z16.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z19.s, z3.s, z3.s +; CHECK-NEXT: uzp1 z1.s, z5.s, z5.s +; CHECK-NEXT: uzp1 z18.s, z2.s, z2.s +; CHECK-NEXT: uzp1 z0.s, z4.s, z4.s +; CHECK-NEXT: uzp1 z3.s, z7.s, z7.s +; CHECK-NEXT: uzp1 z2.s, z6.s, z6.s +; CHECK-NEXT: splice z4.s, p0, { z16.s, z17.s } +; CHECK-NEXT: splice z0.s, p0, { z0.s, z1.s } +; CHECK-NEXT: splice z1.s, p0, { z18.s, z19.s } +; CHECK-NEXT: splice z2.s, p0, { z2.s, z3.s } ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: uzp1 z2.h, z4.h, z4.h -; CHECK-NEXT: uzp1 z3.h, z6.h, z6.h -; CHECK-NEXT: splice z2.h, p0, z2.h, z1.h -; CHECK-NEXT: splice z3.h, p0, z3.h, z0.h +; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h +; CHECK-NEXT: uzp1 z3.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: uzp1 z0.h, z2.h, z2.h +; CHECK-NEXT: splice z2.h, p0, { z3.h, z4.h } +; CHECK-NEXT: splice z0.h, p0, { z0.h, z1.h } ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z1.b, z2.b, z2.b -; CHECK-NEXT: uzp1 z0.b, z3.b, z3.b -; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b +; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b +; CHECK-NEXT: uzp1 z1.b, z0.b, z0.b +; CHECK-NEXT: splice z0.b, p0, { z1.b, z2.b } ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; @@ -2523,62 +2523,62 @@ define <16 x i8> @trunc_v16i64_v16i8(ptr %in) nounwind { define void @trunc_v32i64_v32i8(ptr %in, ptr %out) nounwind { ; CHECK-LABEL: trunc_v32i64_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q5, q6, [x0, #224] ; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: ldp q2, q3, [x0, #224] -; CHECK-NEXT: ldp q4, q5, [x0, #32] -; CHECK-NEXT: ldp q6, q7, [x0, #64] -; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: ldp q16, q17, [x0, #192] -; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s -; CHECK-NEXT: ldp q18, q19, [x0, #128] -; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s -; CHECK-NEXT: ldp q20, q21, [x0, #160] -; CHECK-NEXT: uzp1 z7.s, z7.s, z7.s -; CHECK-NEXT: ldp q22, q23, [x0, #96] -; CHECK-NEXT: uzp1 z17.s, z17.s, z17.s -; CHECK-NEXT: uzp1 z16.s, z16.s, z16.s -; CHECK-NEXT: uzp1 z19.s, z19.s, z19.s -; CHECK-NEXT: uzp1 z18.s, z18.s, z18.s -; CHECK-NEXT: uzp1 z21.s, z21.s, z21.s -; CHECK-NEXT: uzp1 z20.s, z20.s, z20.s -; CHECK-NEXT: uzp1 z6.s, z6.s, z6.s -; CHECK-NEXT: uzp1 z23.s, z23.s, z23.s -; CHECK-NEXT: uzp1 z22.s, z22.s, z22.s -; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s -; CHECK-NEXT: uzp1 z4.s, z4.s, z4.s -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s -; CHECK-NEXT: splice z16.s, p0, z16.s, z17.s -; CHECK-NEXT: splice z20.s, p0, z20.s, z21.s -; CHECK-NEXT: splice z18.s, p0, z18.s, z19.s -; CHECK-NEXT: splice z22.s, p0, z22.s, z23.s -; CHECK-NEXT: splice z6.s, p0, z6.s, z7.s -; CHECK-NEXT: splice z4.s, p0, z4.s, z5.s -; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s +; CHECK-NEXT: ldp q2, q3, [x0, #32] +; CHECK-NEXT: ldp q4, q7, [x0, #64] +; CHECK-NEXT: uzp1 z17.s, z6.s, z6.s +; CHECK-NEXT: ldp q6, q18, [x0, #192] +; CHECK-NEXT: uzp1 z16.s, z5.s, z5.s +; CHECK-NEXT: ldp q5, q19, [x0, #128] +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: uzp1 z21.s, z18.s, z18.s +; CHECK-NEXT: ldp q18, q22, [x0, #160] +; CHECK-NEXT: uzp1 z20.s, z6.s, z6.s +; CHECK-NEXT: ldp q6, q23, [x0, #96] +; CHECK-NEXT: splice z16.s, p0, { z16.s, z17.s } +; CHECK-NEXT: uzp1 z27.s, z19.s, z19.s +; CHECK-NEXT: uzp1 z25.s, z22.s, z22.s +; CHECK-NEXT: uzp1 z26.s, z5.s, z5.s +; CHECK-NEXT: uzp1 z24.s, z18.s, z18.s +; CHECK-NEXT: uzp1 z18.s, z23.s, z23.s +; CHECK-NEXT: uzp1 z23.s, z3.s, z3.s +; CHECK-NEXT: uzp1 z17.s, z6.s, z6.s +; CHECK-NEXT: uzp1 z6.s, z7.s, z7.s +; CHECK-NEXT: uzp1 z22.s, z2.s, z2.s +; CHECK-NEXT: uzp1 z5.s, z4.s, z4.s +; CHECK-NEXT: uzp1 z2.s, z1.s, z1.s +; CHECK-NEXT: splice z3.s, p0, { z20.s, z21.s } +; CHECK-NEXT: uzp1 z1.s, z0.s, z0.s +; CHECK-NEXT: splice z0.s, p0, { z24.s, z25.s } +; CHECK-NEXT: splice z7.s, p0, { z26.s, z27.s } +; CHECK-NEXT: splice z4.s, p0, { z17.s, z18.s } +; CHECK-NEXT: uzp1 z17.h, z16.h, z16.h +; CHECK-NEXT: splice z5.s, p0, { z5.s, z6.s } +; CHECK-NEXT: splice z6.s, p0, { z22.s, z23.s } +; CHECK-NEXT: splice z1.s, p0, { z1.s, z2.s } +; CHECK-NEXT: uzp1 z16.h, z3.h, z3.h ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h -; CHECK-NEXT: uzp1 z2.h, z16.h, z16.h -; CHECK-NEXT: uzp1 z3.h, z20.h, z20.h -; CHECK-NEXT: uzp1 z5.h, z18.h, z18.h -; CHECK-NEXT: uzp1 z7.h, z22.h, z22.h -; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h -; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: splice z2.h, p0, z2.h, z1.h -; CHECK-NEXT: splice z5.h, p0, z5.h, z3.h -; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h -; CHECK-NEXT: splice z0.h, p0, z0.h, z4.h +; CHECK-NEXT: uzp1 z3.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z19.h, z4.h, z4.h +; CHECK-NEXT: uzp1 z2.h, z7.h, z7.h +; CHECK-NEXT: uzp1 z18.h, z5.h, z5.h +; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h +; CHECK-NEXT: splice z0.h, p0, { z16.h, z17.h } +; CHECK-NEXT: uzp1 z4.h, z1.h, z1.h +; CHECK-NEXT: splice z1.h, p0, { z2.h, z3.h } +; CHECK-NEXT: splice z2.h, p0, { z18.h, z19.h } +; CHECK-NEXT: splice z3.h, p0, { z4.h, z5.h } +; CHECK-NEXT: uzp1 z5.b, z0.b, z0.b ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z1.b, z2.b, z2.b -; CHECK-NEXT: uzp1 z2.b, z5.b, z5.b -; CHECK-NEXT: uzp1 z3.b, z6.b, z6.b -; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: splice z2.b, p0, z2.b, z1.b -; CHECK-NEXT: splice z0.b, p0, z0.b, z3.b -; CHECK-NEXT: add z1.b, z2.b, z2.b +; CHECK-NEXT: uzp1 z4.b, z1.b, z1.b +; CHECK-NEXT: uzp1 z7.b, z2.b, z2.b +; CHECK-NEXT: uzp1 z6.b, z3.b, z3.b +; CHECK-NEXT: splice z0.b, p0, { z4.b, z5.b } +; CHECK-NEXT: splice z1.b, p0, { z6.b, z7.b } ; CHECK-NEXT: add z0.b, z0.b, z0.b -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: add z1.b, z1.b, z1.b +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: trunc_v32i64_v32i8: @@ -2731,11 +2731,11 @@ define void @trunc_v32i64_v32i8(ptr %in, ptr %out) nounwind { define <4 x i16> @trunc_v4i64_v4i16(ptr %in) nounwind { ; CHECK-LABEL: trunc_v4i64_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s +; CHECK-NEXT: uzp1 z3.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z2.s, z1.s, z1.s +; CHECK-NEXT: splice z0.s, p0, { z2.s, z3.s } ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret @@ -2763,17 +2763,17 @@ define <8 x i16> @trunc_v8i64_v8i16(ptr %in) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0, #32] ; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s -; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s -; CHECK-NEXT: splice z1.s, p0, z1.s, z0.s -; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: uzp1 z5.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z4.s, z1.s, z1.s +; CHECK-NEXT: uzp1 z1.s, z2.s, z2.s +; CHECK-NEXT: uzp1 z0.s, z3.s, z3.s +; CHECK-NEXT: splice z2.s, p0, { z4.s, z5.s } +; CHECK-NEXT: splice z0.s, p0, { z0.s, z1.s } ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h -; CHECK-NEXT: uzp1 z0.h, z2.h, z2.h -; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z1.h, z0.h, z0.h +; CHECK-NEXT: splice z0.h, p0, { z1.h, z2.h } ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; @@ -2810,32 +2810,32 @@ define <8 x i16> @trunc_v8i64_v8i16(ptr %in) nounwind { define void @trunc_v16i64_v16i16(ptr %in, ptr %out) nounwind { ; CHECK-LABEL: trunc_v16i64_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ldp q0, q1, [x0, #96] ; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: ldp q2, q3, [x0, #96] +; CHECK-NEXT: ldp q2, q3, [x0, #32] ; CHECK-NEXT: ldp q4, q5, [x0, #64] ; CHECK-NEXT: ldp q6, q7, [x0] -; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s -; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s -; CHECK-NEXT: uzp1 z4.s, z4.s, z4.s -; CHECK-NEXT: uzp1 z7.s, z7.s, z7.s -; CHECK-NEXT: uzp1 z6.s, z6.s, z6.s -; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s -; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: splice z4.s, p0, z4.s, z5.s -; CHECK-NEXT: splice z6.s, p0, z6.s, z7.s +; CHECK-NEXT: uzp1 z17.s, z1.s, z1.s +; CHECK-NEXT: uzp1 z16.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z1.s, z3.s, z3.s +; CHECK-NEXT: uzp1 z19.s, z5.s, z5.s +; CHECK-NEXT: uzp1 z0.s, z2.s, z2.s +; CHECK-NEXT: uzp1 z3.s, z7.s, z7.s +; CHECK-NEXT: uzp1 z18.s, z4.s, z4.s +; CHECK-NEXT: uzp1 z2.s, z6.s, z6.s +; CHECK-NEXT: splice z4.s, p0, { z16.s, z17.s } +; CHECK-NEXT: splice z0.s, p0, { z0.s, z1.s } +; CHECK-NEXT: splice z5.s, p0, { z18.s, z19.s } +; CHECK-NEXT: splice z1.s, p0, { z2.s, z3.s } ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: uzp1 z2.h, z4.h, z4.h -; CHECK-NEXT: uzp1 z3.h, z6.h, z6.h -; CHECK-NEXT: splice z2.h, p0, z2.h, z1.h -; CHECK-NEXT: splice z3.h, p0, z3.h, z0.h -; CHECK-NEXT: add z0.h, z2.h, z2.h -; CHECK-NEXT: add z1.h, z3.h, z3.h +; CHECK-NEXT: uzp1 z3.h, z4.h, z4.h +; CHECK-NEXT: uzp1 z7.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z2.h, z5.h, z5.h +; CHECK-NEXT: uzp1 z6.h, z1.h, z1.h +; CHECK-NEXT: splice z0.h, p0, { z2.h, z3.h } +; CHECK-NEXT: splice z1.h, p0, { z6.h, z7.h } +; CHECK-NEXT: add z0.h, z0.h, z0.h +; CHECK-NEXT: add z1.h, z1.h, z1.h ; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret ; @@ -2915,56 +2915,56 @@ define void @trunc_v32i64_v32i16(ptr %in, ptr %out) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q2, q3, [x0, #160] ; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: ldp q4, q5, [x0, #128] +; CHECK-NEXT: ldp q4, q5, [x0, #96] +; CHECK-NEXT: ldp q6, q7, [x0] +; CHECK-NEXT: uzp1 z17.s, z3.s, z3.s +; CHECK-NEXT: ldp q3, q18, [x0, #128] +; CHECK-NEXT: uzp1 z16.s, z2.s, z2.s +; CHECK-NEXT: ldp q2, q19, [x0, #192] ; CHECK-NEXT: ldp q0, q1, [x0, #64] -; CHECK-NEXT: ldp q6, q7, [x0, #96] -; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s -; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s -; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s -; CHECK-NEXT: uzp1 z4.s, z4.s, z4.s -; CHECK-NEXT: ldp q16, q17, [x0] -; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: ldp q18, q19, [x0, #192] -; CHECK-NEXT: uzp1 z7.s, z7.s, z7.s -; CHECK-NEXT: ldp q20, q21, [x0, #224] -; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s -; CHECK-NEXT: ldp q22, q23, [x0, #32] -; CHECK-NEXT: splice z4.s, p0, z4.s, z5.s -; CHECK-NEXT: uzp1 z19.s, z19.s, z19.s -; CHECK-NEXT: uzp1 z18.s, z18.s, z18.s -; CHECK-NEXT: uzp1 z17.s, z17.s, z17.s -; CHECK-NEXT: uzp1 z3.s, z21.s, z21.s -; CHECK-NEXT: uzp1 z5.s, z20.s, z20.s -; CHECK-NEXT: uzp1 z16.s, z16.s, z16.s -; CHECK-NEXT: uzp1 z20.s, z23.s, z23.s -; CHECK-NEXT: uzp1 z21.s, z22.s, z22.s -; CHECK-NEXT: uzp1 z6.s, z6.s, z6.s -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: splice z18.s, p0, z18.s, z19.s -; CHECK-NEXT: splice z5.s, p0, z5.s, z3.s -; CHECK-NEXT: splice z16.s, p0, z16.s, z17.s -; CHECK-NEXT: splice z21.s, p0, z21.s, z20.s -; CHECK-NEXT: splice z6.s, p0, z6.s, z7.s -; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h -; CHECK-NEXT: uzp1 z2.h, z4.h, z4.h +; CHECK-NEXT: uzp1 z21.s, z18.s, z18.s +; CHECK-NEXT: ldp q18, q22, [x0, #224] +; CHECK-NEXT: uzp1 z20.s, z3.s, z3.s +; CHECK-NEXT: ldp q3, q23, [x0, #32] +; CHECK-NEXT: splice z16.s, p0, { z16.s, z17.s } +; CHECK-NEXT: uzp1 z27.s, z19.s, z19.s +; CHECK-NEXT: uzp1 z25.s, z22.s, z22.s +; CHECK-NEXT: uzp1 z26.s, z2.s, z2.s +; CHECK-NEXT: uzp1 z24.s, z18.s, z18.s +; CHECK-NEXT: uzp1 z18.s, z23.s, z23.s +; CHECK-NEXT: uzp1 z23.s, z5.s, z5.s +; CHECK-NEXT: uzp1 z17.s, z3.s, z3.s +; CHECK-NEXT: uzp1 z3.s, z7.s, z7.s +; CHECK-NEXT: uzp1 z22.s, z4.s, z4.s +; CHECK-NEXT: uzp1 z2.s, z6.s, z6.s +; CHECK-NEXT: uzp1 z5.s, z1.s, z1.s +; CHECK-NEXT: splice z1.s, p0, { z20.s, z21.s } +; CHECK-NEXT: splice z6.s, p0, { z24.s, z25.s } +; CHECK-NEXT: uzp1 z4.s, z0.s, z0.s +; CHECK-NEXT: splice z0.s, p0, { z26.s, z27.s } +; CHECK-NEXT: splice z7.s, p0, { z17.s, z18.s } +; CHECK-NEXT: uzp1 z17.h, z16.h, z16.h +; CHECK-NEXT: splice z2.s, p0, { z2.s, z3.s } +; CHECK-NEXT: splice z3.s, p0, { z22.s, z23.s } +; CHECK-NEXT: splice z4.s, p0, { z4.s, z5.s } +; CHECK-NEXT: uzp1 z16.h, z1.h, z1.h ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z4.h, z18.h, z18.h -; CHECK-NEXT: uzp1 z3.h, z5.h, z5.h -; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h -; CHECK-NEXT: uzp1 z5.h, z21.h, z21.h -; CHECK-NEXT: splice z2.h, p0, z2.h, z1.h -; CHECK-NEXT: uzp1 z1.h, z6.h, z6.h -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: splice z4.h, p0, z4.h, z3.h -; CHECK-NEXT: splice z7.h, p0, z7.h, z5.h -; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: add z1.h, z2.h, z2.h -; CHECK-NEXT: add z2.h, z4.h, z4.h -; CHECK-NEXT: add z3.h, z7.h, z7.h +; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h +; CHECK-NEXT: uzp1 z5.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z1.h, z7.h, z7.h +; CHECK-NEXT: uzp1 z0.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h +; CHECK-NEXT: splice z7.h, p0, { z16.h, z17.h } +; CHECK-NEXT: uzp1 z2.h, z4.h, z4.h +; CHECK-NEXT: splice z4.h, p0, { z5.h, z6.h } +; CHECK-NEXT: splice z0.h, p0, { z0.h, z1.h } +; CHECK-NEXT: splice z1.h, p0, { z2.h, z3.h } +; CHECK-NEXT: add z2.h, z7.h, z7.h +; CHECK-NEXT: add z3.h, z4.h, z4.h ; CHECK-NEXT: add z0.h, z0.h, z0.h -; CHECK-NEXT: stp q1, q2, [x1, #32] -; CHECK-NEXT: stp q3, q0, [x1] +; CHECK-NEXT: add z1.h, z1.h, z1.h +; CHECK-NEXT: stp q2, q3, [x1, #32] +; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: trunc_v32i64_v32i16: @@ -3118,11 +3118,11 @@ define void @trunc_v32i64_v32i16(ptr %in, ptr %out) nounwind { define <4 x i32> @trunc_v4i64_v4i32(ptr %in) nounwind { ; CHECK-LABEL: trunc_v4i64_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s +; CHECK-NEXT: uzp1 z3.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z2.s, z1.s, z1.s +; CHECK-NEXT: splice z0.s, p0, { z2.s, z3.s } ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; @@ -3146,18 +3146,18 @@ define <4 x i32> @trunc_v4i64_v4i32(ptr %in) nounwind { define void @trunc_v8i64_v8i32(ptr %in, ptr %out) nounwind { ; CHECK-LABEL: trunc_v8i64_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ldp q1, q0, [x0, #32] ; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s -; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s -; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s -; CHECK-NEXT: add z0.s, z0.s, z0.s +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: uzp1 z5.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z4.s, z1.s, z1.s +; CHECK-NEXT: uzp1 z1.s, z2.s, z2.s +; CHECK-NEXT: uzp1 z0.s, z3.s, z3.s +; CHECK-NEXT: splice z2.s, p0, { z4.s, z5.s } +; CHECK-NEXT: splice z0.s, p0, { z0.s, z1.s } ; CHECK-NEXT: add z1.s, z2.s, z2.s -; CHECK-NEXT: stp q1, q0, [x1] +; CHECK-NEXT: add z0.s, z0.s, z0.s +; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: trunc_v8i64_v8i32: @@ -3202,27 +3202,27 @@ define void @trunc_v8i64_v8i32(ptr %in, ptr %out) nounwind { define void @trunc_v16i64_v16i32(ptr %in, ptr %out) nounwind { ; CHECK-LABEL: trunc_v16i64_v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #64] +; CHECK-NEXT: ldp q1, q0, [x0, #64] ; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: ldp q4, q5, [x0, #96] -; CHECK-NEXT: ldp q6, q7, [x0, #32] -; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s -; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s -; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s -; CHECK-NEXT: uzp1 z4.s, z4.s, z4.s -; CHECK-NEXT: uzp1 z7.s, z7.s, z7.s -; CHECK-NEXT: uzp1 z6.s, z6.s, z6.s -; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s -; CHECK-NEXT: splice z4.s, p0, z4.s, z5.s -; CHECK-NEXT: splice z6.s, p0, z6.s, z7.s +; CHECK-NEXT: ldp q2, q3, [x0, #96] +; CHECK-NEXT: ldp q4, q5, [x0] +; CHECK-NEXT: uzp1 z7.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z6.s, z1.s, z1.s +; CHECK-NEXT: ldp q1, q0, [x0, #32] +; CHECK-NEXT: uzp1 z17.s, z3.s, z3.s +; CHECK-NEXT: uzp1 z16.s, z2.s, z2.s +; CHECK-NEXT: uzp1 z3.s, z5.s, z5.s +; CHECK-NEXT: uzp1 z2.s, z4.s, z4.s +; CHECK-NEXT: uzp1 z5.s, z0.s, z0.s +; CHECK-NEXT: splice z0.s, p0, { z6.s, z7.s } +; CHECK-NEXT: uzp1 z4.s, z1.s, z1.s +; CHECK-NEXT: splice z1.s, p0, { z16.s, z17.s } +; CHECK-NEXT: splice z2.s, p0, { z2.s, z3.s } +; CHECK-NEXT: splice z3.s, p0, { z4.s, z5.s } ; CHECK-NEXT: add z0.s, z0.s, z0.s +; CHECK-NEXT: add z1.s, z1.s, z1.s ; CHECK-NEXT: add z2.s, z2.s, z2.s -; CHECK-NEXT: add z1.s, z4.s, z4.s -; CHECK-NEXT: add z3.s, z6.s, z6.s +; CHECK-NEXT: add z3.s, z3.s, z3.s ; CHECK-NEXT: stp q0, q1, [x1, #32] ; CHECK-NEXT: stp q2, q3, [x1] ; CHECK-NEXT: ret @@ -3297,49 +3297,49 @@ define void @trunc_v32i64_v32i32(ptr %in, ptr %out) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q2, q3, [x0, #192] ; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: ldp q4, q5, [x0] ; CHECK-NEXT: ldp q6, q7, [x0, #64] -; CHECK-NEXT: ldp q16, q17, [x0, #224] -; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s -; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s -; CHECK-NEXT: ldp q20, q21, [x0, #160] -; CHECK-NEXT: uzp1 z7.s, z7.s, z7.s +; CHECK-NEXT: uzp1 z17.s, z3.s, z3.s +; CHECK-NEXT: ldp q3, q18, [x0, #224] +; CHECK-NEXT: uzp1 z16.s, z2.s, z2.s +; CHECK-NEXT: ldp q2, q19, [x0, #128] ; CHECK-NEXT: ldp q0, q1, [x0, #32] -; CHECK-NEXT: uzp1 z17.s, z17.s, z17.s -; CHECK-NEXT: ldp q4, q5, [x0, #96] -; CHECK-NEXT: uzp1 z16.s, z16.s, z16.s -; CHECK-NEXT: ldp q18, q19, [x0, #128] -; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s -; CHECK-NEXT: uzp1 z3.s, z21.s, z21.s -; CHECK-NEXT: uzp1 z20.s, z20.s, z20.s -; CHECK-NEXT: uzp1 z6.s, z6.s, z6.s -; CHECK-NEXT: ldp q21, q22, [x0] -; CHECK-NEXT: splice z16.s, p0, z16.s, z17.s +; CHECK-NEXT: uzp1 z21.s, z18.s, z18.s +; CHECK-NEXT: ldp q18, q22, [x0, #160] +; CHECK-NEXT: uzp1 z20.s, z3.s, z3.s +; CHECK-NEXT: uzp1 z24.s, z19.s, z19.s +; CHECK-NEXT: ldp q3, q19, [x0, #96] +; CHECK-NEXT: uzp1 z23.s, z2.s, z2.s +; CHECK-NEXT: uzp1 z26.s, z22.s, z22.s +; CHECK-NEXT: splice z2.s, p0, { z16.s, z17.s } +; CHECK-NEXT: uzp1 z17.s, z7.s, z7.s +; CHECK-NEXT: uzp1 z25.s, z18.s, z18.s +; CHECK-NEXT: splice z7.s, p0, { z20.s, z21.s } +; CHECK-NEXT: uzp1 z21.s, z5.s, z5.s ; CHECK-NEXT: uzp1 z19.s, z19.s, z19.s -; CHECK-NEXT: uzp1 z18.s, z18.s, z18.s -; CHECK-NEXT: uzp1 z4.s, z4.s, z4.s -; CHECK-NEXT: splice z20.s, p0, z20.s, z3.s -; CHECK-NEXT: uzp1 z3.s, z5.s, z5.s -; CHECK-NEXT: splice z6.s, p0, z6.s, z7.s -; CHECK-NEXT: uzp1 z5.s, z22.s, z22.s -; CHECK-NEXT: uzp1 z7.s, z21.s, z21.s -; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: splice z18.s, p0, z18.s, z19.s -; CHECK-NEXT: add z2.s, z2.s, z2.s -; CHECK-NEXT: splice z4.s, p0, z4.s, z3.s -; CHECK-NEXT: add z3.s, z16.s, z16.s -; CHECK-NEXT: splice z7.s, p0, z7.s, z5.s -; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: add z1.s, z20.s, z20.s -; CHECK-NEXT: add z5.s, z18.s, z18.s -; CHECK-NEXT: stp q2, q3, [x1, #96] -; CHECK-NEXT: add z2.s, z6.s, z6.s +; CHECK-NEXT: uzp1 z20.s, z4.s, z4.s +; CHECK-NEXT: uzp1 z5.s, z1.s, z1.s +; CHECK-NEXT: uzp1 z16.s, z6.s, z6.s +; CHECK-NEXT: splice z6.s, p0, { z23.s, z24.s } +; CHECK-NEXT: uzp1 z18.s, z3.s, z3.s +; CHECK-NEXT: splice z3.s, p0, { z25.s, z26.s } +; CHECK-NEXT: uzp1 z4.s, z0.s, z0.s +; CHECK-NEXT: add z0.s, z2.s, z2.s +; CHECK-NEXT: add z7.s, z7.s, z7.s +; CHECK-NEXT: splice z1.s, p0, { z16.s, z17.s } +; CHECK-NEXT: splice z2.s, p0, { z18.s, z19.s } +; CHECK-NEXT: splice z16.s, p0, { z20.s, z21.s } +; CHECK-NEXT: splice z4.s, p0, { z4.s, z5.s } +; CHECK-NEXT: add z6.s, z6.s, z6.s +; CHECK-NEXT: add z3.s, z3.s, z3.s +; CHECK-NEXT: stp q0, q7, [x1, #96] +; CHECK-NEXT: add z0.s, z1.s, z1.s +; CHECK-NEXT: add z1.s, z2.s, z2.s +; CHECK-NEXT: add z2.s, z16.s, z16.s +; CHECK-NEXT: stp q6, q3, [x1, #64] ; CHECK-NEXT: add z3.s, z4.s, z4.s -; CHECK-NEXT: add z4.s, z7.s, z7.s -; CHECK-NEXT: add z0.s, z0.s, z0.s -; CHECK-NEXT: stp q5, q1, [x1, #64] -; CHECK-NEXT: stp q2, q3, [x1, #32] -; CHECK-NEXT: stp q4, q0, [x1] +; CHECK-NEXT: stp q0, q1, [x1, #32] +; CHECK-NEXT: stp q2, q3, [x1] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: trunc_v32i64_v32i32: