diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 9775238027650..509dd8b73a017 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -2133,21 +2133,37 @@ let Predicates = [HasSVE_or_SME] in { (LASTB_VPZ_D (PTRUE_D 31), ZPR:$Z1), dsub))>; // Splice with lane bigger or equal to 0 - foreach VT = [nxv16i8] in + foreach VT = [nxv16i8] in { def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_255 i32:$index)))), (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; + let AddedComplexity = 1 in + def : Pat<(VT (vector_splice VT:$Z1, VT:$Z1, (i64 (sve_ext_imm_0_255 i32:$index)))), + (EXT_ZZI_CONSTRUCTIVE ZPR:$Z1, imm0_255:$index)>; + } - foreach VT = [nxv8i16, nxv8f16, nxv8bf16] in + foreach VT = [nxv8i16, nxv8f16, nxv8bf16] in { def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_127 i32:$index)))), (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; + let AddedComplexity = 1 in + def : Pat<(VT (vector_splice VT:$Z1, VT:$Z1, (i64 (sve_ext_imm_0_127 i32:$index)))), + (EXT_ZZI_CONSTRUCTIVE ZPR:$Z1, imm0_255:$index)>; + } - foreach VT = [nxv4i32, nxv4f16, nxv4f32, nxv4bf16] in + foreach VT = [nxv4i32, nxv4f16, nxv4f32, nxv4bf16] in { def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_63 i32:$index)))), (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; + let AddedComplexity = 1 in + def : Pat<(VT (vector_splice VT:$Z1, VT:$Z1, (i64 (sve_ext_imm_0_63 i32:$index)))), + (EXT_ZZI_CONSTRUCTIVE ZPR:$Z1, imm0_255:$index)>; + } - foreach VT = [nxv2i64, nxv2f16, nxv2f32, nxv2f64, nxv2bf16] in + foreach VT = [nxv2i64, nxv2f16, nxv2f32, nxv2f64, nxv2bf16] in { def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_31 i32:$index)))), (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; + let AddedComplexity = 1 in + def : Pat<(VT (vector_splice VT:$Z1, VT:$Z1, (i64 (sve_ext_imm_0_31 i32:$index)))), + (EXT_ZZI_CONSTRUCTIVE ZPR:$Z1, imm0_255:$index)>; + } defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>; defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>; diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll index 014eaff57ce81..72d839a21a29f 100644 --- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll +++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll @@ -36,7 +36,7 @@ define @splice_nxv16i8_last_idx( %a, @splice_nxv16i8_first_idx_unary( %a, %b) #0 { ; CHECK-LABEL: splice_nxv16i8_first_idx_unary: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: ext z0.b, z0.b, z1.b, #1 ; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv16i8( %b, %b, i32 1) @@ -55,7 +55,7 @@ define @splice_nxv8i16_first_idx( %a, @splice_nxv8i16_first_idx_unary( %a, %b) #0 { ; CHECK-LABEL: splice_nxv8i16_first_idx_unary: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: ext z0.b, z0.b, z1.b, #2 ; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv8i16( %b, %b, i32 1) @@ -83,7 +83,7 @@ define @splice_nxv4i32_last_idx( %a, @splice_nxv4i32_first_idx_unary( %a, %b) #0 { ; CHECK-LABEL: splice_nxv4i32_first_idx_unary: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: ext z0.b, z0.b, z1.b, #4 ; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv4i32( %b, %b, i32 1) @@ -111,7 +111,7 @@ define @splice_nxv2i64_last_idx( %a, @splice_nxv2i64_first_idx_unary( %a, %b) #0 { ; CHECK-LABEL: splice_nxv2i64_first_idx_unary: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: ext z0.b, z0.b, z1.b, #8 ; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv2i64( %b, %b, i32 1) @@ -173,7 +173,7 @@ define @splice_nxv2f16_last_idx( %a, @splice_nxv2f16_first_idx_unary( %a, %b) #0 { ; CHECK-LABEL: splice_nxv2f16_first_idx_unary: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: ext z0.b, z0.b, z1.b, #8 ; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv2f16( %b, %b, i32 1) @@ -235,7 +235,7 @@ define @splice_nxv4f16_last_idx( %a, @splice_nxv4f16_first_idx_unary( %a, %b) #0 { ; CHECK-LABEL: splice_nxv4f16_first_idx_unary: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: ext z0.b, z0.b, z1.b, #4 ; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv4f16( %b, %b, i32 1) @@ -263,7 +263,7 @@ define @splice_nxv8f16_last_idx( %a, @splice_nxv8f16_first_idx_unary( %a, %b) #0 { ; CHECK-LABEL: splice_nxv8f16_first_idx_unary: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: ext z0.b, z0.b, z1.b, #2 ; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv8f16( %b, %b, i32 1) @@ -325,7 +325,7 @@ define @splice_nxv2f32_last_idx( %a, @splice_nxv2f32_first_idx_unary( %a, %b) #0 { ; CHECK-LABEL: splice_nxv2f32_first_idx_unary: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: ext z0.b, z0.b, z1.b, #8 ; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv2f32( %b, %b, i32 1) @@ -353,7 +353,7 @@ define @splice_nxv4f32_last_idx( %a, @splice_nxv4f32_first_idx_unary( %a, %b) #0 { ; CHECK-LABEL: splice_nxv4f32_first_idx_unary: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: ext z0.b, z0.b, z1.b, #4 ; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv4f32( %b, %b, i32 1) @@ -381,7 +381,7 @@ define @splice_nxv2f64_last_idx( %a, define @splice_nxv2f64_first_idx_unary( %a, %b) #0 { ; CHECK-LABEL: splice_nxv2f64_first_idx_unary: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: ext z0.b, z0.b, z1.b, #8 ; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv2f64( %b, %b, i32 1) @@ -879,7 +879,7 @@ define @splice_nxv2bf16_last_idx( %a, define @splice_nxv2bf16_first_idx_unary( %a, %b) #0 { ; CHECK-LABEL: splice_nxv2bf16_first_idx_unary: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: ext z0.b, z0.b, z1.b, #8 ; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv2bf16( %b, %b, i32 1) @@ -941,7 +941,7 @@ define @splice_nxv4bf16_last_idx( %a, define @splice_nxv4bf16_first_idx_unary( %a, %b) #0 { ; CHECK-LABEL: splice_nxv4bf16_first_idx_unary: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: ext z0.b, z0.b, z1.b, #4 ; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv4bf16( %b, %b, i32 1) @@ -969,7 +969,7 @@ define @splice_nxv8bf16_last_idx( %a, define @splice_nxv8bf16_first_idx_unary( %a, %b) #0 { ; CHECK-LABEL: splice_nxv8bf16_first_idx_unary: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: ext z0.b, z0.b, z1.b, #2 ; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv8bf16( %b, %b, i32 1) diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll index 800f95d97af4c..7b438743487e1 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll @@ -50,7 +50,7 @@ define void @extract_v32i8_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range( ; CHECK-LABEL: extract_v32i8_halves: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr z0, [x0] -; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: movprfx z1, z0 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 ; CHECK-NEXT: str q1, [x1] ; CHECK-NEXT: str q0, [x2] @@ -68,7 +68,7 @@ define void @extract_v32i8_half_unaligned(ptr %in, ptr %out) #0 vscale_range(2,2 ; CHECK-LABEL: extract_v32i8_half_unaligned: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr z0, [x0] -; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: movprfx z1, z0 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 ; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #4 ; CHECK-NEXT: str q0, [x1] @@ -84,15 +84,16 @@ define void @extract_v32i8_quarters(ptr %in, ptr %out, ptr %out2, ptr %out3, ptr ; CHECK-LABEL: extract_v32i8_quarters: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr z0, [x0] -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: movprfx z1, z0 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 +; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: ext z2.b, z2.b, z0.b, #24 +; CHECK-NEXT: movprfx z3, z0 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: str d1, [x1] ; CHECK-NEXT: str d2, [x2] ; CHECK-NEXT: str d0, [x3] -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: str d0, [x4] +; CHECK-NEXT: str d3, [x4] ; CHECK-NEXT: ret entry: %b = load <32 x i8>, ptr %in @@ -126,7 +127,7 @@ define void @extract_v64i8_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range( ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr z0, [x0] ; CHECK-NEXT: ptrue p0.b, vl32 -; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: movprfx z1, z0 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #32 ; CHECK-NEXT: st1b { z1.b }, p0, [x1] ; CHECK-NEXT: st1b { z0.b }, p0, [x2] @@ -207,7 +208,7 @@ define void @extract_v16i16_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range ; CHECK-LABEL: extract_v16i16_halves: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr z0, [x0] -; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: movprfx z1, z0 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 ; CHECK-NEXT: str q1, [x1] ; CHECK-NEXT: str q0, [x2] @@ -240,7 +241,7 @@ define void @extract_v32i16_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr z0, [x0] ; CHECK-NEXT: ptrue p0.h, vl16 -; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: movprfx z1, z0 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #32 ; CHECK-NEXT: st1h { z1.h }, p0, [x1] ; CHECK-NEXT: st1h { z0.h }, p0, [x2] @@ -322,7 +323,7 @@ define void @extract_v8i32_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range( ; CHECK-LABEL: extract_v8i32_halves: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr z0, [x0] -; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: movprfx z1, z0 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 ; CHECK-NEXT: str q1, [x1] ; CHECK-NEXT: str q0, [x2] @@ -355,7 +356,7 @@ define void @extract_v16i32_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr z0, [x0] ; CHECK-NEXT: ptrue p0.s, vl8 -; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: movprfx z1, z0 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #32 ; CHECK-NEXT: st1w { z1.s }, p0, [x1] ; CHECK-NEXT: st1w { z0.s }, p0, [x2] @@ -426,7 +427,7 @@ define void @extract_v4i64_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range( ; CHECK-LABEL: extract_v4i64_halves: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr z0, [x0] -; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: movprfx z1, z0 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 ; CHECK-NEXT: str q1, [x1] ; CHECK-NEXT: str q0, [x2] @@ -459,7 +460,7 @@ define void @extract_v8i64_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range( ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr z0, [x0] ; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: movprfx z1, z0 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #32 ; CHECK-NEXT: st1d { z1.d }, p0, [x1] ; CHECK-NEXT: st1d { z0.d }, p0, [x2] @@ -553,7 +554,7 @@ define void @extract_v16half_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_rang ; CHECK-LABEL: extract_v16half_halves: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr z0, [x0] -; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: movprfx z1, z0 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 ; CHECK-NEXT: str q1, [x1] ; CHECK-NEXT: str q0, [x2] @@ -586,7 +587,7 @@ define void @extract_v32half_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_rang ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr z0, [x0] ; CHECK-NEXT: ptrue p0.h, vl16 -; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: movprfx z1, z0 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #32 ; CHECK-NEXT: st1h { z1.h }, p0, [x1] ; CHECK-NEXT: st1h { z0.h }, p0, [x2] @@ -668,7 +669,7 @@ define void @extract_v8float_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_rang ; CHECK-LABEL: extract_v8float_halves: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr z0, [x0] -; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: movprfx z1, z0 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 ; CHECK-NEXT: str q1, [x1] ; CHECK-NEXT: str q0, [x2] @@ -701,7 +702,7 @@ define void @extract_v16float_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_ran ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr z0, [x0] ; CHECK-NEXT: ptrue p0.s, vl8 -; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: movprfx z1, z0 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #32 ; CHECK-NEXT: st1w { z1.s }, p0, [x1] ; CHECK-NEXT: st1w { z0.s }, p0, [x2] @@ -772,7 +773,7 @@ define void @extract_v4double_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_ran ; CHECK-LABEL: extract_v4double_halves: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr z0, [x0] -; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: movprfx z1, z0 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 ; CHECK-NEXT: str q1, [x1] ; CHECK-NEXT: str q0, [x2] @@ -805,7 +806,7 @@ define void @extract_v8double_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_ran ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr z0, [x0] ; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: movprfx z1, z0 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #32 ; CHECK-NEXT: st1d { z1.d }, p0, [x1] ; CHECK-NEXT: st1d { z0.d }, p0, [x2] @@ -908,7 +909,7 @@ define void @extract_subvector_legalization_v8i32() vscale_range(2,2) #0 { ; CHECK-NEXT: add x8, x8, :lo12:.LCPI59_0 ; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ldr z0, [x8] -; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: movprfx z1, z0 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: cmeq v1.4s, v1.4s, #0 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll index af54b146c5b66..c8f6d98f5a63f 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll @@ -150,13 +150,14 @@ define void @fcvtzu_v16f16_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: movprfx z1, z0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_256-NEXT: fcvtzu z1.s, p0/m, z1.h +; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: fcvtzu z0.s, p0/m, z0.h -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: fcvtzu z1.s, p0/m, z1.h +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fcvtzu_v16f16_v16i32: @@ -551,13 +552,14 @@ define void @fcvtzu_v8f32_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 -; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: movprfx z1, z0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_256-NEXT: fcvtzu z1.d, p0/m, z1.s +; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s ; VBITS_GE_256-NEXT: fcvtzu z0.d, p0/m, z0.s -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: fcvtzu z1.d, p0/m, z1.s +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fcvtzu_v8f32_v8i64: @@ -1043,13 +1045,14 @@ define void @fcvtzs_v16f16_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: movprfx z1, z0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_256-NEXT: fcvtzs z1.s, p0/m, z1.h +; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: fcvtzs z0.s, p0/m, z0.h -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: fcvtzs z1.s, p0/m, z1.h +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fcvtzs_v16f16_v16i32: @@ -1444,13 +1447,14 @@ define void @fcvtzs_v8f32_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 -; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: movprfx z1, z0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_256-NEXT: fcvtzs z1.d, p0/m, z1.s +; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s ; VBITS_GE_256-NEXT: fcvtzs z0.d, p0/m, z0.s -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: fcvtzs z1.d, p0/m, z1.s +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fcvtzs_v8f32_v8i64: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll index 4feb86305f8f6..d2fa65599b973 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll @@ -77,11 +77,12 @@ define void @sext_v32i8_v32i16(ptr %in, ptr %out) #0 { ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: add z0.b, z0.b, z0.b -; VBITS_GE_256-NEXT: sunpklo z1.h, z0.b -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: movprfx z1, z0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: sunpklo z1.h, z1.b +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1, x8, lsl #1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: sext_v32i8_v32i16: @@ -326,11 +327,12 @@ define void @sext_v16i16_v16i32(ptr %in, ptr %out) #0 { ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: add z0.h, z0.h, z0.h -; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: movprfx z1, z0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: sext_v16i16_v16i32: @@ -490,11 +492,12 @@ define void @sext_v8i32_v8i64(ptr %in, ptr %out) #0 { ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: add z0.s, z0.s, z0.s -; VBITS_GE_256-NEXT: sunpklo z1.d, z0.s -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: movprfx z1, z0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: sext_v8i32_v8i64: @@ -573,11 +576,12 @@ define void @zext_v32i8_v32i16(ptr %in, ptr %out) #0 { ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: add z0.b, z0.b, z0.b -; VBITS_GE_256-NEXT: uunpklo z1.h, z0.b -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: movprfx z1, z0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: uunpklo z1.h, z1.b +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1, x8, lsl #1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: zext_v32i8_v32i16: @@ -822,11 +826,12 @@ define void @zext_v16i16_v16i32(ptr %in, ptr %out) #0 { ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: add z0.h, z0.h, z0.h -; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: movprfx z1, z0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: zext_v16i16_v16i32: @@ -986,11 +991,12 @@ define void @zext_v8i32_v8i64(ptr %in, ptr %out) #0 { ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: add z0.s, z0.s, z0.s -; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: movprfx z1, z0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: zext_v8i32_v8i64: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll index 2d78945399176..27be84419d59e 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll @@ -259,17 +259,17 @@ define void @srem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: sunpklo z2.s, z2.h ; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: sdivr z4.s, p1/m, z4.s, z5.s -; CHECK-NEXT: mov z5.d, z0.d +; CHECK-NEXT: movprfx z5, z0 ; CHECK-NEXT: ext z5.b, z5.b, z0.b, #128 ; CHECK-NEXT: sunpklo z5.h, z5.b ; CHECK-NEXT: sunpklo z7.s, z5.h ; CHECK-NEXT: ext z5.b, z5.b, z5.b, #128 -; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s -; CHECK-NEXT: mov z3.d, z1.d ; CHECK-NEXT: sunpklo z5.s, z5.h +; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s +; CHECK-NEXT: movprfx z3, z1 ; CHECK-NEXT: ext z3.b, z3.b, z1.b, #128 -; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h ; CHECK-NEXT: sunpklo z3.h, z3.b +; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h ; CHECK-NEXT: sunpklo z6.s, z3.h ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128 ; CHECK-NEXT: sunpklo z3.s, z3.h @@ -420,11 +420,11 @@ define void @srem_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z3.s, z0.h -; VBITS_GE_256-NEXT: mov z4.d, z0.d +; VBITS_GE_256-NEXT: movprfx z4, z0 ; VBITS_GE_256-NEXT: ext z4.b, z4.b, z0.b, #16 -; VBITS_GE_256-NEXT: sdivr z2.s, p1/m, z2.s, z3.s -; VBITS_GE_256-NEXT: mov z3.d, z1.d ; VBITS_GE_256-NEXT: sunpklo z4.s, z4.h +; VBITS_GE_256-NEXT: sdivr z2.s, p1/m, z2.s, z3.s +; VBITS_GE_256-NEXT: movprfx z3, z1 ; VBITS_GE_256-NEXT: ext z3.b, z3.b, z1.b, #16 ; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h ; VBITS_GE_256-NEXT: sdivr z3.s, p1/m, z3.s, z4.s @@ -507,11 +507,11 @@ define void @srem_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: sunpklo z2.s, z1.h ; CHECK-NEXT: sunpklo z3.s, z0.h -; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: movprfx z4, z0 ; CHECK-NEXT: ext z4.b, z4.b, z0.b, #128 -; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s -; CHECK-NEXT: mov z3.d, z1.d ; CHECK-NEXT: sunpklo z4.s, z4.h +; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s +; CHECK-NEXT: movprfx z3, z1 ; CHECK-NEXT: ext z3.b, z3.b, z1.b, #128 ; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: sdivr z3.s, p1/m, z3.s, z4.s @@ -1077,17 +1077,17 @@ define void @urem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: udivr z4.s, p1/m, z4.s, z5.s -; CHECK-NEXT: mov z5.d, z0.d +; CHECK-NEXT: movprfx z5, z0 ; CHECK-NEXT: ext z5.b, z5.b, z0.b, #128 ; CHECK-NEXT: uunpklo z5.h, z5.b ; CHECK-NEXT: uunpklo z7.s, z5.h ; CHECK-NEXT: ext z5.b, z5.b, z5.b, #128 -; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s -; CHECK-NEXT: mov z3.d, z1.d ; CHECK-NEXT: uunpklo z5.s, z5.h +; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s +; CHECK-NEXT: movprfx z3, z1 ; CHECK-NEXT: ext z3.b, z3.b, z1.b, #128 -; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h ; CHECK-NEXT: uunpklo z3.h, z3.b +; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h ; CHECK-NEXT: uunpklo z6.s, z3.h ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128 ; CHECK-NEXT: uunpklo z3.s, z3.h @@ -1238,11 +1238,11 @@ define void @urem_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: uunpklo z2.s, z1.h ; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h -; VBITS_GE_256-NEXT: mov z4.d, z0.d +; VBITS_GE_256-NEXT: movprfx z4, z0 ; VBITS_GE_256-NEXT: ext z4.b, z4.b, z0.b, #16 -; VBITS_GE_256-NEXT: udivr z2.s, p1/m, z2.s, z3.s -; VBITS_GE_256-NEXT: mov z3.d, z1.d ; VBITS_GE_256-NEXT: uunpklo z4.s, z4.h +; VBITS_GE_256-NEXT: udivr z2.s, p1/m, z2.s, z3.s +; VBITS_GE_256-NEXT: movprfx z3, z1 ; VBITS_GE_256-NEXT: ext z3.b, z3.b, z1.b, #16 ; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h ; VBITS_GE_256-NEXT: udivr z3.s, p1/m, z3.s, z4.s @@ -1325,11 +1325,11 @@ define void @urem_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: uunpklo z2.s, z1.h ; CHECK-NEXT: uunpklo z3.s, z0.h -; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: movprfx z4, z0 ; CHECK-NEXT: ext z4.b, z4.b, z0.b, #128 -; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s -; CHECK-NEXT: mov z3.d, z1.d ; CHECK-NEXT: uunpklo z4.s, z4.h +; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s +; CHECK-NEXT: movprfx z3, z1 ; CHECK-NEXT: ext z3.b, z3.b, z1.b, #128 ; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: udivr z3.s, p1/m, z3.s, z4.s diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll index 1d9e01f4ecfdf..5753e5972f9c8 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll @@ -150,13 +150,14 @@ define void @ucvtf_v16i16_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: movprfx z1, z0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_256-NEXT: ucvtf z1.s, p0/m, z1.s +; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: ucvtf z0.s, p0/m, z0.s -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ucvtf z1.s, p0/m, z1.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: ucvtf_v16i16_v16f32: @@ -554,13 +555,14 @@ define void @ucvtf_v8i32_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 -; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: movprfx z1, z0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_256-NEXT: ucvtf z1.d, p0/m, z1.d +; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s ; VBITS_GE_256-NEXT: ucvtf z0.d, p0/m, z0.d -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ucvtf z1.d, p0/m, z1.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: ucvtf_v8i32_v8f64: @@ -1063,13 +1065,14 @@ define void @scvtf_v16i16_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: movprfx z1, z0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h -; VBITS_GE_256-NEXT: scvtf z1.s, p0/m, z1.s +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: scvtf z0.s, p0/m, z0.s -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: scvtf z1.s, p0/m, z1.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: scvtf_v16i16_v16f32: @@ -1479,13 +1482,14 @@ define void @scvtf_v8i32_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 -; VBITS_GE_256-NEXT: sunpklo z1.d, z0.s -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: movprfx z1, z0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_256-NEXT: scvtf z1.d, p0/m, z1.d +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s ; VBITS_GE_256-NEXT: scvtf z0.d, p0/m, z0.d -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: scvtf z1.d, p0/m, z1.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: scvtf_v8i32_v8f64: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-limit-duplane.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-limit-duplane.ll index 78b41f71f0ea2..8b584c448bbae 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-limit-duplane.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-limit-duplane.ll @@ -8,10 +8,11 @@ define <4 x i32> @test(ptr %arg1, ptr %arg2) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: mov x8, #8 // =0x8 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, x8, lsl #2] ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] -; CHECK-NEXT: add z1.s, z0.s, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #16 +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: ext z0.b, z0.b, z1.b, #16 +; CHECK-NEXT: add z1.s, z1.s, z1.s ; CHECK-NEXT: add z2.s, z2.s, z2.s ; CHECK-NEXT: dup v0.4s, v0.s[2] ; CHECK-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2] @@ -31,10 +32,11 @@ define <2 x i32> @test2(ptr %arg1, ptr %arg2) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: mov x8, #8 // =0x8 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, x8, lsl #2] ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] -; CHECK-NEXT: add z1.s, z0.s, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #24 +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: ext z0.b, z0.b, z1.b, #24 +; CHECK-NEXT: add z1.s, z1.s, z1.s ; CHECK-NEXT: add z2.s, z2.s, z2.s ; CHECK-NEXT: dup v0.2s, v0.s[0] ; CHECK-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll index 34dc0bb5ef2d2..864a9f7987bdb 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll @@ -375,11 +375,12 @@ define void @masked_load_sext_v32i8i16(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: cmpeq p0.b, p0/z, z0.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: sunpklo z1.h, z0.b -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: movprfx z1, z0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2] -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1] +; VBITS_GE_256-NEXT: sunpklo z1.h, z1.b +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2, x8, lsl #1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_sext_v32i8i16: @@ -476,11 +477,12 @@ define void @masked_load_sext_v16i16i32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: cmpeq p0.h, p0/z, z0.h, #0 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: movprfx z1, z0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2, x8, lsl #2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_sext_v16i16i32: @@ -542,11 +544,12 @@ define void @masked_load_sext_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 -; VBITS_GE_256-NEXT: sunpklo z1.d, z0.s -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: movprfx z1, z0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x8, lsl #3] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_sext_v8i32i64: @@ -574,11 +577,12 @@ define void @masked_load_zext_v32i8i16(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: cmpeq p0.b, p0/z, z0.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: uunpklo z1.h, z0.b -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: movprfx z1, z0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2] -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1] +; VBITS_GE_256-NEXT: uunpklo z1.h, z1.b +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2, x8, lsl #1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_zext_v32i8i16: @@ -675,11 +679,12 @@ define void @masked_load_zext_v16i16i32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: cmpeq p0.h, p0/z, z0.h, #0 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: movprfx z1, z0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] +; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2, x8, lsl #2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_zext_v16i16i32: @@ -741,11 +746,12 @@ define void @masked_load_zext_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 -; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: movprfx z1, z0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] +; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x8, lsl #3] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_zext_v8i32i64: @@ -782,11 +788,12 @@ define void @masked_load_sext_v32i8i16_m16(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: ptrue p1.b, vl32 ; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z1.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0] -; VBITS_GE_256-NEXT: sunpklo z1.h, z0.b -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: movprfx z1, z0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2] -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1] +; VBITS_GE_256-NEXT: sunpklo z1.h, z1.b +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2, x8, lsl #1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_sext_v32i8i16_m16: @@ -915,11 +922,12 @@ define void @masked_load_sext_v16i16i32_m32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: sunpklo z0.h, z1.b ; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z0.h, #0 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p1/z, [x0] -; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: movprfx z1, z0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2, x8, lsl #2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_sext_v16i16i32_m32: @@ -1000,11 +1008,12 @@ define void @masked_load_sext_v8i32i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z1.s, #0 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0] -; VBITS_GE_256-NEXT: sunpklo z1.d, z0.s -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: movprfx z1, z0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x8, lsl #3] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_sext_v8i32i64_m64: @@ -1041,11 +1050,12 @@ define void @masked_load_zext_v32i8i16_m16(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: ptrue p1.b, vl32 ; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z1.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0] -; VBITS_GE_256-NEXT: uunpklo z1.h, z0.b -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: movprfx z1, z0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2] -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1] +; VBITS_GE_256-NEXT: uunpklo z1.h, z1.b +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2, x8, lsl #1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_zext_v32i8i16_m16: @@ -1174,11 +1184,12 @@ define void @masked_load_zext_v16i16i32_m32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: sunpklo z0.h, z1.b ; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z0.h, #0 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p1/z, [x0] -; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: movprfx z1, z0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] +; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2, x8, lsl #2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_zext_v16i16i32_m32: @@ -1259,11 +1270,12 @@ define void @masked_load_zext_v8i32i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z1.s, #0 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0] -; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: movprfx z1, z0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] +; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x8, lsl #3] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_zext_v8i32i64_m64: @@ -1495,11 +1507,12 @@ define void @masked_load_sext_ugt_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: cmpne p0.s, p0/z, z0.s, #0 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 -; VBITS_GE_256-NEXT: sunpklo z1.d, z0.s -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: movprfx z1, z0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x8, lsl #3] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_sext_ugt_v8i32i64: @@ -1527,11 +1540,12 @@ define void @masked_load_zext_sgt_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: cmpgt p0.s, p0/z, z0.s, #0 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 -; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: movprfx z1, z0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] +; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x8, lsl #3] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_load_zext_sgt_v8i32i64: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll index 33d5ac4cd299e..cc19f6c2cbbc8 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll @@ -100,7 +100,7 @@ define <16 x i16> @two_way_i8_i16_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal ; SVE-NEXT: uunpkhi z1.h, z1.b ; SVE-NEXT: mad z2.h, p0/m, z3.h, z4.h ; SVE-NEXT: mad z0.h, p0/m, z1.h, z2.h -; SVE-NEXT: mov z1.d, z0.d +; SVE-NEXT: movprfx z1, z0 ; SVE-NEXT: ext z1.b, z1.b, z0.b, #16 ; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE-NEXT: // kill: def $q1 killed $q1 killed $z1 @@ -113,7 +113,7 @@ define <16 x i16> @two_way_i8_i16_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal ; SME-NEXT: ldr z2, [x2] ; SME-NEXT: umlalb z0.h, z2.b, z1.b ; SME-NEXT: umlalt z0.h, z2.b, z1.b -; SME-NEXT: mov z1.d, z0.d +; SME-NEXT: movprfx z1, z0 ; SME-NEXT: ext z1.b, z1.b, z0.b, #16 ; SME-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SME-NEXT: // kill: def $q1 killed $q1 killed $z1 @@ -223,7 +223,7 @@ define <8 x i32> @two_way_i16_i32_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal ; SVE-NEXT: uunpkhi z1.s, z1.h ; SVE-NEXT: mad z2.s, p0/m, z3.s, z4.s ; SVE-NEXT: mad z0.s, p0/m, z1.s, z2.s -; SVE-NEXT: mov z1.d, z0.d +; SVE-NEXT: movprfx z1, z0 ; SVE-NEXT: ext z1.b, z1.b, z0.b, #16 ; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE-NEXT: // kill: def $q1 killed $q1 killed $z1 @@ -236,7 +236,7 @@ define <8 x i32> @two_way_i16_i32_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal ; SME-NEXT: ldr z2, [x2] ; SME-NEXT: umlalb z0.s, z2.h, z1.h ; SME-NEXT: umlalt z0.s, z2.h, z1.h -; SME-NEXT: mov z1.d, z0.d +; SME-NEXT: movprfx z1, z0 ; SME-NEXT: ext z1.b, z1.b, z0.b, #16 ; SME-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SME-NEXT: // kill: def $q1 killed $q1 killed $z1 @@ -346,7 +346,7 @@ define <4 x i64> @two_way_i32_i64_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal ; SVE-NEXT: uunpkhi z1.d, z1.s ; SVE-NEXT: mad z2.d, p0/m, z3.d, z4.d ; SVE-NEXT: mad z0.d, p0/m, z1.d, z2.d -; SVE-NEXT: mov z1.d, z0.d +; SVE-NEXT: movprfx z1, z0 ; SVE-NEXT: ext z1.b, z1.b, z0.b, #16 ; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE-NEXT: // kill: def $q1 killed $q1 killed $z1 @@ -359,7 +359,7 @@ define <4 x i64> @two_way_i32_i64_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal ; SME-NEXT: ldr z2, [x2] ; SME-NEXT: umlalb z0.d, z2.s, z1.s ; SME-NEXT: umlalt z0.d, z2.s, z1.s -; SME-NEXT: mov z1.d, z0.d +; SME-NEXT: movprfx z1, z0 ; SME-NEXT: ext z1.b, z1.b, z0.b, #16 ; SME-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SME-NEXT: // kill: def $q1 killed $q1 killed $z1 @@ -635,7 +635,7 @@ define <8 x i32> @four_way_i8_i32_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal ; SVE-NEXT: ldr z1, [x1] ; SVE-NEXT: ldr z2, [x2] ; SVE-NEXT: udot z0.s, z2.b, z1.b -; SVE-NEXT: mov z1.d, z0.d +; SVE-NEXT: movprfx z1, z0 ; SVE-NEXT: ext z1.b, z1.b, z0.b, #16 ; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE-NEXT: // kill: def $q1 killed $q1 killed $z1 @@ -647,7 +647,7 @@ define <8 x i32> @four_way_i8_i32_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal ; SME-NEXT: ldr z1, [x1] ; SME-NEXT: ldr z2, [x2] ; SME-NEXT: udot z0.s, z2.b, z1.b -; SME-NEXT: mov z1.d, z0.d +; SME-NEXT: movprfx z1, z0 ; SME-NEXT: ext z1.b, z1.b, z0.b, #16 ; SME-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SME-NEXT: // kill: def $q1 killed $q1 killed $z1 @@ -680,7 +680,7 @@ define <8 x i32> @four_way_i8_i32_vl256_usdot(ptr %accptr, ptr %uptr, ptr %sptr) ; SVE-NEXT: ldr z1, [x1] ; SVE-NEXT: ldr z2, [x2] ; SVE-NEXT: usdot z0.s, z1.b, z2.b -; SVE-NEXT: mov z1.d, z0.d +; SVE-NEXT: movprfx z1, z0 ; SVE-NEXT: ext z1.b, z1.b, z0.b, #16 ; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE-NEXT: // kill: def $q1 killed $q1 killed $z1 @@ -692,7 +692,7 @@ define <8 x i32> @four_way_i8_i32_vl256_usdot(ptr %accptr, ptr %uptr, ptr %sptr) ; SME-NEXT: ldr z1, [x1] ; SME-NEXT: ldr z2, [x2] ; SME-NEXT: usdot z0.s, z1.b, z2.b -; SME-NEXT: mov z1.d, z0.d +; SME-NEXT: movprfx z1, z0 ; SME-NEXT: ext z1.b, z1.b, z0.b, #16 ; SME-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SME-NEXT: // kill: def $q1 killed $q1 killed $z1 @@ -813,7 +813,7 @@ define <4 x i64> @four_way_i16_i64_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vsca ; SVE-NEXT: ldr z1, [x1] ; SVE-NEXT: ldr z2, [x2] ; SVE-NEXT: udot z0.d, z2.h, z1.h -; SVE-NEXT: mov z1.d, z0.d +; SVE-NEXT: movprfx z1, z0 ; SVE-NEXT: ext z1.b, z1.b, z0.b, #16 ; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE-NEXT: // kill: def $q1 killed $q1 killed $z1 @@ -825,7 +825,7 @@ define <4 x i64> @four_way_i16_i64_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vsca ; SME-NEXT: ldr z1, [x1] ; SME-NEXT: ldr z2, [x2] ; SME-NEXT: udot z0.d, z2.h, z1.h -; SME-NEXT: mov z1.d, z0.d +; SME-NEXT: movprfx z1, z0 ; SME-NEXT: ext z1.b, z1.b, z0.b, #16 ; SME-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SME-NEXT: // kill: def $q1 killed $q1 killed $z1 @@ -984,7 +984,7 @@ define <4 x i64> @four_way_i8_i64_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal ; SVE-NEXT: uunpkhi z0.d, z0.s ; SVE-NEXT: add z1.d, z2.d, z1.d ; SVE-NEXT: add z0.d, z1.d, z0.d -; SVE-NEXT: mov z1.d, z0.d +; SVE-NEXT: movprfx z1, z0 ; SVE-NEXT: ext z1.b, z1.b, z0.b, #16 ; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SVE-NEXT: // kill: def $q1 killed $q1 killed $z1 @@ -999,7 +999,7 @@ define <4 x i64> @four_way_i8_i64_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal ; SME-NEXT: ldr z0, [x0] ; SME-NEXT: uaddwb z0.d, z0.d, z2.s ; SME-NEXT: uaddwt z0.d, z0.d, z2.s -; SME-NEXT: mov z1.d, z0.d +; SME-NEXT: movprfx z1, z0 ; SME-NEXT: ext z1.b, z1.b, z0.b, #16 ; SME-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SME-NEXT: // kill: def $q1 killed $q1 killed $z1 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll index 2eff6da0866f8..ba4a3a2042305 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll @@ -12,8 +12,8 @@ define void @hang_when_merging_stores_after_legalisation(ptr %a, <2 x i32> %b) v ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov z0.s, s0 -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #16 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 ; CHECK-NEXT: st2 { v0.4s, v1.4s }, [x0] ; CHECK-NEXT: ret %splat = shufflevector <2 x i32> %b, <2 x i32> poison, <8 x i32> zeroinitializer @@ -33,6 +33,9 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: umov w8, v0.b[8] ; CHECK-NEXT: mov v1.b[1], v0.b[1] +; CHECK-NEXT: movprfx z3, z0 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #16 +; CHECK-NEXT: ext v4.16b, v3.16b, v3.16b, #8 ; CHECK-NEXT: fmov s2, w8 ; CHECK-NEXT: mov v1.b[2], v0.b[2] ; CHECK-NEXT: mov v2.b[1], v0.b[9] @@ -48,32 +51,30 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang ; CHECK-NEXT: mov v2.b[6], v0.b[14] ; CHECK-NEXT: uunpklo z1.h, z1.b ; CHECK-NEXT: mov v2.b[7], v0.b[15] -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #16 +; CHECK-NEXT: uunpklo z0.h, z3.b +; CHECK-NEXT: uunpklo z3.h, z4.b ; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: uunpklo z2.h, z2.b -; CHECK-NEXT: lsl z1.s, z1.s, #31 -; CHECK-NEXT: uunpklo z3.h, z3.b ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z2.s, z2.h -; CHECK-NEXT: asr z1.s, z1.s, #31 ; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: lsl z1.s, z1.s, #31 +; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: lsl z0.s, z0.s, #31 -; CHECK-NEXT: cmpne p1.s, p0/z, z1.s, #0 -; CHECK-NEXT: lsl z2.s, z2.s, #31 -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: asr z0.s, z0.s, #31 ; CHECK-NEXT: lsl z3.s, z3.s, #31 -; CHECK-NEXT: asr z2.s, z2.s, #31 -; CHECK-NEXT: st1w { z1.s }, p1, [x0] -; CHECK-NEXT: cmpne p2.s, p0/z, z0.s, #0 +; CHECK-NEXT: asr z1.s, z1.s, #31 +; CHECK-NEXT: asr z0.s, z0.s, #31 ; CHECK-NEXT: asr z3.s, z3.s, #31 -; CHECK-NEXT: cmpne p3.s, p0/z, z3.s, #0 +; CHECK-NEXT: lsl z2.s, z2.s, #31 +; CHECK-NEXT: cmpne p3.s, p0/z, z1.s, #0 +; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: cmpne p2.s, p0/z, z3.s, #0 +; CHECK-NEXT: asr z2.s, z2.s, #31 ; CHECK-NEXT: cmpne p0.s, p0/z, z2.s, #0 -; CHECK-NEXT: st1w { z1.s }, p2, [x0, #2, mul vl] -; CHECK-NEXT: st1w { z1.s }, p3, [x0, #3, mul vl] -; CHECK-NEXT: st1w { z1.s }, p0, [x0, #1, mul vl] +; CHECK-NEXT: st1w { z0.s }, p1, [x0, #2, mul vl] +; CHECK-NEXT: st1w { z0.s }, p2, [x0, #3, mul vl] +; CHECK-NEXT: st1w { z0.s }, p3, [x0] +; CHECK-NEXT: st1w { z0.s }, p0, [x0, #1, mul vl] ; CHECK-NEXT: .LBB1_2: // %exit ; CHECK-NEXT: ret %broadcast.splat = shufflevector <32 x i1> zeroinitializer, <32 x i1> zeroinitializer, <32 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-vector-llrint.ll b/llvm/test/CodeGen/AArch64/sve-fixed-vector-llrint.ll index 5e934bd95e995..f1d5813433489 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-vector-llrint.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-vector-llrint.ll @@ -921,7 +921,7 @@ define <4 x i64> @llrint_v4i64_v4fp128(<4 x fp128> %x) nounwind { ; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ldr z1, [x8] // 16-byte Folded Reload ; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d -; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: movprfx z1, z0 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 @@ -993,12 +993,12 @@ define <8 x i64> @llrint_v8i64_v8fp128(<8 x fp128> %x) nounwind { ; CHECK-NEXT: ldr z2, [x8, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ldr z1, [x8] // 16-byte Folded Reload -; CHECK-NEXT: mov z3.d, z2.d -; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d +; CHECK-NEXT: movprfx z3, z2 ; CHECK-NEXT: ext z3.b, z3.b, z2.b, #16 ; CHECK-NEXT: // kill: def $q2 killed $q2 killed $z2 +; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d ; CHECK-NEXT: // kill: def $q3 killed $q3 killed $z3 -; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: movprfx z1, z0 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 @@ -1146,20 +1146,20 @@ define <16 x i64> @llrint_v16fp128(<16 x fp128> %x) nounwind { ; CHECK-NEXT: ldr z6, [x8, #3, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ldr z1, [x8] // 16-byte Folded Reload -; CHECK-NEXT: mov z3.d, z2.d -; CHECK-NEXT: mov z5.d, z4.d -; CHECK-NEXT: mov z7.d, z6.d -; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d +; CHECK-NEXT: movprfx z3, z2 ; CHECK-NEXT: ext z3.b, z3.b, z2.b, #16 +; CHECK-NEXT: movprfx z5, z4 ; CHECK-NEXT: ext z5.b, z5.b, z4.b, #16 -; CHECK-NEXT: ext z7.b, z7.b, z6.b, #16 ; CHECK-NEXT: // kill: def $q2 killed $q2 killed $z2 ; CHECK-NEXT: // kill: def $q4 killed $q4 killed $z4 +; CHECK-NEXT: movprfx z7, z6 +; CHECK-NEXT: ext z7.b, z7.b, z6.b, #16 +; CHECK-NEXT: // kill: def $q6 killed $q6 killed $z6 +; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d ; CHECK-NEXT: // kill: def $q3 killed $q3 killed $z3 ; CHECK-NEXT: // kill: def $q5 killed $q5 killed $z5 -; CHECK-NEXT: // kill: def $q6 killed $q6 killed $z6 ; CHECK-NEXT: // kill: def $q7 killed $q7 killed $z7 -; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: movprfx z1, z0 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-vector-lrint.ll b/llvm/test/CodeGen/AArch64/sve-fixed-vector-lrint.ll index 90f9eb6aa0a10..62a3fa7f29bb5 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-vector-lrint.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-vector-lrint.ll @@ -1704,7 +1704,7 @@ define <4 x iXLen> @lrint_v4fp128(<4 x fp128> %x) nounwind { ; CHECK-i64-NEXT: mov v0.d[1], v1.d[0] ; CHECK-i64-NEXT: ldr z1, [x8] // 16-byte Folded Reload ; CHECK-i64-NEXT: splice z0.d, p0, z0.d, z1.d -; CHECK-i64-NEXT: mov z1.d, z0.d +; CHECK-i64-NEXT: movprfx z1, z0 ; CHECK-i64-NEXT: ext z1.b, z1.b, z0.b, #16 ; CHECK-i64-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-i64-NEXT: // kill: def $q1 killed $q1 killed $z1 @@ -1825,12 +1825,12 @@ define <8 x iXLen> @lrint_v8fp128(<8 x fp128> %x) nounwind { ; CHECK-i64-NEXT: ldr z2, [x8, #1, mul vl] // 16-byte Folded Reload ; CHECK-i64-NEXT: mov v0.d[1], v1.d[0] ; CHECK-i64-NEXT: ldr z1, [x8] // 16-byte Folded Reload -; CHECK-i64-NEXT: mov z3.d, z2.d -; CHECK-i64-NEXT: splice z0.d, p0, z0.d, z1.d +; CHECK-i64-NEXT: movprfx z3, z2 ; CHECK-i64-NEXT: ext z3.b, z3.b, z2.b, #16 ; CHECK-i64-NEXT: // kill: def $q2 killed $q2 killed $z2 +; CHECK-i64-NEXT: splice z0.d, p0, z0.d, z1.d ; CHECK-i64-NEXT: // kill: def $q3 killed $q3 killed $z3 -; CHECK-i64-NEXT: mov z1.d, z0.d +; CHECK-i64-NEXT: movprfx z1, z0 ; CHECK-i64-NEXT: ext z1.b, z1.b, z0.b, #16 ; CHECK-i64-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-i64-NEXT: // kill: def $q1 killed $q1 killed $z1 @@ -2082,20 +2082,20 @@ define <16 x iXLen> @lrint_v16fp128(<16 x fp128> %x) nounwind { ; CHECK-i64-NEXT: ldr z6, [x8, #3, mul vl] // 16-byte Folded Reload ; CHECK-i64-NEXT: mov v0.d[1], v1.d[0] ; CHECK-i64-NEXT: ldr z1, [x8] // 16-byte Folded Reload -; CHECK-i64-NEXT: mov z3.d, z2.d -; CHECK-i64-NEXT: mov z5.d, z4.d -; CHECK-i64-NEXT: mov z7.d, z6.d -; CHECK-i64-NEXT: splice z0.d, p0, z0.d, z1.d +; CHECK-i64-NEXT: movprfx z3, z2 ; CHECK-i64-NEXT: ext z3.b, z3.b, z2.b, #16 +; CHECK-i64-NEXT: movprfx z5, z4 ; CHECK-i64-NEXT: ext z5.b, z5.b, z4.b, #16 -; CHECK-i64-NEXT: ext z7.b, z7.b, z6.b, #16 ; CHECK-i64-NEXT: // kill: def $q2 killed $q2 killed $z2 ; CHECK-i64-NEXT: // kill: def $q4 killed $q4 killed $z4 +; CHECK-i64-NEXT: movprfx z7, z6 +; CHECK-i64-NEXT: ext z7.b, z7.b, z6.b, #16 +; CHECK-i64-NEXT: // kill: def $q6 killed $q6 killed $z6 +; CHECK-i64-NEXT: splice z0.d, p0, z0.d, z1.d ; CHECK-i64-NEXT: // kill: def $q3 killed $q3 killed $z3 ; CHECK-i64-NEXT: // kill: def $q5 killed $q5 killed $z5 -; CHECK-i64-NEXT: // kill: def $q6 killed $q6 killed $z6 ; CHECK-i64-NEXT: // kill: def $q7 killed $q7 killed $z7 -; CHECK-i64-NEXT: mov z1.d, z0.d +; CHECK-i64-NEXT: movprfx z1, z0 ; CHECK-i64-NEXT: ext z1.b, z1.b, z0.b, #16 ; CHECK-i64-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-i64-NEXT: // kill: def $q1 killed $q1 killed $z1 diff --git a/llvm/test/CodeGen/AArch64/sve-pr92779.ll b/llvm/test/CodeGen/AArch64/sve-pr92779.ll index 3f34d79b3bb49..bce2d4bede385 100644 --- a/llvm/test/CodeGen/AArch64/sve-pr92779.ll +++ b/llvm/test/CodeGen/AArch64/sve-pr92779.ll @@ -5,16 +5,16 @@ define void @main(ptr %0) { ; CHECK-LABEL: main: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: ptrue p0.d, vl1 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uzp1 v0.2s, v1.2s, v0.2s -; CHECK-NEXT: neg v0.2s, v0.2s -; CHECK-NEXT: smov x8, v0.s[0] -; CHECK-NEXT: smov x9, v0.s[1] -; CHECK-NEXT: mov z1.d, p0/m, x8 -; CHECK-NEXT: mov z1.d, p0/m, x9 -; CHECK-NEXT: str z1, [x0] +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 +; CHECK-NEXT: uzp1 v1.2s, v0.2s, v1.2s +; CHECK-NEXT: neg v1.2s, v1.2s +; CHECK-NEXT: smov x8, v1.s[0] +; CHECK-NEXT: smov x9, v1.s[1] +; CHECK-NEXT: mov z0.d, p0/m, x8 +; CHECK-NEXT: mov z0.d, p0/m, x9 +; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: ret "entry": %1 = bitcast zeroinitializer to diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll index 56149e99b15f8..6951a6cd50ef6 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll @@ -75,13 +75,14 @@ define void @fcvt_v8f16_to_v8f32(<8 x half> %a, ptr %b) { ; CHECK-LABEL: fcvt_v8f16_to_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: uunpklo z1.s, z0.h +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: fcvt z1.s, p0/m, z1.h +; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: fcvt z0.s, p0/m, z0.h -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: fcvt z1.s, p0/m, z1.h +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvt_v8f16_to_v8f32: @@ -124,19 +125,21 @@ define void @fcvt_v16f16_to_v16f32(<16 x half> %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: uunpklo z2.s, z1.h -; CHECK-NEXT: uunpklo z3.s, z0.h +; CHECK-NEXT: movprfx z2, z1 +; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 +; CHECK-NEXT: movprfx z3, z0 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: fcvt z2.s, p0/m, z2.h -; CHECK-NEXT: fcvt z3.s, p0/m, z3.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: fcvt z1.s, p0/m, z1.h ; CHECK-NEXT: fcvt z0.s, p0/m, z0.h -; CHECK-NEXT: stp q3, q0, [x0] -; CHECK-NEXT: stp q2, q1, [x0, #32] +; CHECK-NEXT: fcvt z2.s, p0/m, z2.h +; CHECK-NEXT: fcvt z3.s, p0/m, z3.h +; CHECK-NEXT: stp q0, q3, [x0] +; CHECK-NEXT: stp q1, q2, [x0, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvt_v16f16_to_v16f32: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll index 94d756a36ab92..66e157c779abf 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll @@ -262,12 +262,13 @@ define void @fcvtzu_v8f16_v8i32(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: uunpklo z1.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: fcvtzu z1.s, p0/m, z1.h +; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h -; CHECK-NEXT: stp q1, q0, [x1] +; CHECK-NEXT: fcvtzu z1.s, p0/m, z1.h +; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i32: @@ -320,18 +321,20 @@ define void @fcvtzu_v16f16_v16i32(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: uunpklo z2.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z3.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: fcvtzu z2.s, p0/m, z2.h -; CHECK-NEXT: fcvtzu z3.s, p0/m, z3.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h ; CHECK-NEXT: fcvtzu z1.s, p0/m, z1.h -; CHECK-NEXT: stp q2, q0, [x1, #32] -; CHECK-NEXT: stp q3, q1, [x1] +; CHECK-NEXT: fcvtzu z2.s, p0/m, z2.h +; CHECK-NEXT: fcvtzu z3.s, p0/m, z3.h +; CHECK-NEXT: stp q1, q3, [x1] +; CHECK-NEXT: stp q0, q2, [x1, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i32: @@ -521,27 +524,27 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z1.d, z0.d ; CHECK-NEXT: mov z2.h, z0.h[3] ; CHECK-NEXT: mov z3.h, z0.h[2] ; CHECK-NEXT: mov z4.h, z0.h[1] +; CHECK-NEXT: movprfx z1, z0 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 +; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.h ; CHECK-NEXT: fcvtzu z2.d, p0/m, z2.h ; CHECK-NEXT: fcvtzu z3.d, p0/m, z3.h -; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.h ; CHECK-NEXT: fcvtzu z4.d, p0/m, z4.h ; CHECK-NEXT: mov z5.h, z1.h[3] ; CHECK-NEXT: mov z6.h, z1.h[2] ; CHECK-NEXT: mov z7.h, z1.h[1] ; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.h -; CHECK-NEXT: zip1 z2.d, z3.d, z2.d -; CHECK-NEXT: zip1 z0.d, z0.d, z4.d ; CHECK-NEXT: fcvtzu z5.d, p0/m, z5.h ; CHECK-NEXT: fcvtzu z6.d, p0/m, z6.h ; CHECK-NEXT: fcvtzu z7.d, p0/m, z7.h -; CHECK-NEXT: stp q0, q2, [x1] +; CHECK-NEXT: zip1 z2.d, z3.d, z2.d +; CHECK-NEXT: zip1 z0.d, z0.d, z4.d ; CHECK-NEXT: zip1 z3.d, z6.d, z5.d ; CHECK-NEXT: zip1 z1.d, z1.d, z7.d +; CHECK-NEXT: stp q0, q2, [x1] ; CHECK-NEXT: stp q1, q3, [x1, #32] ; CHECK-NEXT: ret ; @@ -597,52 +600,53 @@ define void @fcvtzu_v16f16_v16i64(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z3.h, z1.h[1] -; CHECK-NEXT: mov z5.h, z0.h[3] -; CHECK-NEXT: mov z6.h, z0.h[2] -; CHECK-NEXT: mov z16.d, z0.d -; CHECK-NEXT: movprfx z2, z1 -; CHECK-NEXT: fcvtzu z2.d, p0/m, z1.h -; CHECK-NEXT: mov z4.h, z1.h[3] -; CHECK-NEXT: mov z7.h, z1.h[2] +; CHECK-NEXT: mov z2.h, z1.h[1] +; CHECK-NEXT: mov z3.h, z0.h[3] +; CHECK-NEXT: mov z4.h, z0.h[2] +; CHECK-NEXT: movprfx z5, z1 +; CHECK-NEXT: ext z5.b, z5.b, z1.b, #8 +; CHECK-NEXT: movprfx z7, z1 +; CHECK-NEXT: fcvtzu z7.d, p0/m, z1.h +; CHECK-NEXT: mov z16.h, z1.h[3] +; CHECK-NEXT: mov z1.h, z1.h[2] ; CHECK-NEXT: mov z17.h, z0.h[1] -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: movprfx z6, z0 +; CHECK-NEXT: ext z6.b, z6.b, z0.b, #8 +; CHECK-NEXT: fcvtzu z2.d, p0/m, z2.h ; CHECK-NEXT: fcvtzu z3.d, p0/m, z3.h -; CHECK-NEXT: fcvtzu z5.d, p0/m, z5.h -; CHECK-NEXT: fcvtzu z6.d, p0/m, z6.h -; CHECK-NEXT: ext z16.b, z16.b, z0.b, #8 -; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.h ; CHECK-NEXT: fcvtzu z4.d, p0/m, z4.h +; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.h +; CHECK-NEXT: fcvtzu z16.d, p0/m, z16.h +; CHECK-NEXT: mov z18.h, z5.h[3] ; CHECK-NEXT: fcvtzu z17.d, p0/m, z17.h -; CHECK-NEXT: fcvtzu z7.d, p0/m, z7.h -; CHECK-NEXT: mov z20.h, z1.h[3] -; CHECK-NEXT: mov z18.h, z16.h[3] -; CHECK-NEXT: mov z19.h, z16.h[2] -; CHECK-NEXT: mov z21.h, z16.h[1] -; CHECK-NEXT: zip1 z2.d, z2.d, z3.d -; CHECK-NEXT: mov z3.h, z1.h[2] -; CHECK-NEXT: zip1 z5.d, z6.d, z5.d -; CHECK-NEXT: mov z6.h, z1.h[1] +; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.h +; CHECK-NEXT: mov z19.h, z6.h[3] +; CHECK-NEXT: mov z20.h, z6.h[2] +; CHECK-NEXT: mov z21.h, z6.h[1] +; CHECK-NEXT: fcvtzu z6.d, p0/m, z6.h +; CHECK-NEXT: zip1 z2.d, z7.d, z2.d +; CHECK-NEXT: mov z7.h, z5.h[2] +; CHECK-NEXT: zip1 z3.d, z4.d, z3.d +; CHECK-NEXT: mov z4.h, z5.h[1] +; CHECK-NEXT: fcvtzu z19.d, p0/m, z19.h +; CHECK-NEXT: fcvtzu z5.d, p0/m, z5.h +; CHECK-NEXT: fcvtzu z20.d, p0/m, z20.h ; CHECK-NEXT: zip1 z0.d, z0.d, z17.d -; CHECK-NEXT: fcvtzu z16.d, p0/m, z16.h -; CHECK-NEXT: fcvtzu z18.d, p0/m, z18.h ; CHECK-NEXT: movprfx z17, z21 ; CHECK-NEXT: fcvtzu z17.d, p0/m, z21.h -; CHECK-NEXT: fcvtzu z19.d, p0/m, z19.h -; CHECK-NEXT: zip1 z4.d, z7.d, z4.d -; CHECK-NEXT: movprfx z7, z20 -; CHECK-NEXT: fcvtzu z7.d, p0/m, z20.h -; CHECK-NEXT: fcvtzu z3.d, p0/m, z3.h -; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.h -; CHECK-NEXT: stp q0, q5, [x1, #64] -; CHECK-NEXT: fcvtzu z6.d, p0/m, z6.h -; CHECK-NEXT: zip1 z0.d, z19.d, z18.d -; CHECK-NEXT: zip1 z5.d, z16.d, z17.d -; CHECK-NEXT: stp q2, q4, [x1] -; CHECK-NEXT: zip1 z2.d, z3.d, z7.d -; CHECK-NEXT: zip1 z1.d, z1.d, z6.d -; CHECK-NEXT: stp q5, q0, [x1, #96] -; CHECK-NEXT: stp q1, q2, [x1, #32] +; CHECK-NEXT: zip1 z1.d, z1.d, z16.d +; CHECK-NEXT: movprfx z16, z18 +; CHECK-NEXT: fcvtzu z16.d, p0/m, z18.h +; CHECK-NEXT: fcvtzu z7.d, p0/m, z7.h +; CHECK-NEXT: fcvtzu z4.d, p0/m, z4.h +; CHECK-NEXT: stp q0, q3, [x1, #64] +; CHECK-NEXT: zip1 z0.d, z20.d, z19.d +; CHECK-NEXT: zip1 z3.d, z6.d, z17.d +; CHECK-NEXT: stp q2, q1, [x1] +; CHECK-NEXT: zip1 z1.d, z7.d, z16.d +; CHECK-NEXT: zip1 z2.d, z5.d, z4.d +; CHECK-NEXT: stp q3, q0, [x1, #96] +; CHECK-NEXT: stp q2, q1, [x1, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i64: @@ -1068,12 +1072,13 @@ define void @fcvtzu_v4f32_v4i64(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: uunpklo z1.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.s +; CHECK-NEXT: uunpklo z1.d, z1.s ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s -; CHECK-NEXT: stp q1, q0, [x1] +; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.s +; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzu_v4f32_v4i64: @@ -1106,18 +1111,20 @@ define void @fcvtzu_v8f32_v8i64(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: uunpklo z2.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z3.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: uunpklo z1.d, z1.s -; CHECK-NEXT: fcvtzu z2.d, p0/m, z2.s -; CHECK-NEXT: fcvtzu z3.d, p0/m, z3.s +; CHECK-NEXT: uunpklo z2.d, z2.s +; CHECK-NEXT: uunpklo z3.d, z3.s ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s ; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.s -; CHECK-NEXT: stp q2, q0, [x1, #32] -; CHECK-NEXT: stp q3, q1, [x1] +; CHECK-NEXT: fcvtzu z2.d, p0/m, z2.s +; CHECK-NEXT: fcvtzu z3.d, p0/m, z3.s +; CHECK-NEXT: stp q1, q3, [x1] +; CHECK-NEXT: stp q0, q2, [x1, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzu_v8f32_v8i64: @@ -1957,12 +1964,13 @@ define void @fcvtzs_v8f16_v8i32(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: uunpklo z1.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.h +; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h -; CHECK-NEXT: stp q1, q0, [x1] +; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.h +; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i32: @@ -2015,18 +2023,20 @@ define void @fcvtzs_v16f16_v16i32(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: uunpklo z2.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z3.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: fcvtzs z2.s, p0/m, z2.h -; CHECK-NEXT: fcvtzs z3.s, p0/m, z3.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h ; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.h -; CHECK-NEXT: stp q2, q0, [x1, #32] -; CHECK-NEXT: stp q3, q1, [x1] +; CHECK-NEXT: fcvtzs z2.s, p0/m, z2.h +; CHECK-NEXT: fcvtzs z3.s, p0/m, z3.h +; CHECK-NEXT: stp q1, q3, [x1] +; CHECK-NEXT: stp q0, q2, [x1, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i32: @@ -2217,27 +2227,27 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z1.d, z0.d ; CHECK-NEXT: mov z2.h, z0.h[3] ; CHECK-NEXT: mov z3.h, z0.h[2] ; CHECK-NEXT: mov z4.h, z0.h[1] +; CHECK-NEXT: movprfx z1, z0 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.h ; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.h ; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.h -; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.h ; CHECK-NEXT: fcvtzs z4.d, p0/m, z4.h ; CHECK-NEXT: mov z5.h, z1.h[3] ; CHECK-NEXT: mov z6.h, z1.h[2] ; CHECK-NEXT: mov z7.h, z1.h[1] ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.h -; CHECK-NEXT: zip1 z2.d, z3.d, z2.d -; CHECK-NEXT: zip1 z0.d, z0.d, z4.d ; CHECK-NEXT: fcvtzs z5.d, p0/m, z5.h ; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.h ; CHECK-NEXT: fcvtzs z7.d, p0/m, z7.h -; CHECK-NEXT: stp q0, q2, [x1] +; CHECK-NEXT: zip1 z2.d, z3.d, z2.d +; CHECK-NEXT: zip1 z0.d, z0.d, z4.d ; CHECK-NEXT: zip1 z3.d, z6.d, z5.d ; CHECK-NEXT: zip1 z1.d, z1.d, z7.d +; CHECK-NEXT: stp q0, q2, [x1] ; CHECK-NEXT: stp q1, q3, [x1, #32] ; CHECK-NEXT: ret ; @@ -2293,52 +2303,53 @@ define void @fcvtzs_v16f16_v16i64(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z3.h, z1.h[1] -; CHECK-NEXT: mov z5.h, z0.h[3] -; CHECK-NEXT: mov z6.h, z0.h[2] -; CHECK-NEXT: mov z16.d, z0.d -; CHECK-NEXT: movprfx z2, z1 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z1.h -; CHECK-NEXT: mov z4.h, z1.h[3] -; CHECK-NEXT: mov z7.h, z1.h[2] +; CHECK-NEXT: mov z2.h, z1.h[1] +; CHECK-NEXT: mov z3.h, z0.h[3] +; CHECK-NEXT: mov z4.h, z0.h[2] +; CHECK-NEXT: movprfx z5, z1 +; CHECK-NEXT: ext z5.b, z5.b, z1.b, #8 +; CHECK-NEXT: movprfx z7, z1 +; CHECK-NEXT: fcvtzs z7.d, p0/m, z1.h +; CHECK-NEXT: mov z16.h, z1.h[3] +; CHECK-NEXT: mov z1.h, z1.h[2] ; CHECK-NEXT: mov z17.h, z0.h[1] -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: movprfx z6, z0 +; CHECK-NEXT: ext z6.b, z6.b, z0.b, #8 +; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.h ; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.h -; CHECK-NEXT: fcvtzs z5.d, p0/m, z5.h -; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.h -; CHECK-NEXT: ext z16.b, z16.b, z0.b, #8 -; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.h ; CHECK-NEXT: fcvtzs z4.d, p0/m, z4.h +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.h +; CHECK-NEXT: fcvtzs z16.d, p0/m, z16.h +; CHECK-NEXT: mov z18.h, z5.h[3] ; CHECK-NEXT: fcvtzs z17.d, p0/m, z17.h -; CHECK-NEXT: fcvtzs z7.d, p0/m, z7.h -; CHECK-NEXT: mov z20.h, z1.h[3] -; CHECK-NEXT: mov z18.h, z16.h[3] -; CHECK-NEXT: mov z19.h, z16.h[2] -; CHECK-NEXT: mov z21.h, z16.h[1] -; CHECK-NEXT: zip1 z2.d, z2.d, z3.d -; CHECK-NEXT: mov z3.h, z1.h[2] -; CHECK-NEXT: zip1 z5.d, z6.d, z5.d -; CHECK-NEXT: mov z6.h, z1.h[1] +; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.h +; CHECK-NEXT: mov z19.h, z6.h[3] +; CHECK-NEXT: mov z20.h, z6.h[2] +; CHECK-NEXT: mov z21.h, z6.h[1] +; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.h +; CHECK-NEXT: zip1 z2.d, z7.d, z2.d +; CHECK-NEXT: mov z7.h, z5.h[2] +; CHECK-NEXT: zip1 z3.d, z4.d, z3.d +; CHECK-NEXT: mov z4.h, z5.h[1] +; CHECK-NEXT: fcvtzs z19.d, p0/m, z19.h +; CHECK-NEXT: fcvtzs z5.d, p0/m, z5.h +; CHECK-NEXT: fcvtzs z20.d, p0/m, z20.h ; CHECK-NEXT: zip1 z0.d, z0.d, z17.d -; CHECK-NEXT: fcvtzs z16.d, p0/m, z16.h -; CHECK-NEXT: fcvtzs z18.d, p0/m, z18.h ; CHECK-NEXT: movprfx z17, z21 ; CHECK-NEXT: fcvtzs z17.d, p0/m, z21.h -; CHECK-NEXT: fcvtzs z19.d, p0/m, z19.h -; CHECK-NEXT: zip1 z4.d, z7.d, z4.d -; CHECK-NEXT: movprfx z7, z20 -; CHECK-NEXT: fcvtzs z7.d, p0/m, z20.h -; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.h -; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.h -; CHECK-NEXT: stp q0, q5, [x1, #64] -; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.h -; CHECK-NEXT: zip1 z0.d, z19.d, z18.d -; CHECK-NEXT: zip1 z5.d, z16.d, z17.d -; CHECK-NEXT: stp q2, q4, [x1] -; CHECK-NEXT: zip1 z2.d, z3.d, z7.d -; CHECK-NEXT: zip1 z1.d, z1.d, z6.d -; CHECK-NEXT: stp q5, q0, [x1, #96] -; CHECK-NEXT: stp q1, q2, [x1, #32] +; CHECK-NEXT: zip1 z1.d, z1.d, z16.d +; CHECK-NEXT: movprfx z16, z18 +; CHECK-NEXT: fcvtzs z16.d, p0/m, z18.h +; CHECK-NEXT: fcvtzs z7.d, p0/m, z7.h +; CHECK-NEXT: fcvtzs z4.d, p0/m, z4.h +; CHECK-NEXT: stp q0, q3, [x1, #64] +; CHECK-NEXT: zip1 z0.d, z20.d, z19.d +; CHECK-NEXT: zip1 z3.d, z6.d, z17.d +; CHECK-NEXT: stp q2, q1, [x1] +; CHECK-NEXT: zip1 z1.d, z7.d, z16.d +; CHECK-NEXT: zip1 z2.d, z5.d, z4.d +; CHECK-NEXT: stp q3, q0, [x1, #96] +; CHECK-NEXT: stp q2, q1, [x1, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i64: @@ -2764,12 +2775,13 @@ define void @fcvtzs_v4f32_v4i64(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: uunpklo z1.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.s +; CHECK-NEXT: uunpklo z1.d, z1.s ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s -; CHECK-NEXT: stp q1, q0, [x1] +; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.s +; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzs_v4f32_v4i64: @@ -2802,18 +2814,20 @@ define void @fcvtzs_v8f32_v8i64(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: uunpklo z2.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z3.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: uunpklo z1.d, z1.s -; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.s -; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.s +; CHECK-NEXT: uunpklo z2.d, z2.s +; CHECK-NEXT: uunpklo z3.d, z3.s ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.s -; CHECK-NEXT: stp q2, q0, [x1, #32] -; CHECK-NEXT: stp q3, q1, [x1] +; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.s +; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.s +; CHECK-NEXT: stp q1, q3, [x1] +; CHECK-NEXT: stp q0, q2, [x1, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fcvtzs_v8f32_v8i64: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll index 75911e5ff1569..c3dcb0f6d7f1f 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll @@ -18,14 +18,15 @@ define void @sext_v8i1_v8i32(<8 x i1> %a, ptr %out) { ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: uunpklo z1.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: lsl z1.s, z1.s, #31 +; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: lsl z0.s, z0.s, #31 -; CHECK-NEXT: asr z1.s, z1.s, #31 +; CHECK-NEXT: lsl z1.s, z1.s, #31 ; CHECK-NEXT: asr z0.s, z0.s, #31 -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: asr z1.s, z1.s, #31 +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v8i1_v8i32: @@ -74,14 +75,15 @@ define void @sext_v4i3_v4i64(<4 x i3> %a, ptr %out) { ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z1.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: lsl z1.d, z1.d, #61 +; CHECK-NEXT: uunpklo z1.d, z1.s ; CHECK-NEXT: lsl z0.d, z0.d, #61 -; CHECK-NEXT: asr z1.d, z1.d, #61 +; CHECK-NEXT: lsl z1.d, z1.d, #61 ; CHECK-NEXT: asr z0.d, z0.d, #61 -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: asr z1.d, z1.d, #61 +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v4i3_v4i64: @@ -116,10 +118,11 @@ define void @sext_v16i8_v16i16(<16 x i8> %a, ptr %out) { ; CHECK-LABEL: sext_v16i8_v16i16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: sunpklo z1.h, z0.b -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.h, z0.b -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: sunpklo z1.h, z1.b +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v16i8_v16i16: @@ -176,14 +179,16 @@ define void @sext_v32i8_v32i16(ptr %in, ptr %out) { ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: add z0.b, z0.b, z0.b ; CHECK-NEXT: add z1.b, z1.b, z1.b -; CHECK-NEXT: sunpklo z2.h, z0.b -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z3.h, z1.b -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 ; CHECK-NEXT: sunpklo z0.h, z0.b ; CHECK-NEXT: sunpklo z1.h, z1.b -; CHECK-NEXT: stp q2, q0, [x1, #32] -; CHECK-NEXT: stp q3, q1, [x1] +; CHECK-NEXT: sunpklo z2.h, z2.b +; CHECK-NEXT: sunpklo z3.h, z3.b +; CHECK-NEXT: stp q1, q3, [x1] +; CHECK-NEXT: stp q0, q2, [x1, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v32i8_v32i16: @@ -369,10 +374,11 @@ define void @sext_v8i8_v8i32(<8 x i8> %a, ptr %out) { ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: sunpklo z0.h, z0.b -; CHECK-NEXT: sunpklo z1.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v8i8_v8i32: @@ -405,17 +411,20 @@ define void @sext_v16i8_v16i32(<16 x i8> %a, ptr %out) { ; CHECK-LABEL: sext_v16i8_v16i32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: sunpklo z1.h, z0.b -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.h, z0.b -; CHECK-NEXT: sunpklo z2.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: sunpklo z1.s, z1.h -; CHECK-NEXT: sunpklo z3.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: sunpklo z1.h, z1.b +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: stp q2, q1, [x0] -; CHECK-NEXT: stp q3, q0, [x0, #32] +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: stp q0, q2, [x0] +; CHECK-NEXT: stp q1, q3, [x0, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v16i8_v16i32: @@ -465,28 +474,33 @@ define void @sext_v32i8_v32i32(ptr %in, ptr %out) { ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: add z0.b, z0.b, z0.b ; CHECK-NEXT: add z1.b, z1.b, z1.b -; CHECK-NEXT: sunpklo z2.h, z0.b -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: sunpklo z0.h, z0.b ; CHECK-NEXT: sunpklo z3.h, z1.b ; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: sunpklo z0.h, z0.b +; CHECK-NEXT: sunpklo z2.h, z2.b +; CHECK-NEXT: movprfx z4, z0 +; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 ; CHECK-NEXT: sunpklo z1.h, z1.b -; CHECK-NEXT: sunpklo z4.s, z2.h -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 -; CHECK-NEXT: sunpklo z5.s, z3.h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: sunpklo z6.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z2.s, z2.h -; CHECK-NEXT: sunpklo z7.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: movprfx z5, z3 +; CHECK-NEXT: ext z5.b, z5.b, z3.b, #8 ; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sunpklo z4.s, z4.h +; CHECK-NEXT: sunpklo z5.s, z5.h +; CHECK-NEXT: movprfx z6, z2 +; CHECK-NEXT: ext z6.b, z6.b, z2.b, #8 +; CHECK-NEXT: movprfx z7, z1 +; CHECK-NEXT: ext z7.b, z7.b, z1.b, #8 +; CHECK-NEXT: sunpklo z2.s, z2.h ; CHECK-NEXT: sunpklo z1.s, z1.h -; CHECK-NEXT: stp q4, q2, [x1, #64] -; CHECK-NEXT: stp q5, q3, [x1] -; CHECK-NEXT: stp q6, q0, [x1, #96] -; CHECK-NEXT: stp q7, q1, [x1, #32] +; CHECK-NEXT: stp q3, q5, [x1] +; CHECK-NEXT: sunpklo z3.s, z7.h +; CHECK-NEXT: stp q0, q4, [x1, #64] +; CHECK-NEXT: sunpklo z0.s, z6.h +; CHECK-NEXT: stp q1, q3, [x1, #32] +; CHECK-NEXT: stp q2, q0, [x1, #96] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v32i8_v32i32: @@ -664,12 +678,13 @@ define void @sext_v4i8_v4i64(<4 x i8> %a, ptr %out) { ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z1.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: sxtb z1.d, p0/m, z1.d +; CHECK-NEXT: uunpklo z1.d, z1.s ; CHECK-NEXT: sxtb z0.d, p0/m, z0.d -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: sxtb z1.d, p0/m, z1.d +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v4i8_v4i64: @@ -699,17 +714,20 @@ define void @sext_v8i8_v8i64(<8 x i8> %a, ptr %out) { ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: sunpklo z0.h, z0.b -; CHECK-NEXT: sunpklo z1.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z2.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: sunpklo z3.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z1.d, z1.s +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: stp q2, q1, [x0] -; CHECK-NEXT: stp q3, q0, [x0, #32] +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 +; CHECK-NEXT: sunpklo z2.d, z2.s +; CHECK-NEXT: sunpklo z1.d, z1.s +; CHECK-NEXT: sunpklo z3.d, z3.s +; CHECK-NEXT: stp q0, q2, [x0] +; CHECK-NEXT: stp q1, q3, [x0, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v8i8_v8i64: @@ -744,31 +762,37 @@ define void @sext_v16i8_v16i64(<16 x i8> %a, ptr %out) { ; CHECK-LABEL: sext_v16i8_v16i64: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: sunpklo z1.h, z0.b -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.h, z0.b -; CHECK-NEXT: sunpklo z2.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: sunpklo z1.s, z1.h -; CHECK-NEXT: sunpklo z3.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z4.d, z2.s -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: sunpklo z1.h, z1.b +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z6.d, z1.s +; CHECK-NEXT: sunpklo z3.s, z1.h ; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: sunpklo z5.d, z3.s -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: movprfx z4, z0 +; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 +; CHECK-NEXT: sunpklo z0.d, z0.s +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: movprfx z5, z3 +; CHECK-NEXT: ext z5.b, z5.b, z3.b, #8 +; CHECK-NEXT: sunpklo z3.d, z3.s +; CHECK-NEXT: sunpklo z4.d, z4.s +; CHECK-NEXT: movprfx z6, z2 +; CHECK-NEXT: ext z6.b, z6.b, z2.b, #8 ; CHECK-NEXT: sunpklo z2.d, z2.s +; CHECK-NEXT: sunpklo z5.d, z5.s +; CHECK-NEXT: sunpklo z6.d, z6.s +; CHECK-NEXT: stp q0, q4, [x0] +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: ext z0.b, z0.b, z1.b, #8 ; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: sunpklo z7.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z3.d, z3.s -; CHECK-NEXT: stp q4, q2, [x0] +; CHECK-NEXT: stp q3, q5, [x0, #64] ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: stp q6, q1, [x0, #32] -; CHECK-NEXT: stp q5, q3, [x0, #64] -; CHECK-NEXT: stp q7, q0, [x0, #96] +; CHECK-NEXT: stp q2, q6, [x0, #32] +; CHECK-NEXT: stp q1, q0, [x0, #96] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v16i8_v16i64: @@ -819,64 +843,69 @@ define void @sext_v16i8_v16i64(<16 x i8> %a, ptr %out) { define void @sext_v32i8_v32i64(ptr %in, ptr %out) { ; CHECK-LABEL: sext_v32i8_v32i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: add z0.b, z0.b, z0.b ; CHECK-NEXT: add z1.b, z1.b, z1.b -; CHECK-NEXT: mov z2.d, z0.d -; CHECK-NEXT: sunpklo z3.h, z1.b -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.h, z0.b +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 ; CHECK-NEXT: sunpklo z1.h, z1.b -; CHECK-NEXT: sunpklo z4.s, z3.h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: sunpklo z3.h, z3.b ; CHECK-NEXT: sunpklo z2.h, z2.b -; CHECK-NEXT: sunpklo z5.s, z0.h -; CHECK-NEXT: mov z7.d, z1.d +; CHECK-NEXT: movprfx z4, z0 +; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sunpklo z5.s, z1.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: movprfx z7, z3 +; CHECK-NEXT: ext z7.b, z7.b, z3.b, #8 ; CHECK-NEXT: sunpklo z3.s, z3.h -; CHECK-NEXT: sunpklo z16.d, z4.s -; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: sunpklo z4.s, z4.h +; CHECK-NEXT: movprfx z16, z0 +; CHECK-NEXT: ext z16.b, z16.b, z0.b, #8 +; CHECK-NEXT: sunpklo z0.d, z0.s +; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z6.s, z2.h ; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 -; CHECK-NEXT: ext z7.b, z7.b, z1.b, #8 -; CHECK-NEXT: mov z17.d, z5.d -; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: movprfx z17, z5 +; CHECK-NEXT: ext z17.b, z17.b, z5.b, #8 +; CHECK-NEXT: sunpklo z7.s, z7.h ; CHECK-NEXT: sunpklo z5.d, z5.s -; CHECK-NEXT: sunpklo z4.d, z4.s -; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z19.d, z3.s +; CHECK-NEXT: sunpklo z16.d, z16.s +; CHECK-NEXT: movprfx z20, z3 +; CHECK-NEXT: ext z20.b, z20.b, z3.b, #8 +; CHECK-NEXT: sunpklo z19.d, z4.s ; CHECK-NEXT: sunpklo z2.s, z2.h -; CHECK-NEXT: sunpklo z7.s, z7.h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: ext z17.b, z17.b, z17.b, #8 +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: sunpklo z3.d, z3.s +; CHECK-NEXT: sunpklo z17.d, z17.s ; CHECK-NEXT: sunpklo z18.d, z6.s ; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 -; CHECK-NEXT: sunpklo z20.d, z1.s +; CHECK-NEXT: str q5, [x1] +; CHECK-NEXT: stp q0, q16, [x1, #128] +; CHECK-NEXT: sunpklo z0.d, z1.s +; CHECK-NEXT: sunpklo z16.d, z20.s +; CHECK-NEXT: sunpklo z4.d, z4.s ; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: stp q16, q4, [x1, #128] -; CHECK-NEXT: sunpklo z3.d, z3.s -; CHECK-NEXT: sunpklo z16.d, z0.s -; CHECK-NEXT: sunpklo z17.d, z17.s -; CHECK-NEXT: mov z4.d, z7.d -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: sunpklo z6.d, z6.s +; CHECK-NEXT: stp q17, q0, [x1, #16] +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: ext z0.b, z0.b, z2.b, #8 ; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: ext z4.b, z4.b, z7.b, #8 -; CHECK-NEXT: stp q19, q3, [x1, #160] -; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: stp q5, q17, [x1] -; CHECK-NEXT: sunpklo z5.d, z6.s -; CHECK-NEXT: mov z6.d, z2.d -; CHECK-NEXT: stp q20, q1, [x1, #192] -; CHECK-NEXT: sunpklo z7.d, z7.s -; CHECK-NEXT: sunpklo z1.d, z4.s -; CHECK-NEXT: ext z6.b, z6.b, z2.b, #8 +; CHECK-NEXT: stp q3, q16, [x1, #64] +; CHECK-NEXT: movprfx z3, z7 +; CHECK-NEXT: ext z3.b, z3.b, z7.b, #8 ; CHECK-NEXT: sunpklo z2.d, z2.s -; CHECK-NEXT: stp q16, q0, [x1, #32] -; CHECK-NEXT: stp q18, q5, [x1, #64] -; CHECK-NEXT: sunpklo z3.d, z6.s -; CHECK-NEXT: stp q7, q1, [x1, #224] -; CHECK-NEXT: stp q2, q3, [x1, #96] +; CHECK-NEXT: stp q19, q4, [x1, #160] +; CHECK-NEXT: sunpklo z4.d, z7.s +; CHECK-NEXT: sunpklo z0.d, z0.s +; CHECK-NEXT: stp q18, q6, [x1, #192] +; CHECK-NEXT: sunpklo z3.d, z3.s +; CHECK-NEXT: str q1, [x1, #48] +; CHECK-NEXT: stp q2, q0, [x1, #224] +; CHECK-NEXT: stp q4, q3, [x1, #96] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v32i8_v32i64: @@ -1057,10 +1086,11 @@ define void @sext_v8i16_v8i32(<8 x i16> %a, ptr %out) { ; CHECK-LABEL: sext_v8i16_v8i32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: sunpklo z1.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v8i16_v8i32: @@ -1096,14 +1126,16 @@ define void @sext_v16i16_v16i32(ptr %in, ptr %out) { ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: add z0.h, z0.h, z0.h ; CHECK-NEXT: add z1.h, z1.h, z1.h -; CHECK-NEXT: sunpklo z2.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z3.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sunpklo z1.s, z1.h -; CHECK-NEXT: stp q2, q0, [x1, #32] -; CHECK-NEXT: stp q3, q1, [x1] +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: stp q1, q3, [x1] +; CHECK-NEXT: stp q0, q2, [x1, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v16i16_v16i32: @@ -1189,10 +1221,11 @@ define void @sext_v4i16_v4i64(<4 x i16> %a, ptr %out) { ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z1.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: sunpklo z1.d, z1.s +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v4i16_v4i64: @@ -1219,17 +1252,20 @@ define void @sext_v8i16_v8i64(<8 x i16> %a, ptr %out) { ; CHECK-LABEL: sext_v8i16_v8i64: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: sunpklo z1.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z2.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: sunpklo z3.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: stp q2, q1, [x0] -; CHECK-NEXT: stp q3, q0, [x0, #32] +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 +; CHECK-NEXT: sunpklo z1.d, z1.s +; CHECK-NEXT: sunpklo z2.d, z2.s +; CHECK-NEXT: sunpklo z3.d, z3.s +; CHECK-NEXT: stp q0, q2, [x0] +; CHECK-NEXT: stp q1, q3, [x0, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v8i16_v8i64: @@ -1267,28 +1303,33 @@ define void @sext_v16i16_v16i64(ptr %in, ptr %out) { ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: add z0.h, z0.h, z0.h ; CHECK-NEXT: add z1.h, z1.h, z1.h -; CHECK-NEXT: sunpklo z2.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sunpklo z3.s, z1.h ; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: movprfx z4, z0 +; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 ; CHECK-NEXT: sunpklo z1.s, z1.h -; CHECK-NEXT: sunpklo z4.d, z2.s -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 -; CHECK-NEXT: sunpklo z5.d, z3.s -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: sunpklo z6.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z2.d, z2.s -; CHECK-NEXT: sunpklo z7.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: sunpklo z3.d, z3.s +; CHECK-NEXT: movprfx z5, z3 +; CHECK-NEXT: ext z5.b, z5.b, z3.b, #8 ; CHECK-NEXT: sunpklo z0.d, z0.s +; CHECK-NEXT: sunpklo z3.d, z3.s +; CHECK-NEXT: sunpklo z4.d, z4.s +; CHECK-NEXT: sunpklo z5.d, z5.s +; CHECK-NEXT: movprfx z6, z2 +; CHECK-NEXT: ext z6.b, z6.b, z2.b, #8 +; CHECK-NEXT: movprfx z7, z1 +; CHECK-NEXT: ext z7.b, z7.b, z1.b, #8 +; CHECK-NEXT: sunpklo z2.d, z2.s ; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: stp q4, q2, [x1, #64] -; CHECK-NEXT: stp q5, q3, [x1] -; CHECK-NEXT: stp q6, q0, [x1, #96] -; CHECK-NEXT: stp q7, q1, [x1, #32] +; CHECK-NEXT: stp q3, q5, [x1] +; CHECK-NEXT: sunpklo z3.d, z7.s +; CHECK-NEXT: stp q0, q4, [x1, #64] +; CHECK-NEXT: sunpklo z0.d, z6.s +; CHECK-NEXT: stp q1, q3, [x1, #32] +; CHECK-NEXT: stp q2, q0, [x1, #96] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v16i16_v16i64: @@ -1378,10 +1419,11 @@ define void @sext_v4i32_v4i64(<4 x i32> %a, ptr %out) { ; CHECK-LABEL: sext_v4i32_v4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: sunpklo z1.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: sunpklo z1.d, z1.s +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v4i32_v4i64: @@ -1409,14 +1451,16 @@ define void @sext_v8i32_v8i64(ptr %in, ptr %out) { ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: add z0.s, z0.s, z0.s ; CHECK-NEXT: add z1.s, z1.s, z1.s -; CHECK-NEXT: sunpklo z2.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z3.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 ; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: stp q2, q0, [x1, #32] -; CHECK-NEXT: stp q3, q1, [x1] +; CHECK-NEXT: sunpklo z2.d, z2.s +; CHECK-NEXT: sunpklo z3.d, z3.s +; CHECK-NEXT: stp q1, q3, [x1] +; CHECK-NEXT: stp q0, q2, [x1, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: sext_v8i32_v8i64: @@ -1469,10 +1513,11 @@ define void @zext_v16i8_v16i16(<16 x i8> %a, ptr %out) { ; CHECK-LABEL: zext_v16i8_v16i16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: uunpklo z1.h, z0.b -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: uunpklo z1.h, z1.b +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v16i8_v16i16: @@ -1529,14 +1574,16 @@ define void @zext_v32i8_v32i16(ptr %in, ptr %out) { ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: add z0.b, z0.b, z0.b ; CHECK-NEXT: add z1.b, z1.b, z1.b -; CHECK-NEXT: uunpklo z2.h, z0.b -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z3.h, z1.b -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 ; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: uunpklo z1.h, z1.b -; CHECK-NEXT: stp q2, q0, [x1, #32] -; CHECK-NEXT: stp q3, q1, [x1] +; CHECK-NEXT: uunpklo z2.h, z2.b +; CHECK-NEXT: uunpklo z3.h, z3.b +; CHECK-NEXT: stp q1, q3, [x1] +; CHECK-NEXT: stp q0, q2, [x1, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v32i8_v32i16: @@ -1722,10 +1769,11 @@ define void @zext_v8i8_v8i32(<8 x i8> %a, ptr %out) { ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: uunpklo z1.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v8i8_v8i32: @@ -1758,17 +1806,20 @@ define void @zext_v16i8_v16i32(<16 x i8> %a, ptr %out) { ; CHECK-LABEL: zext_v16i8_v16i32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: uunpklo z1.h, z0.b -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: uunpklo z2.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: uunpklo z3.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: uunpklo z1.h, z1.b +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: stp q2, q1, [x0] -; CHECK-NEXT: stp q3, q0, [x0, #32] +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: stp q0, q2, [x0] +; CHECK-NEXT: stp q1, q3, [x0, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v16i8_v16i32: @@ -1818,28 +1869,33 @@ define void @zext_v32i8_v32i32(ptr %in, ptr %out) { ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: add z0.b, z0.b, z0.b ; CHECK-NEXT: add z1.b, z1.b, z1.b -; CHECK-NEXT: uunpklo z2.h, z0.b -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: uunpklo z3.h, z1.b ; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpklo z2.h, z2.b +; CHECK-NEXT: movprfx z4, z0 +; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 ; CHECK-NEXT: uunpklo z1.h, z1.b -; CHECK-NEXT: uunpklo z4.s, z2.h -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 -; CHECK-NEXT: uunpklo z5.s, z3.h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: uunpklo z6.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z2.s, z2.h -; CHECK-NEXT: uunpklo z7.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: movprfx z5, z3 +; CHECK-NEXT: ext z5.b, z5.b, z3.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: uunpklo z4.s, z4.h +; CHECK-NEXT: uunpklo z5.s, z5.h +; CHECK-NEXT: movprfx z6, z2 +; CHECK-NEXT: ext z6.b, z6.b, z2.b, #8 +; CHECK-NEXT: movprfx z7, z1 +; CHECK-NEXT: ext z7.b, z7.b, z1.b, #8 +; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: stp q4, q2, [x1, #64] -; CHECK-NEXT: stp q5, q3, [x1] -; CHECK-NEXT: stp q6, q0, [x1, #96] -; CHECK-NEXT: stp q7, q1, [x1, #32] +; CHECK-NEXT: stp q3, q5, [x1] +; CHECK-NEXT: uunpklo z3.s, z7.h +; CHECK-NEXT: stp q0, q4, [x1, #64] +; CHECK-NEXT: uunpklo z0.s, z6.h +; CHECK-NEXT: stp q1, q3, [x1, #32] +; CHECK-NEXT: stp q2, q0, [x1, #96] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v32i8_v32i32: @@ -2017,10 +2073,11 @@ define void @zext_v4i8_v4i64(<4 x i8> %a, ptr %out) { ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: and z0.h, z0.h, #0xff ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z1.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v4i8_v4i64: @@ -2050,17 +2107,20 @@ define void @zext_v8i8_v8i64(<8 x i8> %a, ptr %out) { ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: uunpklo z1.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z2.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: uunpklo z3.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: stp q2, q1, [x0] -; CHECK-NEXT: stp q3, q0, [x0, #32] +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 +; CHECK-NEXT: uunpklo z2.d, z2.s +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: uunpklo z3.d, z3.s +; CHECK-NEXT: stp q0, q2, [x0] +; CHECK-NEXT: stp q1, q3, [x0, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v8i8_v8i64: @@ -2099,31 +2159,37 @@ define void @zext_v16i8_v16i64(<16 x i8> %a, ptr %out) { ; CHECK-LABEL: zext_v16i8_v16i64: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: uunpklo z1.h, z0.b -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: uunpklo z2.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: uunpklo z3.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z4.d, z2.s -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: uunpklo z1.h, z1.b +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z6.d, z1.s +; CHECK-NEXT: uunpklo z3.s, z1.h ; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: uunpklo z5.d, z3.s -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: movprfx z4, z0 +; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: movprfx z5, z3 +; CHECK-NEXT: ext z5.b, z5.b, z3.b, #8 +; CHECK-NEXT: uunpklo z3.d, z3.s +; CHECK-NEXT: uunpklo z4.d, z4.s +; CHECK-NEXT: movprfx z6, z2 +; CHECK-NEXT: ext z6.b, z6.b, z2.b, #8 ; CHECK-NEXT: uunpklo z2.d, z2.s +; CHECK-NEXT: uunpklo z5.d, z5.s +; CHECK-NEXT: uunpklo z6.d, z6.s +; CHECK-NEXT: stp q0, q4, [x0] +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: ext z0.b, z0.b, z1.b, #8 ; CHECK-NEXT: uunpklo z1.d, z1.s -; CHECK-NEXT: uunpklo z7.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z3.d, z3.s -; CHECK-NEXT: stp q4, q2, [x0] +; CHECK-NEXT: stp q3, q5, [x0, #64] ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: stp q6, q1, [x0, #32] -; CHECK-NEXT: stp q5, q3, [x0, #64] -; CHECK-NEXT: stp q7, q0, [x0, #96] +; CHECK-NEXT: stp q2, q6, [x0, #32] +; CHECK-NEXT: stp q1, q0, [x0, #96] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v16i8_v16i64: @@ -2182,64 +2248,69 @@ define void @zext_v16i8_v16i64(<16 x i8> %a, ptr %out) { define void @zext_v32i8_v32i64(ptr %in, ptr %out) { ; CHECK-LABEL: zext_v32i8_v32i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: add z0.b, z0.b, z0.b ; CHECK-NEXT: add z1.b, z1.b, z1.b -; CHECK-NEXT: mov z2.d, z0.d -; CHECK-NEXT: uunpklo z3.h, z1.b -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 ; CHECK-NEXT: uunpklo z1.h, z1.b -; CHECK-NEXT: uunpklo z4.s, z3.h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: uunpklo z3.h, z3.b ; CHECK-NEXT: uunpklo z2.h, z2.b -; CHECK-NEXT: uunpklo z5.s, z0.h -; CHECK-NEXT: mov z7.d, z1.d +; CHECK-NEXT: movprfx z4, z0 +; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uunpklo z5.s, z1.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: movprfx z7, z3 +; CHECK-NEXT: ext z7.b, z7.b, z3.b, #8 ; CHECK-NEXT: uunpklo z3.s, z3.h -; CHECK-NEXT: uunpklo z16.d, z4.s -; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: uunpklo z4.s, z4.h +; CHECK-NEXT: movprfx z16, z0 +; CHECK-NEXT: ext z16.b, z16.b, z0.b, #8 +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z6.s, z2.h ; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 -; CHECK-NEXT: ext z7.b, z7.b, z1.b, #8 -; CHECK-NEXT: mov z17.d, z5.d -; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: movprfx z17, z5 +; CHECK-NEXT: ext z17.b, z17.b, z5.b, #8 +; CHECK-NEXT: uunpklo z7.s, z7.h ; CHECK-NEXT: uunpklo z5.d, z5.s -; CHECK-NEXT: uunpklo z4.d, z4.s -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z19.d, z3.s +; CHECK-NEXT: uunpklo z16.d, z16.s +; CHECK-NEXT: movprfx z20, z3 +; CHECK-NEXT: ext z20.b, z20.b, z3.b, #8 +; CHECK-NEXT: uunpklo z19.d, z4.s ; CHECK-NEXT: uunpklo z2.s, z2.h -; CHECK-NEXT: uunpklo z7.s, z7.h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: ext z17.b, z17.b, z17.b, #8 +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: uunpklo z3.d, z3.s +; CHECK-NEXT: uunpklo z17.d, z17.s ; CHECK-NEXT: uunpklo z18.d, z6.s ; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 -; CHECK-NEXT: uunpklo z20.d, z1.s +; CHECK-NEXT: str q5, [x1] +; CHECK-NEXT: stp q0, q16, [x1, #128] +; CHECK-NEXT: uunpklo z0.d, z1.s +; CHECK-NEXT: uunpklo z16.d, z20.s +; CHECK-NEXT: uunpklo z4.d, z4.s ; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: stp q16, q4, [x1, #128] -; CHECK-NEXT: uunpklo z3.d, z3.s -; CHECK-NEXT: uunpklo z16.d, z0.s -; CHECK-NEXT: uunpklo z17.d, z17.s -; CHECK-NEXT: mov z4.d, z7.d -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: uunpklo z6.d, z6.s +; CHECK-NEXT: stp q17, q0, [x1, #16] +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: ext z0.b, z0.b, z2.b, #8 ; CHECK-NEXT: uunpklo z1.d, z1.s -; CHECK-NEXT: ext z4.b, z4.b, z7.b, #8 -; CHECK-NEXT: stp q19, q3, [x1, #160] -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: stp q5, q17, [x1] -; CHECK-NEXT: uunpklo z5.d, z6.s -; CHECK-NEXT: mov z6.d, z2.d -; CHECK-NEXT: stp q20, q1, [x1, #192] -; CHECK-NEXT: uunpklo z7.d, z7.s -; CHECK-NEXT: uunpklo z1.d, z4.s -; CHECK-NEXT: ext z6.b, z6.b, z2.b, #8 +; CHECK-NEXT: stp q3, q16, [x1, #64] +; CHECK-NEXT: movprfx z3, z7 +; CHECK-NEXT: ext z3.b, z3.b, z7.b, #8 ; CHECK-NEXT: uunpklo z2.d, z2.s -; CHECK-NEXT: stp q16, q0, [x1, #32] -; CHECK-NEXT: stp q18, q5, [x1, #64] -; CHECK-NEXT: uunpklo z3.d, z6.s -; CHECK-NEXT: stp q7, q1, [x1, #224] -; CHECK-NEXT: stp q2, q3, [x1, #96] +; CHECK-NEXT: stp q19, q4, [x1, #160] +; CHECK-NEXT: uunpklo z4.d, z7.s +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: stp q18, q6, [x1, #192] +; CHECK-NEXT: uunpklo z3.d, z3.s +; CHECK-NEXT: str q1, [x1, #48] +; CHECK-NEXT: stp q2, q0, [x1, #224] +; CHECK-NEXT: stp q4, q3, [x1, #96] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v32i8_v32i64: @@ -2443,10 +2514,11 @@ define void @zext_v8i16_v8i32(<8 x i16> %a, ptr %out) { ; CHECK-LABEL: zext_v8i16_v8i32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: uunpklo z1.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v8i16_v8i32: @@ -2482,14 +2554,16 @@ define void @zext_v16i16_v16i32(ptr %in, ptr %out) { ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: add z0.h, z0.h, z0.h ; CHECK-NEXT: add z1.h, z1.h, z1.h -; CHECK-NEXT: uunpklo z2.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z3.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: stp q2, q0, [x1, #32] -; CHECK-NEXT: stp q3, q1, [x1] +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: stp q1, q3, [x1] +; CHECK-NEXT: stp q0, q2, [x1, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v16i16_v16i32: @@ -2575,10 +2649,11 @@ define void @zext_v4i16_v4i64(<4 x i16> %a, ptr %out) { ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z1.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v4i16_v4i64: @@ -2607,17 +2682,20 @@ define void @zext_v8i16_v8i64(<8 x i16> %a, ptr %out) { ; CHECK-LABEL: zext_v8i16_v8i64: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: uunpklo z1.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z2.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: uunpklo z1.d, z1.s -; CHECK-NEXT: uunpklo z3.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: stp q2, q1, [x0] -; CHECK-NEXT: stp q3, q0, [x0, #32] +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: uunpklo z2.d, z2.s +; CHECK-NEXT: uunpklo z3.d, z3.s +; CHECK-NEXT: stp q0, q2, [x0] +; CHECK-NEXT: stp q1, q3, [x0, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v8i16_v8i64: @@ -2659,28 +2737,33 @@ define void @zext_v16i16_v16i64(ptr %in, ptr %out) { ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: add z0.h, z0.h, z0.h ; CHECK-NEXT: add z1.h, z1.h, z1.h -; CHECK-NEXT: uunpklo z2.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: uunpklo z3.s, z1.h ; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: movprfx z4, z0 +; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 ; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: uunpklo z4.d, z2.s -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 -; CHECK-NEXT: uunpklo z5.d, z3.s -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: uunpklo z6.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z2.d, z2.s -; CHECK-NEXT: uunpklo z7.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: uunpklo z3.d, z3.s +; CHECK-NEXT: movprfx z5, z3 +; CHECK-NEXT: ext z5.b, z5.b, z3.b, #8 ; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: uunpklo z3.d, z3.s +; CHECK-NEXT: uunpklo z4.d, z4.s +; CHECK-NEXT: uunpklo z5.d, z5.s +; CHECK-NEXT: movprfx z6, z2 +; CHECK-NEXT: ext z6.b, z6.b, z2.b, #8 +; CHECK-NEXT: movprfx z7, z1 +; CHECK-NEXT: ext z7.b, z7.b, z1.b, #8 +; CHECK-NEXT: uunpklo z2.d, z2.s ; CHECK-NEXT: uunpklo z1.d, z1.s -; CHECK-NEXT: stp q4, q2, [x1, #64] -; CHECK-NEXT: stp q5, q3, [x1] -; CHECK-NEXT: stp q6, q0, [x1, #96] -; CHECK-NEXT: stp q7, q1, [x1, #32] +; CHECK-NEXT: stp q3, q5, [x1] +; CHECK-NEXT: uunpklo z3.d, z7.s +; CHECK-NEXT: stp q0, q4, [x1, #64] +; CHECK-NEXT: uunpklo z0.d, z6.s +; CHECK-NEXT: stp q1, q3, [x1, #32] +; CHECK-NEXT: stp q2, q0, [x1, #96] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v16i16_v16i64: @@ -2780,10 +2863,11 @@ define void @zext_v4i32_v4i64(<4 x i32> %a, ptr %out) { ; CHECK-LABEL: zext_v4i32_v4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: uunpklo z1.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v4i32_v4i64: @@ -2813,14 +2897,16 @@ define void @zext_v8i32_v8i64(ptr %in, ptr %out) { ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: add z0.s, z0.s, z0.s ; CHECK-NEXT: add z1.s, z1.s, z1.s -; CHECK-NEXT: uunpklo z2.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z3.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: uunpklo z1.d, z1.s -; CHECK-NEXT: stp q2, q0, [x1, #32] -; CHECK-NEXT: stp q3, q1, [x1] +; CHECK-NEXT: uunpklo z2.d, z2.s +; CHECK-NEXT: uunpklo z3.d, z3.s +; CHECK-NEXT: stp q1, q3, [x1] +; CHECK-NEXT: stp q0, q2, [x1, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: zext_v8i32_v8i64: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll index 9497ec88e57b4..372f6a06bf64b 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll @@ -151,14 +151,14 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: sunpklo z2.s, z2.h ; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s -; CHECK-NEXT: mov z5.d, z0.d +; CHECK-NEXT: movprfx z5, z0 ; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8 ; CHECK-NEXT: sunpklo z5.h, z5.b ; CHECK-NEXT: sunpklo z7.s, z5.h ; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 -; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: mov z3.d, z1.d ; CHECK-NEXT: sunpklo z5.s, z5.h +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: movprfx z3, z1 ; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 ; CHECK-NEXT: sunpklo z3.h, z3.b ; CHECK-NEXT: sunpklo z6.s, z3.h @@ -279,35 +279,35 @@ define void @srem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: ldr q1, [x1, #16] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: sunpklo z3.h, z1.b -; CHECK-NEXT: sunpklo z4.h, z0.b -; CHECK-NEXT: sunpklo z2.s, z3.h -; CHECK-NEXT: sunpklo z5.s, z4.h +; CHECK-NEXT: sunpklo z2.h, z1.b +; CHECK-NEXT: sunpklo z3.h, z0.b +; CHECK-NEXT: sunpklo z4.s, z2.h +; CHECK-NEXT: sunpklo z5.s, z3.h +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: sunpklo z2.s, z2.h ; CHECK-NEXT: sunpklo z3.s, z3.h -; CHECK-NEXT: sunpklo z4.s, z4.h -; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z5.s -; CHECK-NEXT: movprfx z5, z4 -; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z3.s -; CHECK-NEXT: mov z3.d, z1.d -; CHECK-NEXT: mov z4.d, z0.d -; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 -; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 -; CHECK-NEXT: sunpklo z7.h, z3.b -; CHECK-NEXT: sunpklo z16.h, z4.b -; CHECK-NEXT: sunpklo z3.s, z7.h -; CHECK-NEXT: sunpklo z4.s, z16.h +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: movprfx z5, z3 +; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z2.s +; CHECK-NEXT: movprfx z2, z1 +; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 +; CHECK-NEXT: movprfx z3, z0 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 +; CHECK-NEXT: sunpklo z7.h, z2.b +; CHECK-NEXT: sunpklo z16.h, z3.b +; CHECK-NEXT: sunpklo z2.s, z7.h +; CHECK-NEXT: sunpklo z3.s, z16.h ; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 ; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8 ; CHECK-NEXT: sunpklo z7.s, z7.h -; CHECK-NEXT: movprfx z6, z4 -; CHECK-NEXT: sdiv z6.s, p0/m, z6.s, z3.s -; CHECK-NEXT: ldr q3, [x0] -; CHECK-NEXT: ldr q4, [x1] +; CHECK-NEXT: movprfx z6, z3 +; CHECK-NEXT: sdiv z6.s, p0/m, z6.s, z2.s +; CHECK-NEXT: ldr q2, [x0] +; CHECK-NEXT: ldr q3, [x1] ; CHECK-NEXT: sunpklo z16.s, z16.h -; CHECK-NEXT: sunpklo z17.h, z4.b -; CHECK-NEXT: sunpklo z18.h, z3.b +; CHECK-NEXT: sunpklo z17.h, z3.b +; CHECK-NEXT: sunpklo z18.h, z2.b ; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z16.s ; CHECK-NEXT: sunpklo z19.s, z17.h ; CHECK-NEXT: sunpklo z20.s, z18.h @@ -316,44 +316,44 @@ define void @srem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: sunpklo z17.s, z17.h ; CHECK-NEXT: sunpklo z18.s, z18.h ; CHECK-NEXT: sdivr z19.s, p0/m, z19.s, z20.s -; CHECK-NEXT: mov z20.d, z3.d -; CHECK-NEXT: ext z20.b, z20.b, z3.b, #8 +; CHECK-NEXT: movprfx z20, z2 +; CHECK-NEXT: ext z20.b, z20.b, z2.b, #8 ; CHECK-NEXT: sunpklo z20.h, z20.b ; CHECK-NEXT: sunpklo z22.s, z20.h ; CHECK-NEXT: ext z20.b, z20.b, z20.b, #8 -; CHECK-NEXT: sdivr z17.s, p0/m, z17.s, z18.s -; CHECK-NEXT: mov z18.d, z4.d ; CHECK-NEXT: sunpklo z20.s, z20.h -; CHECK-NEXT: ext z18.b, z18.b, z4.b, #8 +; CHECK-NEXT: sdivr z17.s, p0/m, z17.s, z18.s +; CHECK-NEXT: movprfx z18, z3 +; CHECK-NEXT: ext z18.b, z18.b, z3.b, #8 ; CHECK-NEXT: sunpklo z18.h, z18.b ; CHECK-NEXT: sunpklo z21.s, z18.h ; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8 ; CHECK-NEXT: sunpklo z18.s, z18.h ; CHECK-NEXT: sdivr z21.s, p0/m, z21.s, z22.s -; CHECK-NEXT: uzp1 z22.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z22.h, z4.h, z4.h ; CHECK-NEXT: uzp1 z23.h, z5.h, z5.h -; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h -; CHECK-NEXT: uzp1 z6.h, z7.h, z7.h ; CHECK-NEXT: sdivr z18.s, p0/m, z18.s, z20.s ; CHECK-NEXT: uzp1 z19.h, z19.h, z19.h ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z20.h, z17.h, z17.h +; CHECK-NEXT: uzp1 z16.h, z6.h, z6.h +; CHECK-NEXT: uzp1 z17.h, z7.h, z7.h ; CHECK-NEXT: splice z7.h, p0, { z22.h, z23.h } -; CHECK-NEXT: splice z5.h, p0, { z5.h, z6.h } -; CHECK-NEXT: uzp1 z16.h, z21.h, z21.h -; CHECK-NEXT: splice z2.h, p0, { z19.h, z20.h } +; CHECK-NEXT: uzp1 z4.h, z21.h, z21.h +; CHECK-NEXT: splice z6.h, p0, { z19.h, z20.h } +; CHECK-NEXT: uzp1 z5.h, z18.h, z18.h +; CHECK-NEXT: splice z4.h, p0, { z4.h, z5.h } +; CHECK-NEXT: splice z5.h, p0, { z16.h, z17.h } +; CHECK-NEXT: uzp1 z16.b, z6.b, z6.b ; CHECK-NEXT: uzp1 z6.b, z7.b, z7.b -; CHECK-NEXT: uzp1 z7.b, z5.b, z5.b -; CHECK-NEXT: uzp1 z17.h, z18.h, z18.h -; CHECK-NEXT: splice z16.h, p0, { z16.h, z17.h } -; CHECK-NEXT: uzp1 z17.b, z2.b, z2.b ; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: uzp1 z17.b, z4.b, z4.b +; CHECK-NEXT: uzp1 z7.b, z5.b, z5.b +; CHECK-NEXT: splice z4.b, p0, { z16.b, z17.b } ; CHECK-NEXT: splice z5.b, p0, { z6.b, z7.b } -; CHECK-NEXT: uzp1 z18.b, z16.b, z16.b -; CHECK-NEXT: splice z2.b, p0, { z17.b, z18.b } ; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: mls z2.b, p0/m, z4.b, z3.b ; CHECK-NEXT: mls z0.b, p0/m, z5.b, z1.b -; CHECK-NEXT: msb z2.b, p0/m, z4.b, z3.b ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret ; @@ -590,13 +590,13 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: sunpklo z2.s, z1.h ; CHECK-NEXT: sunpklo z3.s, z0.h +; CHECK-NEXT: movprfx z4, z0 +; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: sunpklo z4.s, z4.h ; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: mov z3.d, z1.d -; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 +; CHECK-NEXT: movprfx z3, z1 ; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 -; CHECK-NEXT: sunpklo z4.s, z4.h ; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: ptrue p0.h, vl4 @@ -668,21 +668,21 @@ define void @srem_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: sunpklo z2.s, z1.h ; CHECK-NEXT: sunpklo z3.s, z0.h ; CHECK-NEXT: sunpklo z5.s, z4.h -; CHECK-NEXT: mov z16.d, z0.d +; CHECK-NEXT: movprfx z16, z0 +; CHECK-NEXT: ext z16.b, z16.b, z0.b, #8 ; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: ldr q3, [x0] -; CHECK-NEXT: ext z16.b, z16.b, z0.b, #8 -; CHECK-NEXT: sunpklo z6.s, z3.h -; CHECK-NEXT: mov z7.d, z3.d ; CHECK-NEXT: sunpklo z16.s, z16.h +; CHECK-NEXT: sunpklo z6.s, z3.h +; CHECK-NEXT: movprfx z7, z3 ; CHECK-NEXT: ext z7.b, z7.b, z3.b, #8 ; CHECK-NEXT: sunpklo z7.s, z7.h ; CHECK-NEXT: sdivr z5.s, p0/m, z5.s, z6.s -; CHECK-NEXT: mov z6.d, z4.d +; CHECK-NEXT: movprfx z6, z4 ; CHECK-NEXT: ext z6.b, z6.b, z4.b, #8 ; CHECK-NEXT: sunpklo z6.s, z6.h ; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s -; CHECK-NEXT: mov z7.d, z1.d +; CHECK-NEXT: movprfx z7, z1 ; CHECK-NEXT: ext z7.b, z7.b, z1.b, #8 ; CHECK-NEXT: sunpklo z7.s, z7.h ; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z16.s @@ -1201,14 +1201,14 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s -; CHECK-NEXT: mov z5.d, z0.d +; CHECK-NEXT: movprfx z5, z0 ; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8 ; CHECK-NEXT: uunpklo z5.h, z5.b ; CHECK-NEXT: uunpklo z7.s, z5.h ; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 -; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: mov z3.d, z1.d ; CHECK-NEXT: uunpklo z5.s, z5.h +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: movprfx z3, z1 ; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 ; CHECK-NEXT: uunpklo z3.h, z3.b ; CHECK-NEXT: uunpklo z6.s, z3.h @@ -1329,35 +1329,35 @@ define void @urem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: ldr q1, [x1, #16] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: uunpklo z3.h, z1.b -; CHECK-NEXT: uunpklo z4.h, z0.b -; CHECK-NEXT: uunpklo z2.s, z3.h -; CHECK-NEXT: uunpklo z5.s, z4.h +; CHECK-NEXT: uunpklo z2.h, z1.b +; CHECK-NEXT: uunpklo z3.h, z0.b +; CHECK-NEXT: uunpklo z4.s, z2.h +; CHECK-NEXT: uunpklo z5.s, z3.h +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: uunpklo z3.s, z3.h -; CHECK-NEXT: uunpklo z4.s, z4.h -; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z5.s -; CHECK-NEXT: movprfx z5, z4 -; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z3.s -; CHECK-NEXT: mov z3.d, z1.d -; CHECK-NEXT: mov z4.d, z0.d -; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 -; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 -; CHECK-NEXT: uunpklo z7.h, z3.b -; CHECK-NEXT: uunpklo z16.h, z4.b -; CHECK-NEXT: uunpklo z3.s, z7.h -; CHECK-NEXT: uunpklo z4.s, z16.h +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: movprfx z5, z3 +; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z2.s +; CHECK-NEXT: movprfx z2, z1 +; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 +; CHECK-NEXT: movprfx z3, z0 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 +; CHECK-NEXT: uunpklo z7.h, z2.b +; CHECK-NEXT: uunpklo z16.h, z3.b +; CHECK-NEXT: uunpklo z2.s, z7.h +; CHECK-NEXT: uunpklo z3.s, z16.h ; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 ; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8 ; CHECK-NEXT: uunpklo z7.s, z7.h -; CHECK-NEXT: movprfx z6, z4 -; CHECK-NEXT: udiv z6.s, p0/m, z6.s, z3.s -; CHECK-NEXT: ldr q3, [x0] -; CHECK-NEXT: ldr q4, [x1] +; CHECK-NEXT: movprfx z6, z3 +; CHECK-NEXT: udiv z6.s, p0/m, z6.s, z2.s +; CHECK-NEXT: ldr q2, [x0] +; CHECK-NEXT: ldr q3, [x1] ; CHECK-NEXT: uunpklo z16.s, z16.h -; CHECK-NEXT: uunpklo z17.h, z4.b -; CHECK-NEXT: uunpklo z18.h, z3.b +; CHECK-NEXT: uunpklo z17.h, z3.b +; CHECK-NEXT: uunpklo z18.h, z2.b ; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z16.s ; CHECK-NEXT: uunpklo z19.s, z17.h ; CHECK-NEXT: uunpklo z20.s, z18.h @@ -1366,44 +1366,44 @@ define void @urem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: uunpklo z17.s, z17.h ; CHECK-NEXT: uunpklo z18.s, z18.h ; CHECK-NEXT: udivr z19.s, p0/m, z19.s, z20.s -; CHECK-NEXT: mov z20.d, z3.d -; CHECK-NEXT: ext z20.b, z20.b, z3.b, #8 +; CHECK-NEXT: movprfx z20, z2 +; CHECK-NEXT: ext z20.b, z20.b, z2.b, #8 ; CHECK-NEXT: uunpklo z20.h, z20.b ; CHECK-NEXT: uunpklo z22.s, z20.h ; CHECK-NEXT: ext z20.b, z20.b, z20.b, #8 -; CHECK-NEXT: udivr z17.s, p0/m, z17.s, z18.s -; CHECK-NEXT: mov z18.d, z4.d ; CHECK-NEXT: uunpklo z20.s, z20.h -; CHECK-NEXT: ext z18.b, z18.b, z4.b, #8 +; CHECK-NEXT: udivr z17.s, p0/m, z17.s, z18.s +; CHECK-NEXT: movprfx z18, z3 +; CHECK-NEXT: ext z18.b, z18.b, z3.b, #8 ; CHECK-NEXT: uunpklo z18.h, z18.b ; CHECK-NEXT: uunpklo z21.s, z18.h ; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8 ; CHECK-NEXT: uunpklo z18.s, z18.h ; CHECK-NEXT: udivr z21.s, p0/m, z21.s, z22.s -; CHECK-NEXT: uzp1 z22.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z22.h, z4.h, z4.h ; CHECK-NEXT: uzp1 z23.h, z5.h, z5.h -; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h -; CHECK-NEXT: uzp1 z6.h, z7.h, z7.h ; CHECK-NEXT: udivr z18.s, p0/m, z18.s, z20.s ; CHECK-NEXT: uzp1 z19.h, z19.h, z19.h ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z20.h, z17.h, z17.h +; CHECK-NEXT: uzp1 z16.h, z6.h, z6.h +; CHECK-NEXT: uzp1 z17.h, z7.h, z7.h ; CHECK-NEXT: splice z7.h, p0, { z22.h, z23.h } -; CHECK-NEXT: splice z5.h, p0, { z5.h, z6.h } -; CHECK-NEXT: uzp1 z16.h, z21.h, z21.h -; CHECK-NEXT: splice z2.h, p0, { z19.h, z20.h } +; CHECK-NEXT: uzp1 z4.h, z21.h, z21.h +; CHECK-NEXT: splice z6.h, p0, { z19.h, z20.h } +; CHECK-NEXT: uzp1 z5.h, z18.h, z18.h +; CHECK-NEXT: splice z4.h, p0, { z4.h, z5.h } +; CHECK-NEXT: splice z5.h, p0, { z16.h, z17.h } +; CHECK-NEXT: uzp1 z16.b, z6.b, z6.b ; CHECK-NEXT: uzp1 z6.b, z7.b, z7.b -; CHECK-NEXT: uzp1 z7.b, z5.b, z5.b -; CHECK-NEXT: uzp1 z17.h, z18.h, z18.h -; CHECK-NEXT: splice z16.h, p0, { z16.h, z17.h } -; CHECK-NEXT: uzp1 z17.b, z2.b, z2.b ; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: uzp1 z17.b, z4.b, z4.b +; CHECK-NEXT: uzp1 z7.b, z5.b, z5.b +; CHECK-NEXT: splice z4.b, p0, { z16.b, z17.b } ; CHECK-NEXT: splice z5.b, p0, { z6.b, z7.b } -; CHECK-NEXT: uzp1 z18.b, z16.b, z16.b -; CHECK-NEXT: splice z2.b, p0, { z17.b, z18.b } ; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: mls z2.b, p0/m, z4.b, z3.b ; CHECK-NEXT: mls z0.b, p0/m, z5.b, z1.b -; CHECK-NEXT: msb z2.b, p0/m, z4.b, z3.b ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret ; @@ -1640,13 +1640,13 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: uunpklo z2.s, z1.h ; CHECK-NEXT: uunpklo z3.s, z0.h +; CHECK-NEXT: movprfx z4, z0 +; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: uunpklo z4.s, z4.h ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: mov z3.d, z1.d -; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 +; CHECK-NEXT: movprfx z3, z1 ; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 -; CHECK-NEXT: uunpklo z4.s, z4.h ; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: ptrue p0.h, vl4 @@ -1718,21 +1718,21 @@ define void @urem_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: uunpklo z2.s, z1.h ; CHECK-NEXT: uunpklo z3.s, z0.h ; CHECK-NEXT: uunpklo z5.s, z4.h -; CHECK-NEXT: mov z16.d, z0.d +; CHECK-NEXT: movprfx z16, z0 +; CHECK-NEXT: ext z16.b, z16.b, z0.b, #8 ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: ldr q3, [x0] -; CHECK-NEXT: ext z16.b, z16.b, z0.b, #8 -; CHECK-NEXT: uunpklo z6.s, z3.h -; CHECK-NEXT: mov z7.d, z3.d ; CHECK-NEXT: uunpklo z16.s, z16.h +; CHECK-NEXT: uunpklo z6.s, z3.h +; CHECK-NEXT: movprfx z7, z3 ; CHECK-NEXT: ext z7.b, z7.b, z3.b, #8 ; CHECK-NEXT: uunpklo z7.s, z7.h ; CHECK-NEXT: udivr z5.s, p0/m, z5.s, z6.s -; CHECK-NEXT: mov z6.d, z4.d +; CHECK-NEXT: movprfx z6, z4 ; CHECK-NEXT: ext z6.b, z6.b, z4.b, #8 ; CHECK-NEXT: uunpklo z6.s, z6.h ; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s -; CHECK-NEXT: mov z7.d, z1.d +; CHECK-NEXT: movprfx z7, z1 ; CHECK-NEXT: ext z7.b, z7.b, z1.b, #8 ; CHECK-NEXT: uunpklo z7.s, z7.h ; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z16.s diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll index 7df362826d052..f7fadaa1217bc 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll @@ -256,12 +256,13 @@ define void @ucvtf_v8i16_v8f32(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: uunpklo z1.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: ucvtf z1.s, p0/m, z1.s +; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s -; CHECK-NEXT: stp q1, q0, [x1] +; CHECK-NEXT: ucvtf z1.s, p0/m, z1.s +; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f32: @@ -306,18 +307,20 @@ define void @ucvtf_v16i16_v16f32(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: uunpklo z2.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z3.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: ucvtf z2.s, p0/m, z2.s -; CHECK-NEXT: ucvtf z3.s, p0/m, z3.s +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: ucvtf z1.s, p0/m, z1.s -; CHECK-NEXT: stp q2, q0, [x1, #32] -; CHECK-NEXT: stp q3, q1, [x1] +; CHECK-NEXT: ucvtf z2.s, p0/m, z2.s +; CHECK-NEXT: ucvtf z3.s, p0/m, z3.s +; CHECK-NEXT: stp q1, q3, [x1] +; CHECK-NEXT: stp q0, q2, [x1, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f32: @@ -442,12 +445,13 @@ define void @ucvtf_v4i16_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z1.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: ucvtf z1.d, p0/m, z1.d +; CHECK-NEXT: uunpklo z1.d, z1.s ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d -; CHECK-NEXT: stp q1, q0, [x1] +; CHECK-NEXT: ucvtf z1.d, p0/m, z1.d +; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: ucvtf_v4i16_v4f64: @@ -481,21 +485,24 @@ define void @ucvtf_v8i16_v8f64(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: uunpklo z1.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z2.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: uunpklo z3.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 +; CHECK-NEXT: uunpklo z2.d, z2.s ; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d +; CHECK-NEXT: uunpklo z3.d, z3.s ; CHECK-NEXT: ucvtf z2.d, p0/m, z2.d -; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: ucvtf z1.d, p0/m, z1.d ; CHECK-NEXT: ucvtf z3.d, p0/m, z3.d -; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d -; CHECK-NEXT: stp q2, q1, [x1] -; CHECK-NEXT: stp q3, q0, [x1, #32] +; CHECK-NEXT: stp q0, q2, [x1] +; CHECK-NEXT: stp q1, q3, [x1, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f64: @@ -542,43 +549,43 @@ define void @ucvtf_v16i16_v16f64(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: mov z2.d, z0.d -; CHECK-NEXT: mov z3.d, z1.d -; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: uunpklo z3.s, z1.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 ; CHECK-NEXT: uunpklo z2.s, z2.h -; CHECK-NEXT: uunpklo z3.s, z3.h -; CHECK-NEXT: uunpklo z5.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: ext z4.b, z4.b, z1.b, #8 -; CHECK-NEXT: uunpklo z1.d, z1.s -; CHECK-NEXT: mov z6.d, z2.d -; CHECK-NEXT: mov z7.d, z3.d +; CHECK-NEXT: movprfx z4, z0 +; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: movprfx z5, z3 +; CHECK-NEXT: ext z5.b, z5.b, z3.b, #8 ; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: uunpklo z3.d, z3.s ; CHECK-NEXT: uunpklo z4.d, z4.s -; CHECK-NEXT: ucvtf z5.d, p0/m, z5.d -; CHECK-NEXT: ucvtf z1.d, p0/m, z1.d +; CHECK-NEXT: uunpklo z5.d, z5.s +; CHECK-NEXT: movprfx z6, z2 ; CHECK-NEXT: ext z6.b, z6.b, z2.b, #8 -; CHECK-NEXT: ext z7.b, z7.b, z3.b, #8 +; CHECK-NEXT: movprfx z7, z1 +; CHECK-NEXT: ext z7.b, z7.b, z1.b, #8 ; CHECK-NEXT: uunpklo z2.d, z2.s -; CHECK-NEXT: uunpklo z3.d, z3.s +; CHECK-NEXT: uunpklo z1.d, z1.s ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d -; CHECK-NEXT: ucvtf z4.d, p0/m, z4.d +; CHECK-NEXT: ucvtf z3.d, p0/m, z3.d ; CHECK-NEXT: uunpklo z6.d, z6.s +; CHECK-NEXT: ucvtf z4.d, p0/m, z4.d ; CHECK-NEXT: uunpklo z7.d, z7.s +; CHECK-NEXT: ucvtf z5.d, p0/m, z5.d ; CHECK-NEXT: ucvtf z2.d, p0/m, z2.d -; CHECK-NEXT: stp q5, q0, [x1, #64] -; CHECK-NEXT: ucvtf z3.d, p0/m, z3.d -; CHECK-NEXT: stp q1, q4, [x1] -; CHECK-NEXT: movprfx z1, z6 -; CHECK-NEXT: ucvtf z1.d, p0/m, z6.d -; CHECK-NEXT: movprfx z0, z7 -; CHECK-NEXT: ucvtf z0.d, p0/m, z7.d -; CHECK-NEXT: stp q3, q0, [x1, #32] -; CHECK-NEXT: stp q2, q1, [x1, #96] +; CHECK-NEXT: ucvtf z1.d, p0/m, z1.d +; CHECK-NEXT: stp q3, q5, [x1] +; CHECK-NEXT: movprfx z3, z7 +; CHECK-NEXT: ucvtf z3.d, p0/m, z7.d +; CHECK-NEXT: stp q0, q4, [x1, #64] +; CHECK-NEXT: movprfx z0, z6 +; CHECK-NEXT: ucvtf z0.d, p0/m, z6.d +; CHECK-NEXT: stp q1, q3, [x1, #32] +; CHECK-NEXT: stp q2, q0, [x1, #96] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f64: @@ -997,12 +1004,13 @@ define void @ucvtf_v4i32_v4f64(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: uunpklo z1.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: ucvtf z1.d, p0/m, z1.d +; CHECK-NEXT: uunpklo z1.d, z1.s ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d -; CHECK-NEXT: stp q1, q0, [x1] +; CHECK-NEXT: ucvtf z1.d, p0/m, z1.d +; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: ucvtf_v4i32_v4f64: @@ -1035,18 +1043,20 @@ define void @ucvtf_v8i32_v8f64(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: uunpklo z2.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z3.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: uunpklo z1.d, z1.s -; CHECK-NEXT: ucvtf z2.d, p0/m, z2.d -; CHECK-NEXT: ucvtf z3.d, p0/m, z3.d +; CHECK-NEXT: uunpklo z2.d, z2.s +; CHECK-NEXT: uunpklo z3.d, z3.s ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: ucvtf z1.d, p0/m, z1.d -; CHECK-NEXT: stp q2, q0, [x1, #32] -; CHECK-NEXT: stp q3, q1, [x1] +; CHECK-NEXT: ucvtf z2.d, p0/m, z2.d +; CHECK-NEXT: ucvtf z3.d, p0/m, z3.d +; CHECK-NEXT: stp q1, q3, [x1] +; CHECK-NEXT: stp q0, q2, [x1, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: ucvtf_v8i32_v8f64: @@ -1669,12 +1679,13 @@ define void @scvtf_v8i16_v8f32(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: sunpklo z1.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: scvtf z1.s, p0/m, z1.s +; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s -; CHECK-NEXT: stp q1, q0, [x1] +; CHECK-NEXT: scvtf z1.s, p0/m, z1.s +; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f32: @@ -1719,18 +1730,20 @@ define void @scvtf_v16i16_v16f32(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: sunpklo z2.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z3.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sunpklo z1.s, z1.h -; CHECK-NEXT: scvtf z2.s, p0/m, z2.s -; CHECK-NEXT: scvtf z3.s, p0/m, z3.s +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s ; CHECK-NEXT: scvtf z1.s, p0/m, z1.s -; CHECK-NEXT: stp q2, q0, [x1, #32] -; CHECK-NEXT: stp q3, q1, [x1] +; CHECK-NEXT: scvtf z2.s, p0/m, z2.s +; CHECK-NEXT: scvtf z3.s, p0/m, z3.s +; CHECK-NEXT: stp q1, q3, [x1] +; CHECK-NEXT: stp q0, q2, [x1, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f32: @@ -1833,12 +1846,13 @@ define void @scvtf_v4i16_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z1.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: scvtf z1.d, p0/m, z1.d +; CHECK-NEXT: sunpklo z1.d, z1.s ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d -; CHECK-NEXT: stp q1, q0, [x1] +; CHECK-NEXT: scvtf z1.d, p0/m, z1.d +; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: scvtf_v4i16_v4f64: @@ -1872,21 +1886,24 @@ define void @scvtf_v8i16_v8f64(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: sunpklo z1.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z2.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: sunpklo z3.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: sunpklo z0.d, z0.s +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 +; CHECK-NEXT: sunpklo z2.d, z2.s ; CHECK-NEXT: sunpklo z1.d, z1.s +; CHECK-NEXT: scvtf z0.d, p0/m, z0.d +; CHECK-NEXT: sunpklo z3.d, z3.s ; CHECK-NEXT: scvtf z2.d, p0/m, z2.d -; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: scvtf z1.d, p0/m, z1.d ; CHECK-NEXT: scvtf z3.d, p0/m, z3.d -; CHECK-NEXT: scvtf z0.d, p0/m, z0.d -; CHECK-NEXT: stp q2, q1, [x1] -; CHECK-NEXT: stp q3, q0, [x1, #32] +; CHECK-NEXT: stp q0, q2, [x1] +; CHECK-NEXT: stp q1, q3, [x1, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f64: @@ -1933,43 +1950,43 @@ define void @scvtf_v16i16_v16f64(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: mov z2.d, z0.d -; CHECK-NEXT: mov z3.d, z1.d -; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: sunpklo z3.s, z1.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 ; CHECK-NEXT: sunpklo z2.s, z2.h -; CHECK-NEXT: sunpklo z3.s, z3.h -; CHECK-NEXT: sunpklo z5.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: ext z4.b, z4.b, z1.b, #8 -; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: mov z6.d, z2.d -; CHECK-NEXT: mov z7.d, z3.d +; CHECK-NEXT: movprfx z4, z0 +; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: movprfx z5, z3 +; CHECK-NEXT: ext z5.b, z5.b, z3.b, #8 ; CHECK-NEXT: sunpklo z0.d, z0.s +; CHECK-NEXT: sunpklo z3.d, z3.s ; CHECK-NEXT: sunpklo z4.d, z4.s -; CHECK-NEXT: scvtf z5.d, p0/m, z5.d -; CHECK-NEXT: scvtf z1.d, p0/m, z1.d +; CHECK-NEXT: sunpklo z5.d, z5.s +; CHECK-NEXT: movprfx z6, z2 ; CHECK-NEXT: ext z6.b, z6.b, z2.b, #8 -; CHECK-NEXT: ext z7.b, z7.b, z3.b, #8 +; CHECK-NEXT: movprfx z7, z1 +; CHECK-NEXT: ext z7.b, z7.b, z1.b, #8 ; CHECK-NEXT: sunpklo z2.d, z2.s -; CHECK-NEXT: sunpklo z3.d, z3.s +; CHECK-NEXT: sunpklo z1.d, z1.s ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d -; CHECK-NEXT: scvtf z4.d, p0/m, z4.d +; CHECK-NEXT: scvtf z3.d, p0/m, z3.d ; CHECK-NEXT: sunpklo z6.d, z6.s +; CHECK-NEXT: scvtf z4.d, p0/m, z4.d ; CHECK-NEXT: sunpklo z7.d, z7.s +; CHECK-NEXT: scvtf z5.d, p0/m, z5.d ; CHECK-NEXT: scvtf z2.d, p0/m, z2.d -; CHECK-NEXT: stp q5, q0, [x1, #64] -; CHECK-NEXT: scvtf z3.d, p0/m, z3.d -; CHECK-NEXT: stp q1, q4, [x1] -; CHECK-NEXT: movprfx z1, z6 -; CHECK-NEXT: scvtf z1.d, p0/m, z6.d -; CHECK-NEXT: movprfx z0, z7 -; CHECK-NEXT: scvtf z0.d, p0/m, z7.d -; CHECK-NEXT: stp q3, q0, [x1, #32] -; CHECK-NEXT: stp q2, q1, [x1, #96] +; CHECK-NEXT: scvtf z1.d, p0/m, z1.d +; CHECK-NEXT: stp q3, q5, [x1] +; CHECK-NEXT: movprfx z3, z7 +; CHECK-NEXT: scvtf z3.d, p0/m, z7.d +; CHECK-NEXT: stp q0, q4, [x1, #64] +; CHECK-NEXT: movprfx z0, z6 +; CHECK-NEXT: scvtf z0.d, p0/m, z6.d +; CHECK-NEXT: stp q1, q3, [x1, #32] +; CHECK-NEXT: stp q2, q0, [x1, #96] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f64: @@ -2293,12 +2310,13 @@ define void @scvtf_v4i32_v4f64(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: sunpklo z1.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: scvtf z1.d, p0/m, z1.d +; CHECK-NEXT: sunpklo z1.d, z1.s ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d -; CHECK-NEXT: stp q1, q0, [x1] +; CHECK-NEXT: scvtf z1.d, p0/m, z1.d +; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: scvtf_v4i32_v4f64: @@ -2331,18 +2349,20 @@ define void @scvtf_v8i32_v8f64(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: sunpklo z2.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z3.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 ; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: scvtf z2.d, p0/m, z2.d -; CHECK-NEXT: scvtf z3.d, p0/m, z3.d +; CHECK-NEXT: sunpklo z2.d, z2.s +; CHECK-NEXT: sunpklo z3.d, z3.s ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: scvtf z1.d, p0/m, z1.d -; CHECK-NEXT: stp q2, q0, [x1, #32] -; CHECK-NEXT: stp q3, q1, [x1] +; CHECK-NEXT: scvtf z2.d, p0/m, z2.d +; CHECK-NEXT: scvtf z3.d, p0/m, z3.d +; CHECK-NEXT: stp q1, q3, [x1] +; CHECK-NEXT: stp q0, q2, [x1, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: scvtf_v8i32_v8f64: @@ -2388,15 +2408,15 @@ define void @scvtf_v16i32_v16f64(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q1, q0, [x0, #32] ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q5, q3, [x0] -; CHECK-NEXT: mov z2.d, z0.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: mov z6.d, z3.d -; CHECK-NEXT: mov z7.d, z5.d +; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: movprfx z4, z1 ; CHECK-NEXT: ext z4.b, z4.b, z1.b, #8 ; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: sunpklo z1.d, z1.s +; CHECK-NEXT: movprfx z6, z3 ; CHECK-NEXT: ext z6.b, z6.b, z3.b, #8 +; CHECK-NEXT: movprfx z7, z5 ; CHECK-NEXT: ext z7.b, z7.b, z5.b, #8 ; CHECK-NEXT: sunpklo z3.d, z3.s ; CHECK-NEXT: sunpklo z5.d, z5.s diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll index 3627390b5edfa..daf2734ce7d55 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll @@ -99,15 +99,16 @@ define <2 x i32> @test2(ptr %arg1, ptr %arg2) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldp q1, q0, [x0, #32] ; CHECK-NEXT: ldp q3, q4, [x0] -; CHECK-NEXT: add z2.s, z0.s, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z2, z0 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 ; CHECK-NEXT: add z1.s, z1.s, z1.s -; CHECK-NEXT: add z3.s, z3.s, z3.s -; CHECK-NEXT: add z4.s, z4.s, z4.s -; CHECK-NEXT: mov z0.s, s0 -; CHECK-NEXT: stp q1, q2, [x0, #32] -; CHECK-NEXT: stp q3, q4, [x0] +; CHECK-NEXT: add z5.s, z0.s, z0.s +; CHECK-NEXT: mov z0.s, s2 +; CHECK-NEXT: add z2.s, z3.s, z3.s +; CHECK-NEXT: add z3.s, z4.s, z4.s +; CHECK-NEXT: stp q1, q5, [x0, #32] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: stp q2, q3, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: test2: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll index 48a642c908bfe..42b947604b860 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll @@ -1815,17 +1815,18 @@ define <16 x half> @masked_load_v16f16(ptr %src, <16 x i1> %mask) { ; CHECK-LABEL: masked_load_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: uunpklo z1.h, z0.b +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: mov x8, #8 // =0x8 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: lsl z1.h, z1.h, #15 -; CHECK-NEXT: asr z1.h, z1.h, #15 +; CHECK-NEXT: uunpklo z1.h, z1.b ; CHECK-NEXT: lsl z0.h, z0.h, #15 -; CHECK-NEXT: cmpne p1.h, p0/z, z1.h, #0 +; CHECK-NEXT: lsl z1.h, z1.h, #15 ; CHECK-NEXT: asr z0.h, z0.h, #15 -; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: asr z1.h, z1.h, #15 +; CHECK-NEXT: cmpne p1.h, p0/z, z0.h, #0 +; CHECK-NEXT: cmpne p0.h, p0/z, z1.h, #0 ; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, x8, lsl #1] @@ -2572,15 +2573,16 @@ define <4 x double> @masked_load_v4f64(ptr %src, <4 x i1> %mask) { ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: mov x8, #2 // =0x2 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z1.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: lsl z1.d, z1.d, #63 +; CHECK-NEXT: uunpklo z1.d, z1.s ; CHECK-NEXT: lsl z0.d, z0.d, #63 -; CHECK-NEXT: asr z1.d, z1.d, #63 +; CHECK-NEXT: lsl z1.d, z1.d, #63 ; CHECK-NEXT: asr z0.d, z0.d, #63 -; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: asr z1.d, z1.d, #63 +; CHECK-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 ; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, x8, lsl #3] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll index 265480b571970..9b3da75be47ec 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll @@ -797,20 +797,21 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) { ; CHECK-LABEL: masked_store_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: uunpklo z1.h, z0.b +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: mov x8, #8 // =0x8 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: lsl z1.h, z1.h, #15 -; CHECK-NEXT: asr z1.h, z1.h, #15 +; CHECK-NEXT: uunpklo z1.h, z1.b ; CHECK-NEXT: lsl z0.h, z0.h, #15 +; CHECK-NEXT: lsl z1.h, z1.h, #15 ; CHECK-NEXT: asr z0.h, z0.h, #15 -; CHECK-NEXT: cmpne p1.h, p0/z, z0.h, #0 -; CHECK-NEXT: cmpne p0.h, p0/z, z1.h, #0 -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: st1h { z0.h }, p1, [x0, x8, lsl #1] -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: asr z1.h, z1.h, #15 +; CHECK-NEXT: cmpne p1.h, p0/z, z1.h, #0 +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: mov z1.h, #0 // =0x0 +; CHECK-NEXT: st1h { z1.h }, p1, [x0, x8, lsl #1] +; CHECK-NEXT: st1h { z1.h }, p0, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: masked_store_v16f16: @@ -1165,18 +1166,19 @@ define void @masked_store_v4f64(ptr %dst, <4 x i1> %mask) { ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: mov x8, #2 // =0x2 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z1.d, z0.s -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: lsl z1.d, z1.d, #63 +; CHECK-NEXT: uunpklo z1.d, z1.s ; CHECK-NEXT: lsl z0.d, z0.d, #63 -; CHECK-NEXT: asr z1.d, z1.d, #63 +; CHECK-NEXT: lsl z1.d, z1.d, #63 ; CHECK-NEXT: asr z0.d, z0.d, #63 -; CHECK-NEXT: cmpne p1.d, p0/z, z0.d, #0 -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: st1d { z0.d }, p1, [x0, x8, lsl #3] -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: asr z1.d, z1.d, #63 +; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: mov z1.d, #0 // =0x0 +; CHECK-NEXT: st1d { z1.d }, p1, [x0, x8, lsl #3] +; CHECK-NEXT: st1d { z1.d }, p0, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: masked_store_v4f64: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll index 688537704a6f7..7363c306033a1 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll @@ -37,34 +37,39 @@ define i32 @reduce_uaddv_v16i8(<32 x i8> %a) { ; STREAMING-SVE-LABEL: reduce_uaddv_v16i8: ; STREAMING-SVE: // %bb.0: ; STREAMING-SVE-NEXT: // kill: def $q1 killed $q1 def $z1 -; STREAMING-SVE-NEXT: uunpklo z2.h, z1.b ; STREAMING-SVE-NEXT: // kill: def $q0 killed $q0 def $z0 -; STREAMING-SVE-NEXT: uunpklo z3.h, z0.b +; STREAMING-SVE-NEXT: movprfx z2, z1 +; STREAMING-SVE-NEXT: ext z2.b, z2.b, z1.b, #8 +; STREAMING-SVE-NEXT: movprfx z3, z0 +; STREAMING-SVE-NEXT: ext z3.b, z3.b, z0.b, #8 ; STREAMING-SVE-NEXT: ptrue p0.s, vl4 -; STREAMING-SVE-NEXT: ext z1.b, z1.b, z1.b, #8 -; STREAMING-SVE-NEXT: ext z0.b, z0.b, z0.b, #8 ; STREAMING-SVE-NEXT: uunpklo z1.h, z1.b ; STREAMING-SVE-NEXT: uunpklo z0.h, z0.b -; STREAMING-SVE-NEXT: uunpklo z4.s, z2.h -; STREAMING-SVE-NEXT: ext z2.b, z2.b, z2.b, #8 -; STREAMING-SVE-NEXT: uunpklo z6.s, z3.h -; STREAMING-SVE-NEXT: ext z3.b, z3.b, z3.b, #8 -; STREAMING-SVE-NEXT: mov z5.d, z1.d -; STREAMING-SVE-NEXT: uunpklo z7.s, z0.h -; STREAMING-SVE-NEXT: ext z0.b, z0.b, z0.b, #8 -; STREAMING-SVE-NEXT: uunpklo z2.s, z2.h -; STREAMING-SVE-NEXT: uunpklo z3.s, z3.h -; STREAMING-SVE-NEXT: add z4.s, z6.s, z4.s -; STREAMING-SVE-NEXT: ext z5.b, z5.b, z1.b, #8 +; STREAMING-SVE-NEXT: uunpklo z2.h, z2.b +; STREAMING-SVE-NEXT: uunpklo z3.h, z3.b +; STREAMING-SVE-NEXT: movprfx z4, z1 +; STREAMING-SVE-NEXT: ext z4.b, z4.b, z1.b, #8 +; STREAMING-SVE-NEXT: movprfx z7, z0 +; STREAMING-SVE-NEXT: ext z7.b, z7.b, z0.b, #8 ; STREAMING-SVE-NEXT: uunpklo z1.s, z1.h ; STREAMING-SVE-NEXT: uunpklo z0.s, z0.h -; STREAMING-SVE-NEXT: add z2.s, z3.s, z2.s +; STREAMING-SVE-NEXT: movprfx z5, z2 +; STREAMING-SVE-NEXT: ext z5.b, z5.b, z2.b, #8 +; STREAMING-SVE-NEXT: movprfx z6, z3 +; STREAMING-SVE-NEXT: ext z6.b, z6.b, z3.b, #8 +; STREAMING-SVE-NEXT: uunpklo z2.s, z2.h +; STREAMING-SVE-NEXT: uunpklo z4.s, z4.h +; STREAMING-SVE-NEXT: uunpklo z7.s, z7.h +; STREAMING-SVE-NEXT: uunpklo z3.s, z3.h ; STREAMING-SVE-NEXT: uunpklo z5.s, z5.h -; STREAMING-SVE-NEXT: add z1.s, z7.s, z1.s -; STREAMING-SVE-NEXT: add z0.s, z0.s, z5.s -; STREAMING-SVE-NEXT: add z1.s, z4.s, z1.s -; STREAMING-SVE-NEXT: add z0.s, z2.s, z0.s -; STREAMING-SVE-NEXT: add z0.s, z1.s, z0.s +; STREAMING-SVE-NEXT: uunpklo z6.s, z6.h +; STREAMING-SVE-NEXT: add z0.s, z0.s, z1.s +; STREAMING-SVE-NEXT: add z1.s, z3.s, z2.s +; STREAMING-SVE-NEXT: add z2.s, z7.s, z4.s +; STREAMING-SVE-NEXT: add z3.s, z6.s, z5.s +; STREAMING-SVE-NEXT: add z0.s, z0.s, z1.s +; STREAMING-SVE-NEXT: add z1.s, z2.s, z3.s +; STREAMING-SVE-NEXT: add z0.s, z0.s, z1.s ; STREAMING-SVE-NEXT: uaddv d0, p0, z0.s ; STREAMING-SVE-NEXT: fmov w0, s0 ; STREAMING-SVE-NEXT: ret @@ -104,34 +109,39 @@ define i32 @reduce_saddv_v16i8(<32 x i8> %a) { ; STREAMING-SVE-LABEL: reduce_saddv_v16i8: ; STREAMING-SVE: // %bb.0: ; STREAMING-SVE-NEXT: // kill: def $q1 killed $q1 def $z1 -; STREAMING-SVE-NEXT: sunpklo z2.h, z1.b ; STREAMING-SVE-NEXT: // kill: def $q0 killed $q0 def $z0 -; STREAMING-SVE-NEXT: sunpklo z3.h, z0.b +; STREAMING-SVE-NEXT: movprfx z2, z1 +; STREAMING-SVE-NEXT: ext z2.b, z2.b, z1.b, #8 +; STREAMING-SVE-NEXT: movprfx z3, z0 +; STREAMING-SVE-NEXT: ext z3.b, z3.b, z0.b, #8 ; STREAMING-SVE-NEXT: ptrue p0.s, vl4 -; STREAMING-SVE-NEXT: ext z1.b, z1.b, z1.b, #8 -; STREAMING-SVE-NEXT: ext z0.b, z0.b, z0.b, #8 ; STREAMING-SVE-NEXT: sunpklo z1.h, z1.b ; STREAMING-SVE-NEXT: sunpklo z0.h, z0.b -; STREAMING-SVE-NEXT: sunpklo z4.s, z2.h -; STREAMING-SVE-NEXT: ext z2.b, z2.b, z2.b, #8 -; STREAMING-SVE-NEXT: sunpklo z6.s, z3.h -; STREAMING-SVE-NEXT: ext z3.b, z3.b, z3.b, #8 -; STREAMING-SVE-NEXT: mov z5.d, z1.d -; STREAMING-SVE-NEXT: sunpklo z7.s, z0.h -; STREAMING-SVE-NEXT: ext z0.b, z0.b, z0.b, #8 -; STREAMING-SVE-NEXT: sunpklo z2.s, z2.h -; STREAMING-SVE-NEXT: sunpklo z3.s, z3.h -; STREAMING-SVE-NEXT: add z4.s, z6.s, z4.s -; STREAMING-SVE-NEXT: ext z5.b, z5.b, z1.b, #8 +; STREAMING-SVE-NEXT: sunpklo z2.h, z2.b +; STREAMING-SVE-NEXT: sunpklo z3.h, z3.b +; STREAMING-SVE-NEXT: movprfx z4, z1 +; STREAMING-SVE-NEXT: ext z4.b, z4.b, z1.b, #8 +; STREAMING-SVE-NEXT: movprfx z7, z0 +; STREAMING-SVE-NEXT: ext z7.b, z7.b, z0.b, #8 ; STREAMING-SVE-NEXT: sunpklo z1.s, z1.h ; STREAMING-SVE-NEXT: sunpklo z0.s, z0.h -; STREAMING-SVE-NEXT: add z2.s, z3.s, z2.s +; STREAMING-SVE-NEXT: movprfx z5, z2 +; STREAMING-SVE-NEXT: ext z5.b, z5.b, z2.b, #8 +; STREAMING-SVE-NEXT: movprfx z6, z3 +; STREAMING-SVE-NEXT: ext z6.b, z6.b, z3.b, #8 +; STREAMING-SVE-NEXT: sunpklo z2.s, z2.h +; STREAMING-SVE-NEXT: sunpklo z4.s, z4.h +; STREAMING-SVE-NEXT: sunpklo z7.s, z7.h +; STREAMING-SVE-NEXT: sunpklo z3.s, z3.h ; STREAMING-SVE-NEXT: sunpklo z5.s, z5.h -; STREAMING-SVE-NEXT: add z1.s, z7.s, z1.s -; STREAMING-SVE-NEXT: add z0.s, z0.s, z5.s -; STREAMING-SVE-NEXT: add z1.s, z4.s, z1.s -; STREAMING-SVE-NEXT: add z0.s, z2.s, z0.s -; STREAMING-SVE-NEXT: add z0.s, z1.s, z0.s +; STREAMING-SVE-NEXT: sunpklo z6.s, z6.h +; STREAMING-SVE-NEXT: add z0.s, z0.s, z1.s +; STREAMING-SVE-NEXT: add z1.s, z3.s, z2.s +; STREAMING-SVE-NEXT: add z2.s, z7.s, z4.s +; STREAMING-SVE-NEXT: add z3.s, z6.s, z5.s +; STREAMING-SVE-NEXT: add z0.s, z0.s, z1.s +; STREAMING-SVE-NEXT: add z1.s, z2.s, z3.s +; STREAMING-SVE-NEXT: add z0.s, z0.s, z1.s ; STREAMING-SVE-NEXT: uaddv d0, p0, z0.s ; STREAMING-SVE-NEXT: fmov w0, s0 ; STREAMING-SVE-NEXT: ret