diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 6194de2d56b63..7765fa6ed63e9 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -7222,8 +7222,23 @@ def : Pat<(v2i64 (int_aarch64_neon_vcopy_lane V128:$Vd, VectorIndexD:$idx, V128:$Vs, VectorIndexD:$idx2) )>; -multiclass Neon_INS_elt_pattern { +// Move elements between vectors +multiclass Neon_INS_elt_pattern { + // Extracting from the lowest 128-bits of an SVE vector + def : Pat<(VT128 (vector_insert VT128:$Rn, + (VTScal (vector_extract VTSVE:$Rm, (i64 SVEIdxTy:$Immn))), + (i64 imm:$Immd))), + (INS VT128:$Rn, imm:$Immd, (VT128 (EXTRACT_SUBREG VTSVE:$Rm, zsub)), SVEIdxTy:$Immn)>; + + def : Pat<(VT64 (vector_insert VT64:$Rn, + (VTScal (vector_extract VTSVE:$Rm, (i64 SVEIdxTy:$Immn))), + (i64 imm:$Immd))), + (EXTRACT_SUBREG + (INS (SUBREG_TO_REG (i64 0), VT64:$Rn, dsub), imm:$Immd, + (VT128 (EXTRACT_SUBREG VTSVE:$Rm, zsub)), SVEIdxTy:$Immn), + dsub)>; + // Extracting from another NEON vector def : Pat<(VT128 (vector_insert V128:$src, (VTScal (vector_extract (VT128 V128:$Rn), (i64 imm:$Immn))), (i64 imm:$Immd))), @@ -7251,15 +7266,15 @@ multiclass Neon_INS_elt_pattern; } -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; // Insert from bitcast // vector_insert(bitcast(f32 src), n, lane) -> INSvi32lane(src, lane, INSERT_SUBREG(-, n), 0) diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 2b69903b133fe..e68544361ff2e 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -3365,6 +3365,21 @@ let Predicates = [HasSVEorSME] in { (UMOVvi32 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index)>; def : Pat<(i64 (vector_extract nxv2i64:$vec, VectorIndexD:$index)), (UMOVvi64 (v2i64 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexD:$index)>; + + // Move element from the bottom 128-bits of a scalable vector to a single-element vector. + // Alternative case where insertelement is just scalar_to_vector rather than vector_insert. + def : Pat<(v1f64 (scalar_to_vector + (f64 (vector_extract nxv2f64:$vec, VectorIndexD:$index)))), + (EXTRACT_SUBREG + (INSvi64lane (IMPLICIT_DEF), (i64 0), + (EXTRACT_SUBREG nxv2f64:$vec, zsub), VectorIndexD:$index), + dsub)>; + def : Pat<(v1i64 (scalar_to_vector + (i64 (vector_extract nxv2i64:$vec, VectorIndexD:$index)))), + (EXTRACT_SUBREG + (INSvi64lane (IMPLICIT_DEF), (i64 0), + (EXTRACT_SUBREG nxv2i64:$vec, zsub), VectorIndexD:$index), + dsub)>; } // End HasNEON let Predicates = [HasNEON] in { diff --git a/llvm/test/CodeGen/AArch64/neon-insert-sve-elt.ll b/llvm/test/CodeGen/AArch64/neon-insert-sve-elt.ll new file mode 100644 index 0000000000000..0f4eec4fdfda1 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/neon-insert-sve-elt.ll @@ -0,0 +1,469 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+neon < %s | FileCheck %s + +; Inserting an element from the bottom 128-bits of an SVE type into a NEON vector should use INS (element) to +; avoid pointless FMOV trips. + +; --------- extraction from nxv16i8 + +define <8 x i8> @test_lane0_nxv16i8(<8 x i8> %a, %b) { +; CHECK-LABEL: test_lane0_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.b[0], v1.b[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <8 x i8> %a, i8 %c, i32 0 + ret <8 x i8> %d +} + +define <8 x i8> @test_lane15_nxv16i8(<8 x i8> %a, %b) { +; CHECK-LABEL: test_lane15_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.b[7], v1.b[15] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 15 + %d = insertelement <8 x i8> %a, i8 %c, i32 7 + ret <8 x i8> %d +} + +define <16 x i8> @test_q_lane0_nxv16i8(<16 x i8> %a, %b) { +; CHECK-LABEL: test_q_lane0_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.b[0], v1.b[0] +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <16 x i8> %a, i8 %c, i32 0 + ret <16 x i8> %d +} + +define <16 x i8> @test_q_lane15_nxv16i8(<16 x i8> %a, %b) { +; CHECK-LABEL: test_q_lane15_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.b[15], v1.b[15] +; CHECK-NEXT: ret + %c = extractelement %b, i32 15 + %d = insertelement <16 x i8> %a, i8 %c, i32 15 + ret <16 x i8> %d +} + +; (negative test) Extracted element is not within Vn +define <16 x i8> @test_q_lane16_nxv16i8(<16 x i8> %a, %b) { +; CHECK-LABEL: test_q_lane16_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.b, z1.b[16] +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov v0.b[15], w8 +; CHECK-NEXT: ret + %c = extractelement %b, i32 16 + %d = insertelement <16 x i8> %a, i8 %c, i32 15 + ret <16 x i8> %d +} + +; --------- extraction from nxv8f16 + +define <4 x half> @test_lane0_nxv8f16(<4 x half> %a, %b) { +; CHECK-LABEL: test_lane0_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.h[0], v1.h[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <4 x half> %a, half %c, i32 0 + ret <4 x half> %d +} + +define <4 x half> @test_lane7_nxv8f16(<4 x half> %a, %b) { +; CHECK-LABEL: test_lane7_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.h[3], v1.h[7] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 7 + %d = insertelement <4 x half> %a, half %c, i32 3 + ret <4 x half> %d +} + +define <8 x half> @test_q_lane0_nxv8f16(<8 x half> %a, %b) { +; CHECK-LABEL: test_q_lane0_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.h[0], v1.h[0] +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <8 x half> %a, half %c, i32 0 + ret <8 x half> %d +} + +define <8 x half> @test_q_lane7_nxv8f16(<8 x half> %a, %b) { +; CHECK-LABEL: test_q_lane7_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.h[7], v1.h[7] +; CHECK-NEXT: ret + %c = extractelement %b, i32 7 + %d = insertelement <8 x half> %a, half %c, i32 7 + ret <8 x half> %d +} + +; (negative test) Extracted element is not within Vn +define <8 x half> @test_q_lane8_nxv8f16(<8 x half> %a, %b) { +; CHECK-LABEL: test_q_lane8_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.h, z1.h[8] +; CHECK-NEXT: mov v0.h[7], v1.h[0] +; CHECK-NEXT: ret + %c = extractelement %b, i32 8 + %d = insertelement <8 x half> %a, half %c, i32 7 + ret <8 x half> %d +} + +; --------- extraction from nxv8bf16 + +define <4 x bfloat> @test_lane0_nxv8bf16(<4 x bfloat> %a, %b) { +; CHECK-LABEL: test_lane0_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.h[0], v1.h[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <4 x bfloat> %a, bfloat %c, i32 0 + ret <4 x bfloat> %d +} + +define <4 x bfloat> @test_lane7_nxv8bf16(<4 x bfloat> %a, %b) { +; CHECK-LABEL: test_lane7_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.h[3], v1.h[7] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 7 + %d = insertelement <4 x bfloat> %a, bfloat %c, i32 3 + ret <4 x bfloat> %d +} + +define <8 x bfloat> @test_q_lane0_nxv8bf16(<8 x bfloat> %a, %b) { +; CHECK-LABEL: test_q_lane0_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.h[0], v1.h[0] +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <8 x bfloat> %a, bfloat %c, i32 0 + ret <8 x bfloat> %d +} + +define <8 x bfloat> @test_q_lane7_nxv8bf16(<8 x bfloat> %a, %b) { +; CHECK-LABEL: test_q_lane7_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.h[7], v1.h[7] +; CHECK-NEXT: ret + %c = extractelement %b, i32 7 + %d = insertelement <8 x bfloat> %a, bfloat %c, i32 7 + ret <8 x bfloat> %d +} + +; (negative test) Extracted element is not within Vn +define <8 x bfloat> @test_q_lane8_nxv8bf16(<8 x bfloat> %a, %b) { +; CHECK-LABEL: test_q_lane8_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.h, z1.h[8] +; CHECK-NEXT: mov v0.h[7], v1.h[0] +; CHECK-NEXT: ret + %c = extractelement %b, i32 8 + %d = insertelement <8 x bfloat> %a, bfloat %c, i32 7 + ret <8 x bfloat> %d +} + +; --------- extraction from nxv8i16 + +define <4 x i16> @test_lane0_nxv8i16(<4 x i16> %a, %b) { +; CHECK-LABEL: test_lane0_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.h[0], v1.h[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <4 x i16> %a, i16 %c, i32 0 + ret <4 x i16> %d +} + +define <4 x i16> @test_lane7_nxv8i16(<4 x i16> %a, %b) { +; CHECK-LABEL: test_lane7_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.h[3], v1.h[7] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 7 + %d = insertelement <4 x i16> %a, i16 %c, i32 3 + ret <4 x i16> %d +} + +define <8 x i16> @test_q_lane0_nxv8i16(<8 x i16> %a, %b) { +; CHECK-LABEL: test_q_lane0_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.h[0], v1.h[0] +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <8 x i16> %a, i16 %c, i32 0 + ret <8 x i16> %d +} + +define <8 x i16> @test_q_lane7_nxv8i16(<8 x i16> %a, %b) { +; CHECK-LABEL: test_q_lane7_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.h[7], v1.h[7] +; CHECK-NEXT: ret + %c = extractelement %b, i32 7 + %d = insertelement <8 x i16> %a, i16 %c, i32 7 + ret <8 x i16> %d +} + +; (negative test) Extracted element is not within Vn +define <8 x i16> @test_q_lane8_nxv8i16(<8 x i16> %a, %b) { +; CHECK-LABEL: test_q_lane8_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.h, z1.h[8] +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov v0.h[7], w8 +; CHECK-NEXT: ret + %c = extractelement %b, i32 8 + %d = insertelement <8 x i16> %a, i16 %c, i32 7 + ret <8 x i16> %d +} + +; --------- extraction from nxv4f32 + +define <2 x float> @test_lane0_nxv4f32(<2 x float> %a, %b) { +; CHECK-LABEL: test_lane0_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.s[0], v1.s[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <2 x float> %a, float %c, i32 0 + ret <2 x float> %d +} + +define <2 x float> @test_lane3_nxv4f32(<2 x float> %a, %b) { +; CHECK-LABEL: test_lane3_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.s[1], v1.s[3] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 3 + %d = insertelement <2 x float> %a, float %c, i32 1 + ret <2 x float> %d +} + +define <4 x float> @test_q_lane0_nxv4f32(<4 x float> %a, %b) { +; CHECK-LABEL: test_q_lane0_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.s[0], v1.s[0] +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <4 x float> %a, float %c, i32 0 + ret <4 x float> %d +} + +define <4 x float> @test_q_lane3_nxv4f32(<4 x float> %a, %b) { +; CHECK-LABEL: test_q_lane3_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.s[3], v1.s[3] +; CHECK-NEXT: ret + %c = extractelement %b, i32 3 + %d = insertelement <4 x float> %a, float %c, i32 3 + ret <4 x float> %d +} + +; (negative test) Extracted element is not within Vn +define <4 x float> @test_q_lane4_nxv4f32(<4 x float> %a, %b) { +; CHECK-LABEL: test_q_lane4_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.s, z1.s[4] +; CHECK-NEXT: mov v0.s[3], v1.s[0] +; CHECK-NEXT: ret + %c = extractelement %b, i32 4 + %d = insertelement <4 x float> %a, float %c, i32 3 + ret <4 x float> %d +} + +; --------- extraction from nxv4i32 + +define <2 x i32> @test_lane0_nxv4i32(<2 x i32> %a, %b) { +; CHECK-LABEL: test_lane0_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.s[0], v1.s[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <2 x i32> %a, i32 %c, i32 0 + ret <2 x i32> %d +} + +define <2 x i32> @test_lane3_nxv4i32(<2 x i32> %a, %b) { +; CHECK-LABEL: test_lane3_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.s[1], v1.s[3] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 3 + %d = insertelement <2 x i32> %a, i32 %c, i32 1 + ret <2 x i32> %d +} + +define <4 x i32> @test_q_lane0_nxv4i32(<4 x i32> %a, %b) { +; CHECK-LABEL: test_q_lane0_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.s[0], v1.s[0] +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <4 x i32> %a, i32 %c, i32 0 + ret <4 x i32> %d +} + +define <4 x i32> @test_q_lane3_nxv4i32(<4 x i32> %a, %b) { +; CHECK-LABEL: test_q_lane3_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.s[3], v1.s[3] +; CHECK-NEXT: ret + %c = extractelement %b, i32 3 + %d = insertelement <4 x i32> %a, i32 %c, i32 3 + ret <4 x i32> %d +} + +; (negative test) Extracted element is not within Vn +define <4 x i32> @test_q_lane4_nxv4i32(<4 x i32> %a, %b) { +; CHECK-LABEL: test_q_lane4_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.s, z1.s[4] +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov v0.s[3], w8 +; CHECK-NEXT: ret + %c = extractelement %b, i32 4 + %d = insertelement <4 x i32> %a, i32 %c, i32 3 + ret <4 x i32> %d +} + +; --------- extraction from nxv2f64 + +define <1 x double> @test_lane0_nxv2f64(<1 x double> %a, %b) { +; CHECK-LABEL: test_lane0_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.d[0], v1.d[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <1 x double> %a, double %c, i32 0 + ret <1 x double> %d +} + +define <1 x double> @test_lane1_nxv2f64(<1 x double> %a, %b) { +; CHECK-LABEL: test_lane1_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.d[0], v1.d[1] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 1 + %d = insertelement <1 x double> %a, double %c, i32 0 + ret <1 x double> %d +} + +define <2 x double> @test_q_lane0_nxv2f64(<2 x double> %a, %b) { +; CHECK-LABEL: test_q_lane0_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.d[0], v1.d[0] +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <2 x double> %a, double %c, i32 0 + ret <2 x double> %d +} + +define <2 x double> @test_q_lane1_nxv2f64(<2 x double> %a, %b) { +; CHECK-LABEL: test_q_lane1_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.d[1], v1.d[1] +; CHECK-NEXT: ret + %c = extractelement %b, i32 1 + %d = insertelement <2 x double> %a, double %c, i32 1 + ret <2 x double> %d +} + +; (negative test) Extracted element is not within Vn +define <2 x double> @test_q_lane2_nxv2f64(<2 x double> %a, %b) { +; CHECK-LABEL: test_q_lane2_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, z1.d[2] +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret + %c = extractelement %b, i32 2 + %d = insertelement <2 x double> %a, double %c, i32 1 + ret <2 x double> %d +} + +; --------- extraction from nxv2i64 + +define <1 x i64> @test_lane0_nxv2i64(<1 x i64> %a, %b) { +; CHECK-LABEL: test_lane0_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.d[0], v1.d[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <1 x i64> %a, i64 %c, i32 0 + ret <1 x i64> %d +} + +define <1 x i64> @test_lane1_nxv2i64(<1 x i64> %a, %b) { +; CHECK-LABEL: test_lane1_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.d[0], v1.d[1] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 1 + %d = insertelement <1 x i64> %a, i64 %c, i32 0 + ret <1 x i64> %d +} + +define <2 x i64> @test_q_lane0_nxv2i64(<2 x i64> %a, %b) { +; CHECK-LABEL: test_q_lane0_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.d[0], v1.d[0] +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <2 x i64> %a, i64 %c, i32 0 + ret <2 x i64> %d +} + +define <2 x i64> @test_q_lane1_nxv2i64(<2 x i64> %a, %b) { +; CHECK-LABEL: test_q_lane1_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.d[1], v1.d[1] +; CHECK-NEXT: ret + %c = extractelement %b, i32 1 + %d = insertelement <2 x i64> %a, i64 %c, i32 1 + ret <2 x i64> %d +} + +; (negative test) Extracted element is not within Vn +define <2 x i64> @test_q_lane2_nxv2i64(<2 x i64> %a, %b) { +; CHECK-LABEL: test_q_lane2_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, z1.d[2] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: ret + %c = extractelement %b, i32 2 + %d = insertelement <2 x i64> %a, i64 %c, i32 1 + ret <2 x i64> %d +} diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll index 678afc4dea309..518e3573b5edd 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll @@ -290,41 +290,28 @@ define <8 x i1> @extract_v8i1_nxv8i1( %inmask) { ret <8 x i1> %mask } +; TODO: Apply better reasoning when lowering extract_subvector from the bottom 128-bits +; of an SVE type. define <16 x i1> @extract_v16i1_nxv16i1( %inmask) { ; CHECK-LABEL: extract_v16i1_nxv16i1: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1 -; CHECK-NEXT: umov w8, v1.b[1] ; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: umov w9, v1.b[2] -; CHECK-NEXT: mov v0.b[1], w8 -; CHECK-NEXT: umov w8, v1.b[3] -; CHECK-NEXT: mov v0.b[2], w9 -; CHECK-NEXT: umov w9, v1.b[4] -; CHECK-NEXT: mov v0.b[3], w8 -; CHECK-NEXT: umov w8, v1.b[5] -; CHECK-NEXT: mov v0.b[4], w9 -; CHECK-NEXT: umov w9, v1.b[6] -; CHECK-NEXT: mov v0.b[5], w8 -; CHECK-NEXT: umov w8, v1.b[7] -; CHECK-NEXT: mov v0.b[6], w9 -; CHECK-NEXT: umov w9, v1.b[8] -; CHECK-NEXT: mov v0.b[7], w8 -; CHECK-NEXT: umov w8, v1.b[9] -; CHECK-NEXT: mov v0.b[8], w9 -; CHECK-NEXT: umov w9, v1.b[10] -; CHECK-NEXT: mov v0.b[9], w8 -; CHECK-NEXT: umov w8, v1.b[11] -; CHECK-NEXT: mov v0.b[10], w9 -; CHECK-NEXT: umov w9, v1.b[12] -; CHECK-NEXT: mov v0.b[11], w8 -; CHECK-NEXT: umov w8, v1.b[13] -; CHECK-NEXT: mov v0.b[12], w9 -; CHECK-NEXT: umov w9, v1.b[14] -; CHECK-NEXT: mov v0.b[13], w8 -; CHECK-NEXT: umov w8, v1.b[15] -; CHECK-NEXT: mov v0.b[14], w9 -; CHECK-NEXT: mov v0.b[15], w8 +; CHECK-NEXT: mov v0.b[1], v1.b[1] +; CHECK-NEXT: mov v0.b[2], v1.b[2] +; CHECK-NEXT: mov v0.b[3], v1.b[3] +; CHECK-NEXT: mov v0.b[4], v1.b[4] +; CHECK-NEXT: mov v0.b[5], v1.b[5] +; CHECK-NEXT: mov v0.b[6], v1.b[6] +; CHECK-NEXT: mov v0.b[7], v1.b[7] +; CHECK-NEXT: mov v0.b[8], v1.b[8] +; CHECK-NEXT: mov v0.b[9], v1.b[9] +; CHECK-NEXT: mov v0.b[10], v1.b[10] +; CHECK-NEXT: mov v0.b[11], v1.b[11] +; CHECK-NEXT: mov v0.b[12], v1.b[12] +; CHECK-NEXT: mov v0.b[13], v1.b[13] +; CHECK-NEXT: mov v0.b[14], v1.b[14] +; CHECK-NEXT: mov v0.b[15], v1.b[15] ; CHECK-NEXT: ret %mask = call <16 x i1> @llvm.vector.extract.v16i1.nxv16i1( %inmask, i64 0) ret <16 x i1> %mask diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll index fb169491b0c90..749a1866e7192 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll @@ -30,78 +30,64 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang ; CHECK-NEXT: // %bb.1: // %vector.body ; CHECK-NEXT: mov z0.b, #0 // =0x0 ; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov x9, #8 // =0x8 +; CHECK-NEXT: mov x10, #24 // =0x18 ; CHECK-NEXT: umov w8, v0.b[8] -; CHECK-NEXT: umov w9, v0.b[9] -; CHECK-NEXT: umov w10, v0.b[1] ; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: umov w11, v0.b[15] +; CHECK-NEXT: mov v1.b[1], v0.b[1] ; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: umov w8, v0.b[10] -; CHECK-NEXT: mov v1.b[1], w10 -; CHECK-NEXT: umov w10, v0.b[11] -; CHECK-NEXT: mov v2.b[1], w9 -; CHECK-NEXT: umov w9, v0.b[2] -; CHECK-NEXT: mov v2.b[2], w8 -; CHECK-NEXT: umov w8, v0.b[3] -; CHECK-NEXT: mov v1.b[2], w9 -; CHECK-NEXT: umov w9, v0.b[12] -; CHECK-NEXT: mov v2.b[3], w10 -; CHECK-NEXT: umov w10, v0.b[4] -; CHECK-NEXT: mov v1.b[3], w8 -; CHECK-NEXT: umov w8, v0.b[13] -; CHECK-NEXT: mov v2.b[4], w9 -; CHECK-NEXT: umov w9, v0.b[5] -; CHECK-NEXT: mov v1.b[4], w10 -; CHECK-NEXT: umov w10, v0.b[14] -; CHECK-NEXT: mov v2.b[5], w8 -; CHECK-NEXT: umov w8, v0.b[6] -; CHECK-NEXT: mov v1.b[5], w9 -; CHECK-NEXT: umov w9, v0.b[7] +; CHECK-NEXT: mov x8, #16 // =0x10 +; CHECK-NEXT: mov v2.b[1], v0.b[9] +; CHECK-NEXT: mov v1.b[2], v0.b[2] +; CHECK-NEXT: mov v2.b[2], v0.b[10] +; CHECK-NEXT: mov v1.b[3], v0.b[3] +; CHECK-NEXT: mov v2.b[3], v0.b[11] +; CHECK-NEXT: mov v1.b[4], v0.b[4] +; CHECK-NEXT: mov v2.b[4], v0.b[12] +; CHECK-NEXT: mov v1.b[5], v0.b[5] +; CHECK-NEXT: mov v2.b[5], v0.b[13] +; CHECK-NEXT: mov v1.b[6], v0.b[6] +; CHECK-NEXT: mov v2.b[6], v0.b[14] +; CHECK-NEXT: mov v1.b[7], v0.b[7] +; CHECK-NEXT: mov v2.b[7], v0.b[15] ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #16 -; CHECK-NEXT: mov v2.b[6], w10 -; CHECK-NEXT: mov v1.b[6], w8 +; CHECK-NEXT: uunpklo z1.h, z1.b ; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: mov x8, #16 // =0x10 -; CHECK-NEXT: mov x10, #8 // =0x8 -; CHECK-NEXT: mov v2.b[7], w11 -; CHECK-NEXT: mov v1.b[7], w9 +; CHECK-NEXT: uunpklo z2.h, z2.b +; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z3.h, z3.b ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: mov x9, #24 // =0x18 -; CHECK-NEXT: uunpklo z2.h, z2.b -; CHECK-NEXT: uunpklo z1.h, z1.b -; CHECK-NEXT: uunpklo z3.s, z3.h -; CHECK-NEXT: lsl z0.s, z0.s, #31 ; CHECK-NEXT: uunpklo z2.s, z2.h -; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: lsl z3.s, z3.s, #31 -; CHECK-NEXT: asr z0.s, z0.s, #31 -; CHECK-NEXT: asr z3.s, z3.s, #31 -; CHECK-NEXT: lsl z2.s, z2.s, #31 ; CHECK-NEXT: lsl z1.s, z1.s, #31 -; CHECK-NEXT: and z0.s, z0.s, #0x1 -; CHECK-NEXT: and z3.s, z3.s, #0x1 -; CHECK-NEXT: asr z2.s, z2.s, #31 +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: lsl z0.s, z0.s, #31 ; CHECK-NEXT: asr z1.s, z1.s, #31 -; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; CHECK-NEXT: cmpne p2.s, p0/z, z3.s, #0 -; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0, x9, lsl #2] -; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: lsl z2.s, z2.s, #31 +; CHECK-NEXT: asr z0.s, z0.s, #31 ; CHECK-NEXT: and z1.s, z1.s, #0x1 -; CHECK-NEXT: mov z0.s, p1/m, #0 // =0x0 -; CHECK-NEXT: cmpne p3.s, p0/z, z2.s, #0 +; CHECK-NEXT: lsl z3.s, z3.s, #31 +; CHECK-NEXT: asr z2.s, z2.s, #31 +; CHECK-NEXT: and z0.s, z0.s, #0x1 ; CHECK-NEXT: cmpne p4.s, p0/z, z1.s, #0 -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] -; CHECK-NEXT: mov z3.s, p2/m, #0 // =0x0 -; CHECK-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; CHECK-NEXT: mov z2.s, p3/m, #0 // =0x0 +; CHECK-NEXT: asr z3.s, z3.s, #31 +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: and z3.s, z3.s, #0x1 +; CHECK-NEXT: cmpne p2.s, p0/z, z2.s, #0 +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, x9, lsl #2] ; CHECK-NEXT: mov z1.s, p4/m, #0 // =0x0 -; CHECK-NEXT: st1w { z3.s }, p0, [x0, x9, lsl #2] -; CHECK-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2] +; CHECK-NEXT: cmpne p3.s, p0/z, z3.s, #0 +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2] +; CHECK-NEXT: mov z0.s, p1/m, #0 // =0x0 +; CHECK-NEXT: mov z2.s, p2/m, #0 // =0x0 ; CHECK-NEXT: st1w { z1.s }, p0, [x0] +; CHECK-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; CHECK-NEXT: mov z3.s, p3/m, #0 // =0x0 +; CHECK-NEXT: st1w { z2.s }, p0, [x0, x9, lsl #2] +; CHECK-NEXT: st1w { z3.s }, p0, [x0, x10, lsl #2] ; CHECK-NEXT: .LBB1_2: // %exit ; CHECK-NEXT: ret %broadcast.splat = shufflevector <32 x i1> zeroinitializer, <32 x i1> zeroinitializer, <32 x i32> zeroinitializer